//! Host pipeline that consumes PM4 `DRAW_INDX*` captures. //! //! Drives [`xenia_gpu::shaders::XENOS_INTERP_WGSL`]. This file owns the //! wgpu bind-group + pipeline + buffer surface the Xenos WGSL interpreter //! binds to. The WGSL module is expected to declare: //! //! ```text //! @group(0) @binding(0) var xenos_draw : XenosDrawConstants; // 16 B //! @group(0) @binding(1) var xenos_consts : XenosConstants; // ~9.2 KB //! @group(0) @binding(2) var vs_ucode : array; //! @group(0) @binding(3) var ps_ucode : array; //! @group(0) @binding(4) var vertex_buffer : array; //! ``` //! //! Texture bindings (M6) are a single-slot stub for P3b: //! //! ```text //! @group(1) @binding(0) var xenos_tex : texture_2d; //! @group(1) @binding(1) var xenos_samp : sampler; //! ``` //! //! The bound texture is a 1×1 magenta placeholder. Real per-slot guest //! texture uploads + format decode land with the texture cache (P5). use bytemuck::{Pod, Zeroable}; use wgpu::util::DeviceExt; use xenia_gpu::shaders::XENOS_INTERP_WGSL; use xenia_gpu::xenos_constants::XenosConstantsBlock; /// Per-draw constants mirroring the WGSL `XenosDrawConstants` uniform /// block. Ordering / padding matches `xenos_interp.wgsl` exactly. #[repr(C)] #[derive(Clone, Copy, Pod, Zeroable)] struct DrawConstants { draw_index: u32, vertex_count: u32, prim_kind: u32, _pad: u32, } /// Submitted to [`XenosPipeline::render_one`] to render one captured draw. #[derive(Clone, Copy, Debug)] pub struct DrawRequest { /// Monotonic draw counter; shader uses it for per-draw colour rotation. pub draw_index: u32, /// Host-normalised vertex count (after primitive-processor rewrite). pub vertex_count: u32, /// Xenos primitive-type code; shader may branch on it in P3b+. pub prim_kind: u32, } /// Reasonable upper bound on a single shader blob (dwords). Most Xbox 360 /// shaders are ≪ 4 KB; 64 KB is orders-of-magnitude slack. const UCODE_BUFFER_MAX_DWORDS: u64 = 16 * 1024; // 64 KB each for VS & PS /// 16 MB of vertex data — enough for any realistic Xenos draw. const VERTEX_BUFFER_MAX_BYTES: u64 = 16 * 1024 * 1024; pub struct XenosPipeline { pipeline: wgpu::RenderPipeline, draw_ctx_buffer: wgpu::Buffer, constants_buffer: wgpu::Buffer, vs_ucode_buffer: wgpu::Buffer, ps_ucode_buffer: wgpu::Buffer, vertex_buffer: wgpu::Buffer, bind_group: wgpu::BindGroup, /// P5: swapped per-draw when a new cached texture becomes active. tex_bind_group: wgpu::BindGroup, /// Layout + sampler retained so `set_texture_view` can rebuild /// `tex_bind_group` on the fly without re-reading the pipeline. tex_bgl: wgpu::BindGroupLayout, sampler: wgpu::Sampler, /// Fallback 1×1 magenta texture — used when no guest texture has been /// uploaded yet or when a draw references an unsupported format. dummy_view: wgpu::TextureView, /// P7 — retained pipeline layout + compiled-pipeline cache for /// Xenos→WGSL translator output. Keyed on `(vs_blob_key, ps_blob_key)` /// so every (vs, ps) pair gets compiled once and re-used for every /// subsequent draw. Interpreter pipeline remains the fallback. pipeline_layout: wgpu::PipelineLayout, translated_cache: std::collections::HashMap<(u32, u32), wgpu::RenderPipeline>, pub target_format: wgpu::TextureFormat, } impl XenosPipeline { pub fn new( device: &wgpu::Device, queue: &wgpu::Queue, target_format: wgpu::TextureFormat, ) -> Self { let shader = device.create_shader_module(wgpu::ShaderModuleDescriptor { label: Some("xenos_interp.wgsl"), source: wgpu::ShaderSource::Wgsl(XENOS_INTERP_WGSL.into()), }); let bgl = device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor { label: Some("xenos bind group layout"), entries: &[ // b0: draw_ctx (16 B uniform) wgpu::BindGroupLayoutEntry { binding: 0, visibility: wgpu::ShaderStages::VERTEX | wgpu::ShaderStages::FRAGMENT, ty: wgpu::BindingType::Buffer { ty: wgpu::BufferBindingType::Uniform, has_dynamic_offset: false, min_binding_size: std::num::NonZeroU64::new( std::mem::size_of::() as u64, ), }, count: None, }, // b1: XenosConstants read-only storage (~9.2 KB). Not uniform // because the block contains packed `array` fields and // WGSL's uniform address space would require 16-byte stride. wgpu::BindGroupLayoutEntry { binding: 1, visibility: wgpu::ShaderStages::VERTEX | wgpu::ShaderStages::FRAGMENT, ty: wgpu::BindingType::Buffer { ty: wgpu::BufferBindingType::Storage { read_only: true }, has_dynamic_offset: false, min_binding_size: std::num::NonZeroU64::new( XenosConstantsBlock::SIZE as u64, ), }, count: None, }, // b2: vs_ucode (read-only storage) wgpu::BindGroupLayoutEntry { binding: 2, visibility: wgpu::ShaderStages::VERTEX | wgpu::ShaderStages::FRAGMENT, ty: wgpu::BindingType::Buffer { ty: wgpu::BufferBindingType::Storage { read_only: true }, has_dynamic_offset: false, min_binding_size: None, }, count: None, }, // b3: ps_ucode (read-only storage) wgpu::BindGroupLayoutEntry { binding: 3, visibility: wgpu::ShaderStages::VERTEX | wgpu::ShaderStages::FRAGMENT, ty: wgpu::BindingType::Buffer { ty: wgpu::BufferBindingType::Storage { read_only: true }, has_dynamic_offset: false, min_binding_size: None, }, count: None, }, // b4: vertex_buffer (read-only storage) wgpu::BindGroupLayoutEntry { binding: 4, visibility: wgpu::ShaderStages::VERTEX, ty: wgpu::BindingType::Buffer { ty: wgpu::BufferBindingType::Storage { read_only: true }, has_dynamic_offset: false, min_binding_size: None, }, count: None, }, ], }); let tex_bgl = device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor { label: Some("xenos tex bind group layout"), entries: &[ wgpu::BindGroupLayoutEntry { binding: 0, visibility: wgpu::ShaderStages::VERTEX | wgpu::ShaderStages::FRAGMENT, ty: wgpu::BindingType::Texture { sample_type: wgpu::TextureSampleType::Float { filterable: true }, view_dimension: wgpu::TextureViewDimension::D2, multisampled: false, }, count: None, }, wgpu::BindGroupLayoutEntry { binding: 1, visibility: wgpu::ShaderStages::VERTEX | wgpu::ShaderStages::FRAGMENT, ty: wgpu::BindingType::Sampler(wgpu::SamplerBindingType::Filtering), count: None, }, ], }); let layout = device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor { label: Some("xenos pipeline layout"), bind_group_layouts: &[&bgl, &tex_bgl], push_constant_ranges: &[], }); // Buffer allocation. `queue.write_buffer` uses COPY_DST; all // interpreter-facing buffers need it. let initial_draw = DrawConstants { draw_index: 0, vertex_count: 3, prim_kind: 4, _pad: 0, }; let draw_ctx_buffer = device.create_buffer_init(&wgpu::util::BufferInitDescriptor { label: Some("xenos draw ctx"), contents: bytemuck::bytes_of(&initial_draw), usage: wgpu::BufferUsages::UNIFORM | wgpu::BufferUsages::COPY_DST, }); let constants_buffer = device.create_buffer(&wgpu::BufferDescriptor { label: Some("xenos constants"), size: XenosConstantsBlock::SIZE as u64, usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_DST, mapped_at_creation: false, }); let vs_ucode_buffer = device.create_buffer(&wgpu::BufferDescriptor { label: Some("xenos vs ucode"), size: UCODE_BUFFER_MAX_DWORDS * 4, usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_DST, mapped_at_creation: false, }); let ps_ucode_buffer = device.create_buffer(&wgpu::BufferDescriptor { label: Some("xenos ps ucode"), size: UCODE_BUFFER_MAX_DWORDS * 4, usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_DST, mapped_at_creation: false, }); let vertex_buffer = device.create_buffer(&wgpu::BufferDescriptor { label: Some("xenos vertex buffer"), size: VERTEX_BUFFER_MAX_BYTES, usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_DST, mapped_at_creation: false, }); // Dummy 1×1 magenta texture — placeholder until P5's texture cache // lands. Every `interpret_texture_fetch` samples this for now so the // interpreter can exercise textureSample paths without a real cache. let dummy_tex = device.create_texture(&wgpu::TextureDescriptor { label: Some("xenos dummy texture"), size: wgpu::Extent3d { width: 1, height: 1, depth_or_array_layers: 1, }, mip_level_count: 1, sample_count: 1, dimension: wgpu::TextureDimension::D2, format: wgpu::TextureFormat::Rgba8Unorm, usage: wgpu::TextureUsages::TEXTURE_BINDING | wgpu::TextureUsages::COPY_DST, view_formats: &[], }); // Magenta (255, 0, 255, 255) so a missing-texture read visibly stands // out on-screen when the interpreter does sample it. queue.write_texture( wgpu::ImageCopyTexture { texture: &dummy_tex, mip_level: 0, origin: wgpu::Origin3d::ZERO, aspect: wgpu::TextureAspect::All, }, &[0xFFu8, 0x00, 0xFF, 0xFF], wgpu::ImageDataLayout { offset: 0, bytes_per_row: Some(4), rows_per_image: Some(1), }, wgpu::Extent3d { width: 1, height: 1, depth_or_array_layers: 1, }, ); let dummy_view = dummy_tex.create_view(&wgpu::TextureViewDescriptor::default()); let dummy_sampler = device.create_sampler(&wgpu::SamplerDescriptor { label: Some("xenos dummy sampler"), address_mode_u: wgpu::AddressMode::ClampToEdge, address_mode_v: wgpu::AddressMode::ClampToEdge, address_mode_w: wgpu::AddressMode::ClampToEdge, mag_filter: wgpu::FilterMode::Linear, min_filter: wgpu::FilterMode::Linear, mipmap_filter: wgpu::FilterMode::Nearest, ..Default::default() }); let tex_bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor { label: Some("xenos tex bind group"), layout: &tex_bgl, entries: &[ wgpu::BindGroupEntry { binding: 0, resource: wgpu::BindingResource::TextureView(&dummy_view), }, wgpu::BindGroupEntry { binding: 1, resource: wgpu::BindingResource::Sampler(&dummy_sampler), }, ], }); let bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor { label: Some("xenos bind group"), layout: &bgl, entries: &[ wgpu::BindGroupEntry { binding: 0, resource: draw_ctx_buffer.as_entire_binding(), }, wgpu::BindGroupEntry { binding: 1, resource: constants_buffer.as_entire_binding(), }, wgpu::BindGroupEntry { binding: 2, resource: vs_ucode_buffer.as_entire_binding(), }, wgpu::BindGroupEntry { binding: 3, resource: ps_ucode_buffer.as_entire_binding(), }, wgpu::BindGroupEntry { binding: 4, resource: vertex_buffer.as_entire_binding(), }, ], }); let pipeline = device.create_render_pipeline(&wgpu::RenderPipelineDescriptor { label: Some("xenos_interp pipeline"), layout: Some(&layout), vertex: wgpu::VertexState { module: &shader, entry_point: "vs_main", compilation_options: Default::default(), buffers: &[], }, fragment: Some(wgpu::FragmentState { module: &shader, entry_point: "fs_main", compilation_options: Default::default(), targets: &[Some(wgpu::ColorTargetState { format: target_format, blend: Some(wgpu::BlendState { color: wgpu::BlendComponent { src_factor: wgpu::BlendFactor::SrcAlpha, dst_factor: wgpu::BlendFactor::OneMinusSrcAlpha, operation: wgpu::BlendOperation::Add, }, alpha: wgpu::BlendComponent::OVER, }), write_mask: wgpu::ColorWrites::ALL, })], }), primitive: wgpu::PrimitiveState { topology: wgpu::PrimitiveTopology::TriangleList, strip_index_format: None, front_face: wgpu::FrontFace::Ccw, cull_mode: None, polygon_mode: wgpu::PolygonMode::Fill, unclipped_depth: false, conservative: false, }, depth_stencil: None, multisample: wgpu::MultisampleState::default(), multiview: None, cache: None, }); Self { pipeline, draw_ctx_buffer, constants_buffer, vs_ucode_buffer, ps_ucode_buffer, vertex_buffer, bind_group, tex_bind_group, tex_bgl, sampler: dummy_sampler, dummy_view, pipeline_layout: layout, translated_cache: std::collections::HashMap::new(), target_format, } } /// P7 — does the translator cache already have a pipeline for this /// (vs, ps) pair? pub fn has_translated(&self, vs_blob_key: u32, ps_blob_key: u32) -> bool { self.translated_cache .contains_key(&(vs_blob_key, ps_blob_key)) } /// P7 — fetch a cached translator pipeline. `None` if not yet built. pub fn translated_pipeline( &self, vs_blob_key: u32, ps_blob_key: u32, ) -> Option<&wgpu::RenderPipeline> { self.translated_cache .get(&(vs_blob_key, ps_blob_key)) } /// P7 — compile a translator-produced WGSL module into a /// `wgpu::RenderPipeline` and insert it into the cache keyed on /// `(vs_blob_key, ps_blob_key)`. Returns `true` on success. Duplicate /// inserts are no-ops. Emits `gpu.shader.compile_ok` on success. pub fn insert_translated( &mut self, device: &wgpu::Device, vs_blob_key: u32, ps_blob_key: u32, wgsl: &str, ) -> bool { let key = (vs_blob_key, ps_blob_key); if self.translated_cache.contains_key(&key) { return true; } let shader = match std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| { device.create_shader_module(wgpu::ShaderModuleDescriptor { label: Some("xenos translated module"), source: wgpu::ShaderSource::Wgsl(wgsl.to_string().into()), }) })) { Ok(m) => m, Err(_) => { metrics::counter!("gpu.shader.compile_err", "stage" => "module") .increment(1); return false; } }; let pipeline = device.create_render_pipeline(&wgpu::RenderPipelineDescriptor { label: Some("xenos translated pipeline"), layout: Some(&self.pipeline_layout), vertex: wgpu::VertexState { module: &shader, entry_point: "vs_main", compilation_options: Default::default(), buffers: &[], }, fragment: Some(wgpu::FragmentState { module: &shader, entry_point: "fs_main", compilation_options: Default::default(), targets: &[Some(wgpu::ColorTargetState { format: self.target_format, blend: Some(wgpu::BlendState { color: wgpu::BlendComponent { src_factor: wgpu::BlendFactor::SrcAlpha, dst_factor: wgpu::BlendFactor::OneMinusSrcAlpha, operation: wgpu::BlendOperation::Add, }, alpha: wgpu::BlendComponent::OVER, }), write_mask: wgpu::ColorWrites::ALL, })], }), primitive: wgpu::PrimitiveState { topology: wgpu::PrimitiveTopology::TriangleList, strip_index_format: None, front_face: wgpu::FrontFace::Ccw, cull_mode: None, polygon_mode: wgpu::PolygonMode::Fill, unclipped_depth: false, conservative: false, }, depth_stencil: None, multisample: wgpu::MultisampleState::default(), multiview: None, cache: None, }); self.translated_cache.insert(key, pipeline); metrics::counter!("gpu.shader.compile_ok").increment(1); true } /// Render one draw with the translator-produced pipeline instead of /// the interpreter. Mirrors [`render_one`] except the bound pipeline /// is swapped for `pipeline`. pub fn render_one_with_pipeline( &self, queue: &wgpu::Queue, encoder: &mut wgpu::CommandEncoder, target_view: &wgpu::TextureView, req: DrawRequest, pipeline: &wgpu::RenderPipeline, ) { let cb = DrawConstants { draw_index: req.draw_index, vertex_count: req.vertex_count.max(3), prim_kind: req.prim_kind, _pad: 0, }; queue.write_buffer(&self.draw_ctx_buffer, 0, bytemuck::bytes_of(&cb)); let mut pass = encoder.begin_render_pass(&wgpu::RenderPassDescriptor { label: Some("xenos translated draw"), color_attachments: &[Some(wgpu::RenderPassColorAttachment { view: target_view, resolve_target: None, ops: wgpu::Operations { load: wgpu::LoadOp::Load, store: wgpu::StoreOp::Store, }, })], depth_stencil_attachment: None, timestamp_writes: None, occlusion_query_set: None, }); pass.set_pipeline(pipeline); pass.set_bind_group(0, &self.bind_group, &[]); pass.set_bind_group(1, &self.tex_bind_group, &[]); let rounded = req.vertex_count.div_ceil(3) * 3; pass.draw(0..rounded.max(3), 0..1); } /// Number of distinct translator pipelines cached. Surfaced to the HUD. pub fn translated_pipeline_count(&self) -> usize { self.translated_cache.len() } /// P5 — swap the active texture bound at `@group(1) @binding(0)`. /// `view` is typically a wgpu texture view obtained from the /// [`TextureCacheHost`]. Pass `None` to revert to the built-in dummy /// magenta stub. pub fn set_texture_view(&mut self, device: &wgpu::Device, view: Option<&wgpu::TextureView>) { let bound = view.unwrap_or(&self.dummy_view); self.tex_bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor { label: Some("xenos tex bind group (rebind)"), layout: &self.tex_bgl, entries: &[ wgpu::BindGroupEntry { binding: 0, resource: wgpu::BindingResource::TextureView(bound), }, wgpu::BindGroupEntry { binding: 1, resource: wgpu::BindingResource::Sampler(&self.sampler), }, ], }); } /// Clear `target_view` to `color`, store. pub fn clear( &self, encoder: &mut wgpu::CommandEncoder, target_view: &wgpu::TextureView, color: [f64; 4], ) { let mut pass = encoder.begin_render_pass(&wgpu::RenderPassDescriptor { label: Some("xenos frontbuffer clear"), color_attachments: &[Some(wgpu::RenderPassColorAttachment { view: target_view, resolve_target: None, ops: wgpu::Operations { load: wgpu::LoadOp::Clear(wgpu::Color { r: color[0], g: color[1], b: color[2], a: color[3], }), store: wgpu::StoreOp::Store, }, })], depth_stencil_attachment: None, timestamp_writes: None, occlusion_query_set: None, }); let _ = &mut pass; } /// Upload shader microcode + constants once (before the batch of draws /// that share them). Skips zero-length blobs. pub fn upload_shader_and_constants( &self, queue: &wgpu::Queue, vs_ucode: &[u32], ps_ucode: &[u32], constants: &XenosConstantsBlock, ) { queue.write_buffer(&self.constants_buffer, 0, bytemuck::bytes_of(constants)); if !vs_ucode.is_empty() { let bytes: &[u8] = bytemuck::cast_slice(vs_ucode); let max = (UCODE_BUFFER_MAX_DWORDS * 4) as usize; queue.write_buffer(&self.vs_ucode_buffer, 0, &bytes[..bytes.len().min(max)]); } if !ps_ucode.is_empty() { let bytes: &[u8] = bytemuck::cast_slice(ps_ucode); let max = (UCODE_BUFFER_MAX_DWORDS * 4) as usize; queue.write_buffer(&self.ps_ucode_buffer, 0, &bytes[..bytes.len().min(max)]); } } /// Upload vertex data (as raw big-endian dwords — the WGSL side will /// bswap as needed during format unpacking). pub fn upload_vertex_data(&self, queue: &wgpu::Queue, data: &[u32]) { if data.is_empty() { return; } let bytes: &[u8] = bytemuck::cast_slice(data); let max = VERTEX_BUFFER_MAX_BYTES as usize; queue.write_buffer(&self.vertex_buffer, 0, &bytes[..bytes.len().min(max)]); } /// Render one captured draw. pub fn render_one( &self, queue: &wgpu::Queue, encoder: &mut wgpu::CommandEncoder, target_view: &wgpu::TextureView, req: DrawRequest, ) { let cb = DrawConstants { draw_index: req.draw_index, vertex_count: req.vertex_count.max(3), prim_kind: req.prim_kind, _pad: 0, }; queue.write_buffer(&self.draw_ctx_buffer, 0, bytemuck::bytes_of(&cb)); let mut pass = encoder.begin_render_pass(&wgpu::RenderPassDescriptor { label: Some("xenos draw"), color_attachments: &[Some(wgpu::RenderPassColorAttachment { view: target_view, resolve_target: None, ops: wgpu::Operations { load: wgpu::LoadOp::Load, store: wgpu::StoreOp::Store, }, })], depth_stencil_attachment: None, timestamp_writes: None, occlusion_query_set: None, }); pass.set_pipeline(&self.pipeline); pass.set_bind_group(0, &self.bind_group, &[]); pass.set_bind_group(1, &self.tex_bind_group, &[]); let rounded = req.vertex_count.div_ceil(3) * 3; pass.draw(0..rounded.max(3), 0..1); } } #[cfg(test)] mod tests { use super::*; #[test] fn draw_constants_layout_matches_wgsl_uniform() { assert_eq!(std::mem::size_of::(), 16); } }