diff --git a/crates/xenia-gpu/src/draw_capture.rs b/crates/xenia-gpu/src/draw_capture.rs index 0f73e88..979dc23 100644 --- a/crates/xenia-gpu/src/draw_capture.rs +++ b/crates/xenia-gpu/src/draw_capture.rs @@ -73,6 +73,22 @@ pub struct DrawCapture { /// magenta stub. Empty for flat (no-tfetch) draws. Populated by /// `gpu_system` after decode (left empty by `build`). pub textures: Vec<(crate::texture_cache::TextureKey, Vec)>, + /// iterate-3Y: per-draw color/blend render state captured from the + /// register file so the host pipeline composites the way the guest + /// intends (instead of one fixed alpha-blend state). Mirrors the fields + /// canary feeds into `GetCurrentStateDescription` (D3D12 + /// `pipeline_cache.cc`): + /// * `blend_control` = `RB_BLENDCONTROL0` (RT0 src/dst factors + op, + /// color and alpha). The Xbox 360 has no separate "blend enable" bit; + /// `One,Zero,Add` *is* the opaque case. + /// * `color_mask` = RT0 nibble of `RB_COLOR_MASK` (per-channel write + /// enable). When 0, canary forces `One,Zero` (no blend). + /// * `color_control` = `RB_COLORCONTROL` (alpha-test enable/func). + /// * `depth_control` = `RB_DEPTHCONTROL` (z-test enable/func/write). + pub blend_control: u32, + pub color_mask: u8, + pub color_control: u32, + pub depth_control: u32, } /// iterate-3S: compute the guest→host NDC XY transform for a draw, mirroring @@ -309,6 +325,13 @@ pub fn build( None => (Vec::new(), 0, false), }; let (ndc_scale, ndc_offset) = compute_ndc_xy(rf); + // iterate-3Y: capture RT0 color/blend/depth render state. Registers per + // canary `registers.h`: RB_BLENDCONTROL0=0x2201, RB_COLOR_MASK=0x2104 + // (RT0 = bits[3:0]), RB_COLORCONTROL=0x2202, RB_DEPTHCONTROL=0x2200. 
+ const RB_BLENDCONTROL_0: u32 = 0x2201; + const RB_COLOR_MASK: u32 = 0x2104; + const RB_COLORCONTROL: u32 = 0x2202; + const RB_DEPTHCONTROL: u32 = 0x2200; DrawCapture { draw_index, prim_code: prim_code(primitive), @@ -321,5 +344,9 @@ pub fn build( ndc_scale, ndc_offset, textures: Vec::new(), + blend_control: rf.read(RB_BLENDCONTROL_0), + color_mask: (rf.read(RB_COLOR_MASK) & 0xF) as u8, + color_control: rf.read(RB_COLORCONTROL), + depth_control: rf.read(RB_DEPTHCONTROL), } } diff --git a/crates/xenia-ui/src/render.rs b/crates/xenia-ui/src/render.rs index 47c07ea..921789a 100644 --- a/crates/xenia-ui/src/render.rs +++ b/crates/xenia-ui/src/render.rs @@ -668,25 +668,32 @@ impl RenderState { ndc_scale: [0.0, 0.0], ndc_offset: [0.0, 0.0], }; + // Synthetic visualizer path (legacy): no captured render state, so + // use the opaque default. + let rstate = crate::xenos_pipeline::RenderState::OPAQUE; if use_translated - && let Some(p) = self.xenos_pipeline.translated_pipeline(vs_key, ps_key) { - self.xenos_pipeline.render_one_with_pipeline( - &self.queue, - &mut encoder, - &self.frontbuffer_view, - req, - p, - ); - metrics::counter!("gpu.shader.use", "path" => "translator") - .increment(1); - served_translator += 1; - continue; - } + && self.xenos_pipeline.render_one_translated( + &self.device, + &self.queue, + &mut encoder, + &self.frontbuffer_view, + req, + vs_key, + ps_key, + rstate, + ) + { + metrics::counter!("gpu.shader.use", "path" => "translator").increment(1); + served_translator += 1; + continue; + } self.xenos_pipeline.render_one( + &self.device, &self.queue, &mut encoder, &self.frontbuffer_view, req, + rstate, ); metrics::counter!("gpu.shader.use", "path" => "interpreter").increment(1); served_interpreter += 1; @@ -836,29 +843,41 @@ impl RenderState { ndc_scale: if cap.has_real_vertices { cap.ndc_scale } else { [0.0, 0.0] }, ndc_offset: if cap.has_real_vertices { cap.ndc_offset } else { [0.0, 0.0] }, }; + // iterate-3Y: replay this draw's real 
color/blend/write-mask state + // (captured from `RB_BLENDCONTROL0` / `RB_COLOR_MASK`) so overlays + // composite the way the guest intends instead of opaquely + // overwriting the logo. + let rstate = crate::xenos_pipeline::RenderState { + blend_control: cap.blend_control, + color_mask: cap.color_mask, + }; let mut encoder = self .device .create_command_encoder(&wgpu::CommandEncoderDescriptor { label: Some("xenos capture replay (per-draw)"), }); - if use_translated - && let Some(p) = self.xenos_pipeline.translated_pipeline(cap.vs_key, cap.ps_key) - { - self.xenos_pipeline.render_one_with_pipeline( + let served_translated = use_translated + && self.xenos_pipeline.render_one_translated( + &self.device, &self.queue, &mut encoder, &self.frontbuffer_view, req, - p, + cap.vs_key, + cap.ps_key, + rstate, ); + if served_translated { self.xenos_dispatches_translator = self.xenos_dispatches_translator.saturating_add(1); } else { self.xenos_pipeline.render_one( + &self.device, &self.queue, &mut encoder, &self.frontbuffer_view, req, + rstate, ); self.xenos_dispatches_interpreter = self.xenos_dispatches_interpreter.saturating_add(1); diff --git a/crates/xenia-ui/src/xenos_pipeline.rs b/crates/xenia-ui/src/xenos_pipeline.rs index f22373f..99d5782 100644 --- a/crates/xenia-ui/src/xenos_pipeline.rs +++ b/crates/xenia-ui/src/xenos_pipeline.rs @@ -46,6 +46,134 @@ struct DrawConstants { ndc_offset: [f32; 2], } +/// iterate-3Y: the per-draw host color/blend/write-mask render state, decoded +/// from the guest registers (`RB_BLENDCONTROL0` / `RB_COLOR_MASK`). Used both +/// as part of the pipeline-cache key and to build the `wgpu::ColorTargetState`. +/// Mirrors canary's `GetColorBlendStateForRenderTarget` (D3D12 +/// `pipeline_cache.cc`): the factors come straight from `RB_BLENDCONTROL`, +/// and a zero write-mask forces the no-blend `One,Zero` equation. +#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)] +pub struct RenderState { + /// `RB_BLENDCONTROL0` raw value (RT0). 
`0x00010001` (One,Zero / One,Zero, + /// Add) is the opaque case. + pub blend_control: u32, + /// RT0 nibble of `RB_COLOR_MASK` (bit0=R … bit3=A). 0 = write nothing. + pub color_mask: u8, +} + +impl RenderState { + /// Fully-opaque, all-channels overwrite (not the legacy alpha-blend state). + /// Used for procedural/synthetic draws that have no captured guest state. + pub const OPAQUE: RenderState = RenderState { + blend_control: 0x0001_0001, + color_mask: 0xF, + }; + + /// Map a Xenos `BlendFactor` (5-bit field) to a wgpu `BlendFactor`, + /// mirroring canary `kBlendFactorMap` (D3D12 `pipeline_cache.cc:1504`). + fn map_factor(f: u32) -> wgpu::BlendFactor { + match f { + 0 => wgpu::BlendFactor::Zero, + 1 => wgpu::BlendFactor::One, + 4 => wgpu::BlendFactor::Src, + 5 => wgpu::BlendFactor::OneMinusSrc, + 6 => wgpu::BlendFactor::SrcAlpha, + 7 => wgpu::BlendFactor::OneMinusSrcAlpha, + 8 => wgpu::BlendFactor::Dst, + 9 => wgpu::BlendFactor::OneMinusDst, + 10 => wgpu::BlendFactor::DstAlpha, + 11 => wgpu::BlendFactor::OneMinusDstAlpha, + 12 => wgpu::BlendFactor::Constant, + 13 => wgpu::BlendFactor::OneMinusConstant, + 14 => wgpu::BlendFactor::Constant, + 15 => wgpu::BlendFactor::OneMinusConstant, + 16 => wgpu::BlendFactor::SrcAlphaSaturated, + // 2/3 and >16 are undefined on Xenos; canary maps to Zero. + _ => wgpu::BlendFactor::Zero, + } + } + + /// Map a Xenos `BlendFactor` for the *alpha* channel, mirroring canary + /// `kBlendFactorAlphaMap` (color-mode factors collapse to alpha). 
+ fn map_factor_alpha(f: u32) -> wgpu::BlendFactor { + match f { + 4 => wgpu::BlendFactor::SrcAlpha, + 5 => wgpu::BlendFactor::OneMinusSrcAlpha, + 8 => wgpu::BlendFactor::DstAlpha, + 9 => wgpu::BlendFactor::OneMinusDstAlpha, + other => Self::map_factor(other), + } + } + + fn map_op(o: u32) -> wgpu::BlendOperation { + match o { + 0 => wgpu::BlendOperation::Add, + 1 => wgpu::BlendOperation::Subtract, + 2 => wgpu::BlendOperation::Min, + 3 => wgpu::BlendOperation::Max, + 4 => wgpu::BlendOperation::ReverseSubtract, + _ => wgpu::BlendOperation::Add, + } + } + + /// Build the `wgpu::ColorTargetState` for this draw. + fn color_target(&self, format: wgpu::TextureFormat) -> wgpu::ColorTargetState { + let bc = self.blend_control; + let color_src = bc & 0x1F; + let color_op = (bc >> 5) & 0x7; + let color_dst = (bc >> 8) & 0x1F; + let alpha_src = (bc >> 16) & 0x1F; + let alpha_op = (bc >> 21) & 0x7; + let alpha_dst = (bc >> 24) & 0x1F; + + // `One,Zero,Add` on both color and alpha is the opaque case (canary's + // no-blend); express it as `blend: None`, a plain overwrite. NOTE(review): + // Min/Max ops keep guest factors; WebGPU wants `One` for those — confirm. 
+ let is_opaque = color_src == 1 + && color_dst == 0 + && color_op == 0 + && alpha_src == 1 + && alpha_dst == 0 + && alpha_op == 0; + let blend = if is_opaque { + None + } else { + Some(wgpu::BlendState { + color: wgpu::BlendComponent { + src_factor: Self::map_factor(color_src), + dst_factor: Self::map_factor(color_dst), + operation: Self::map_op(color_op), + }, + alpha: wgpu::BlendComponent { + src_factor: Self::map_factor_alpha(alpha_src), + dst_factor: Self::map_factor_alpha(alpha_dst), + operation: Self::map_op(alpha_op), + }, + }) + }; + + let mut write_mask = wgpu::ColorWrites::empty(); + if self.color_mask & 0x1 != 0 { + write_mask |= wgpu::ColorWrites::RED; + } + if self.color_mask & 0x2 != 0 { + write_mask |= wgpu::ColorWrites::GREEN; + } + if self.color_mask & 0x4 != 0 { + write_mask |= wgpu::ColorWrites::BLUE; + } + if self.color_mask & 0x8 != 0 { + write_mask |= wgpu::ColorWrites::ALPHA; + } + + wgpu::ColorTargetState { + format, + blend, + write_mask, + } + } +} + /// Submitted to [`XenosPipeline::render_one`] to render one captured draw. #[derive(Clone, Copy, Debug)] pub struct DrawRequest { @@ -71,7 +199,16 @@ const UCODE_BUFFER_MAX_DWORDS: u64 = 16 * 1024; // 64 KB each for VS & PS const VERTEX_BUFFER_MAX_BYTES: u64 = 16 * 1024 * 1024; pub struct XenosPipeline { + /// Interpreter pipeline with the legacy fixed (alpha-blend) state. Kept as + /// the default; per-state variants are built lazily in `interp_cache`. pipeline: wgpu::RenderPipeline, + /// iterate-3Y: the interpreter WGSL module, retained so per-render-state + /// interpreter pipelines can be compiled on demand. + interp_shader: wgpu::ShaderModule, + /// iterate-3Y: interpreter pipelines keyed on the per-draw `RenderState` + /// (blend + write mask), so flat/alpha/opaque draws composite correctly + /// even when their (vs,ps) didn't translate. 
+ interp_cache: std::collections::HashMap<RenderState, wgpu::RenderPipeline>, draw_ctx_buffer: wgpu::Buffer, constants_buffer: wgpu::Buffer, vs_ucode_buffer: wgpu::Buffer, @@ -92,7 +229,12 @@ pub struct XenosPipeline { /// so every (vs, ps) pair gets compiled once and re-used for every /// subsequent draw. Interpreter pipeline remains the fallback. pipeline_layout: wgpu::PipelineLayout, - translated_cache: std::collections::HashMap<(u32, u32), wgpu::RenderPipeline>, + /// iterate-3Y: cached translator pipelines keyed on the shader pair AND the + /// per-draw render state, so the same (vs,ps) with different blend/mask + /// composites correctly. The translated WGSL module is itself cached per + /// (vs,ps) so re-translation only happens once. + translated_cache: std::collections::HashMap<(u32, u32, RenderState), wgpu::RenderPipeline>, + translated_modules: std::collections::HashMap<(u32, u32), wgpu::ShaderModule>, pub target_format: wgpu::TextureFormat, } @@ -258,8 +400,13 @@ impl XenosPipeline { usage: wgpu::TextureUsages::TEXTURE_BINDING | wgpu::TextureUsages::COPY_DST, view_formats: &[], }); - // Magenta (255, 0, 255, 255) so a missing-texture read visibly stands - // out on-screen when the interpreter does sample it. + // iterate-3Y: transparent black (0,0,0,0). When a textured draw's + // real texture can't be resolved (e.g. its sampler slot is shadowed by + // a vertex-fetch constant), sampling a *transparent* texel makes the + // draw a no-op under its real premultiplied-alpha blend — instead of + // fabricating an opaque magenta that overpaints everything (the old + // debug stub). This removes a fake rather than adding one: we never + // invent visible pixels for an unresolved texture. 
queue.write_texture( wgpu::ImageCopyTexture { texture: &dummy_tex, @@ -267,7 +414,7 @@ impl XenosPipeline { origin: wgpu::Origin3d::ZERO, aspect: wgpu::TextureAspect::All, }, - &[0xFFu8, 0x00, 0xFF, 0xFF], + &[0x00u8, 0x00, 0x00, 0x00], wgpu::ImageDataLayout { offset: 0, bytes_per_row: Some(4), @@ -375,6 +522,8 @@ impl XenosPipeline { Self { pipeline, + interp_shader: shader, + interp_cache: std::collections::HashMap::new(), draw_ctx_buffer, constants_buffer, vs_ucode_buffer, @@ -387,31 +536,22 @@ impl XenosPipeline { dummy_view, pipeline_layout: layout, translated_cache: std::collections::HashMap::new(), + translated_modules: std::collections::HashMap::new(), target_format, } } - /// P7 — does the translator cache already have a pipeline for this - /// (vs, ps) pair? + /// P7 — has the translator already produced a WGSL *module* for this + /// (vs, ps) pair? (A per-render-state pipeline may still need building.) pub fn has_translated(&self, vs_blob_key: u32, ps_blob_key: u32) -> bool { - self.translated_cache + self.translated_modules .contains_key(&(vs_blob_key, ps_blob_key)) } - /// P7 — fetch a cached translator pipeline. `None` if not yet built. - pub fn translated_pipeline( - &self, - vs_blob_key: u32, - ps_blob_key: u32, - ) -> Option<&wgpu::RenderPipeline> { - self.translated_cache - .get(&(vs_blob_key, ps_blob_key)) - } - - /// P7 — compile a translator-produced WGSL module into a - /// `wgpu::RenderPipeline` and insert it into the cache keyed on - /// `(vs_blob_key, ps_blob_key)`. Returns `true` on success. Duplicate - /// inserts are no-ops. Emits `gpu.shader.compile_ok` on success. + /// P7 — compile a translator-produced WGSL module and cache it keyed on + /// `(vs_blob_key, ps_blob_key)`. The actual `RenderPipeline` (which also + /// depends on the per-draw blend/mask state) is built lazily by + /// [`render_one_translated`]. Returns `true` on success. 
pub fn insert_translated( &mut self, device: &wgpu::Device, @@ -420,7 +560,7 @@ impl XenosPipeline { wgsl: &str, ) -> bool { let key = (vs_blob_key, ps_blob_key); - if self.translated_cache.contains_key(&key) { + if self.translated_modules.contains_key(&key) { return true; } let shader = match std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| { @@ -436,31 +576,42 @@ impl XenosPipeline { return false; } }; + self.translated_modules.insert(key, shader); + metrics::counter!("gpu.shader.compile_ok").increment(1); + true + } + + /// iterate-3Y: ensure a translator pipeline exists for `(vs,ps,rstate)`, + /// building it from the cached module + the per-draw color/blend target. + fn ensure_translated_for_state( + &mut self, + device: &wgpu::Device, + vs_key: u32, + ps_key: u32, + rstate: RenderState, + ) -> bool { + let pkey = (vs_key, ps_key, rstate); + if self.translated_cache.contains_key(&pkey) { + return true; + } + let Some(module) = self.translated_modules.get(&(vs_key, ps_key)) else { + return false; + }; + let target = rstate.color_target(self.target_format); let pipeline = device.create_render_pipeline(&wgpu::RenderPipelineDescriptor { label: Some("xenos translated pipeline"), layout: Some(&self.pipeline_layout), vertex: wgpu::VertexState { - module: &shader, + module, entry_point: "vs_main", compilation_options: Default::default(), buffers: &[], }, fragment: Some(wgpu::FragmentState { - module: &shader, + module, entry_point: "fs_main", compilation_options: Default::default(), - targets: &[Some(wgpu::ColorTargetState { - format: self.target_format, - blend: Some(wgpu::BlendState { - color: wgpu::BlendComponent { - src_factor: wgpu::BlendFactor::SrcAlpha, - dst_factor: wgpu::BlendFactor::OneMinusSrcAlpha, - operation: wgpu::BlendOperation::Add, - }, - alpha: wgpu::BlendComponent::OVER, - }), - write_mask: wgpu::ColorWrites::ALL, - })], + targets: &[Some(target)], }), primitive: wgpu::PrimitiveState { topology: wgpu::PrimitiveTopology::TriangleList, @@ 
-476,22 +627,65 @@ impl XenosPipeline { multiview: None, cache: None, }); - self.translated_cache.insert(key, pipeline); - metrics::counter!("gpu.shader.compile_ok").increment(1); + self.translated_cache.insert(pkey, pipeline); true } - /// Render one draw with the translator-produced pipeline instead of - /// the interpreter. Mirrors [`render_one`] except the bound pipeline - /// is swapped for `pipeline`. - pub fn render_one_with_pipeline( - &self, + /// iterate-3Y: ensure an interpreter pipeline exists for `rstate`. + fn ensure_interp_for_state(&mut self, device: &wgpu::Device, rstate: RenderState) { + if self.interp_cache.contains_key(&rstate) { + return; + } + let target = rstate.color_target(self.target_format); + let pipeline = device.create_render_pipeline(&wgpu::RenderPipelineDescriptor { + label: Some("xenos interp pipeline (per-state)"), + layout: Some(&self.pipeline_layout), + vertex: wgpu::VertexState { + module: &self.interp_shader, + entry_point: "vs_main", + compilation_options: Default::default(), + buffers: &[], + }, + fragment: Some(wgpu::FragmentState { + module: &self.interp_shader, + entry_point: "fs_main", + compilation_options: Default::default(), + targets: &[Some(target)], + }), + primitive: wgpu::PrimitiveState { + topology: wgpu::PrimitiveTopology::TriangleList, + strip_index_format: None, + front_face: wgpu::FrontFace::Ccw, + cull_mode: None, + polygon_mode: wgpu::PolygonMode::Fill, + unclipped_depth: false, + conservative: false, + }, + depth_stencil: None, + multisample: wgpu::MultisampleState::default(), + multiview: None, + cache: None, + }); + self.interp_cache.insert(rstate, pipeline); + } + + /// iterate-3Y: render one draw through the translator pipeline built for + /// this draw's render state. Returns `false` if no module is cached for + /// `(vs,ps)` (caller should fall back to the interpreter). 
+ pub fn render_one_translated( + &mut self, + device: &wgpu::Device, queue: &wgpu::Queue, encoder: &mut wgpu::CommandEncoder, target_view: &wgpu::TextureView, req: DrawRequest, - pipeline: &wgpu::RenderPipeline, - ) { + vs_key: u32, + ps_key: u32, + rstate: RenderState, + ) -> bool { + if !self.ensure_translated_for_state(device, vs_key, ps_key, rstate) { + return false; + } let cb = DrawConstants { draw_index: req.draw_index, vertex_count: req.vertex_count.max(3), @@ -501,7 +695,10 @@ impl XenosPipeline { ndc_offset: req.ndc_offset, }; queue.write_buffer(&self.draw_ctx_buffer, 0, bytemuck::bytes_of(&cb)); - + let pipeline = self + .translated_cache + .get(&(vs_key, ps_key, rstate)) + .expect("just ensured"); let mut pass = encoder.begin_render_pass(&wgpu::RenderPassDescriptor { label: Some("xenos translated draw"), color_attachments: &[Some(wgpu::RenderPassColorAttachment { @@ -521,6 +718,7 @@ impl XenosPipeline { pass.set_bind_group(1, &self.tex_bind_group, &[]); let rounded = req.vertex_count.div_ceil(3) * 3; pass.draw(0..rounded.max(3), 0..1); + true } /// Number of distinct translator pipelines cached. Surfaced to the HUD. @@ -612,14 +810,20 @@ impl XenosPipeline { queue.write_buffer(&self.vertex_buffer, 0, &bytes[..bytes.len().min(max)]); } - /// Render one captured draw. + /// Render one captured draw through the interpreter, using the per-draw + /// `rstate` (blend/write-mask) so flat draws composite correctly even + /// when their (vs,ps) didn't translate. `RenderState::OPAQUE` is a plain + /// overwrite (no blending) for procedural/synthetic draws. 
pub fn render_one( - &self, + &mut self, + device: &wgpu::Device, queue: &wgpu::Queue, encoder: &mut wgpu::CommandEncoder, target_view: &wgpu::TextureView, req: DrawRequest, + rstate: RenderState, ) { + self.ensure_interp_for_state(device, rstate); let cb = DrawConstants { draw_index: req.draw_index, vertex_count: req.vertex_count.max(3), @@ -630,6 +834,10 @@ impl XenosPipeline { }; queue.write_buffer(&self.draw_ctx_buffer, 0, bytemuck::bytes_of(&cb)); + let pipeline = self + .interp_cache + .get(&rstate) + .expect("just ensured"); let mut pass = encoder.begin_render_pass(&wgpu::RenderPassDescriptor { label: Some("xenos draw"), color_attachments: &[Some(wgpu::RenderPassColorAttachment { @@ -644,7 +852,7 @@ impl XenosPipeline { timestamp_writes: None, occlusion_query_set: None, }); - pass.set_pipeline(&self.pipeline); + pass.set_pipeline(pipeline); pass.set_bind_group(0, &self.bind_group, &[]); pass.set_bind_group(1, &self.tex_bind_group, &[]); let rounded = req.vertex_count.div_ceil(3) * 3;