diff --git a/crates/xenia-app/src/main.rs b/crates/xenia-app/src/main.rs index 78f62c6..5a41154 100644 --- a/crates/xenia-app/src/main.rs +++ b/crates/xenia-app/src/main.rs @@ -4423,6 +4423,12 @@ fn run_with_ui( .map_err(|e| anyhow::anyhow!("winit event loop build failed: {e}"))?; let (ui_handles, kernel_bridge) = xenia_ui::build(event_loop.create_proxy()); kernel.ui = Some(kernel_bridge); + // iterate-3O: enable per-draw geometry capture so the UI can replay real + // guest draws. Only on the `--ui` path; headless `check` never gets here, + // so the deterministic core/golden stays untouched. + if let Some(gpu) = kernel.gpu.as_inline_mut() { + gpu.enable_frame_capture(); + } let shutdown = std::sync::Arc::clone(&ui_handles.shutdown); let title_owned = std::path::Path::new(title) diff --git a/crates/xenia-gpu/src/draw_capture.rs b/crates/xenia-gpu/src/draw_capture.rs new file mode 100644 index 0000000..2453e01 --- /dev/null +++ b/crates/xenia-gpu/src/draw_capture.rs @@ -0,0 +1,175 @@ +//! Per-draw geometry capture for the host UI's faithful-render path. +//! +//! The deterministic headless core (`check --gpu-inline`) never touches this +//! module — it is populated only when a UI bridge is installed and consumed +//! only by `crates/xenia-ui`. The goal is to hand the UI the *real* guest +//! geometry behind each `PM4_DRAW_INDX*` packet so it can rasterize the +//! actual splash vertices instead of synthetic placeholder shapes. +//! +//! What the WGSL pipeline needs to reconstruct one draw (see +//! `shaders/xenos_interp.wgsl` `vs_main` / `interpret_vertex_fetch`): +//! * the active VS/PS blob keys (already published as assets), +//! * the primitive type + the host vertex count to issue, +//! * the raw guest vertex-buffer bytes for the fetched window, and +//! * the *dword base* of that window so the shader can rebase the absolute +//! fetch-constant address into the uploaded buffer. +//! +//! The hard part is sourcing the vertex window: the VS reads a vertex-fetch +//! constant (`xe_gpu_vertex_fetch_t`) whose dword-0 carries the absolute +//! guest dword address. We parse the active VS, find its first vertex fetch, +//! read that fetch constant out of the register file, then copy a bounded +//! window of guest memory starting at the fetch base. + +use xenia_memory::access::MemoryAccess; + +use crate::draw_state::{IndexSize, IndexSource, PrimitiveType}; +use crate::register_file::RegisterFile; + +/// Texture-fetch / vertex-fetch constant region base, in register indices. +/// Each fetch constant is 6 dwords (`xe_gpu_*_fetch_t`). +const CONST_BASE_FETCH: u32 = 0x4800; + +/// Upper bound (in dwords) on the vertex window we copy per draw. The splash +/// UI draws are tiny (3–4 verts × ≤4 dwords); 64 KiB of dwords is generous +/// slack while bounding the per-frame copy cost and the 16 MiB host buffer. +const MAX_WINDOW_DWORDS: u32 = 16 * 1024; + +/// One captured draw, with enough real state for the UI to replay it through +/// the existing wgpu Xenos pipeline. +#[derive(Clone, Debug)] +pub struct DrawCapture { + /// Monotonic global draw index (matches `GpuStats::draws_seen` at capture). + pub draw_index: u32, + /// Xenos primitive-type code (see `SwapInfo::last_draw_prim` encoding). + pub prim_code: u32, + /// Host vertex count to issue (post primitive-processor rewrite). + pub host_vertex_count: u32, + /// Active VS blob key at draw time (0 = none). + pub vs_key: u32, + /// Active PS blob key at draw time (0 = none). + pub ps_key: u32, + /// Raw guest dwords of the fetched vertex window (host-endian as stored in + /// guest memory — the WGSL applies the per-format endian swap). `addr 0` + /// of this buffer corresponds to guest dword `window_base_dwords`. + pub vertex_dwords: Vec, + /// Guest dword address that maps to index 0 of `vertex_dwords`. The shader + /// subtracts this from the fetch-constant base to index `vertex_dwords`. + pub window_base_dwords: u32, + /// `true` when we successfully resolved a real vertex window. When `false` + /// the UI falls back to its procedural geometry for this draw (honest: + /// nothing faked, just "couldn't source real vertices"). + pub has_real_vertices: bool, +} + +/// Encode a [`PrimitiveType`] as the raw Xenos code used across the bridge. +pub fn prim_code(p: PrimitiveType) -> u32 { + match p { + PrimitiveType::None => 0, + PrimitiveType::PointList => 1, + PrimitiveType::LineList => 2, + PrimitiveType::LineStrip => 3, + PrimitiveType::TriangleList => 4, + PrimitiveType::TriangleFan => 5, + PrimitiveType::TriangleStrip => 6, + PrimitiveType::RectangleList => 8, + PrimitiveType::QuadList => 13, + PrimitiveType::Unknown(x) => x as u32, + } +} + +/// Resolve the first vertex-fetch window referenced by the parsed VS. +/// +/// Walks the VS instruction stream for the first `vfetch` (mini) instruction, +/// reads its fetch constant from `rf`, and copies a bounded window of guest +/// memory starting at the fetch base. Returns `(dwords, window_base_dwords)` +/// or `None` if the VS has no vertex fetch or the constant is malformed. +fn resolve_vertex_window( + parsed_vs: &crate::ucode::ParsedShader, + rf: &RegisterFile, + mem: &dyn MemoryAccess, +) -> Option<(Vec, u32)> { + // The instruction block is 3 dwords per ALU/fetch triple. We don't have + // per-triple kind flags here, so we scan every triple and accept the + // first one that decodes as a *vertex* fetch with a plausible constant. + let instrs = &parsed_vs.instructions; + let mut fetch_const: Option = None; + let mut t = 0usize; + while t + 2 < instrs.len() { + let w0 = instrs[t]; + let w1 = instrs[t + 1]; + let w2 = instrs[t + 2]; + if let crate::ucode::fetch::FetchInstruction::Vertex(vf) = + crate::ucode::fetch::decode_fetch([w0, w1, w2]) + { + // Validate the referenced fetch constant is a real vertex fetch + // (type==3, kVertex) before trusting it. + let fc = vf.fetch_const as u32; + let dword0 = rf.read(CONST_BASE_FETCH + fc * 6); + if dword0 & 0x3 == 3 { + fetch_const = Some(vf.fetch_const); + break; + } + } + t += 3; + } + let fc = fetch_const? as u32; + let dword0 = rf.read(CONST_BASE_FETCH + fc * 6); + let dword1 = rf.read(CONST_BASE_FETCH + fc * 6 + 1); + // address:30 at bits[31:2] of dword0 (in bytes once masked). + let base_bytes = dword0 & 0xFFFF_FFFC; + if base_bytes == 0 { + return None; + } + // size:24 at bits[25:2] of dword1, in dwords. Clamp to our window cap. + let size_dwords = ((dword1 >> 2) & 0x00FF_FFFF).clamp(1, MAX_WINDOW_DWORDS); + let window_base_dwords = base_bytes >> 2; + let mut dwords = Vec::with_capacity(size_dwords as usize); + for i in 0..size_dwords { + let addr = base_bytes.wrapping_add(i * 4); + if addr < base_bytes { + break; // wrap guard + } + // `read_u32` composes big-endian bytes into the u32 value; the WGSL's + // `gpu_swap` expects the *raw little-endian dword* as it sits in guest + // memory, so undo the BE composition with `swap_bytes`. + dwords.push(mem.read_u32(addr).swap_bytes()); + } + if dwords.is_empty() { + return None; + } + Some((dwords, window_base_dwords)) +} + +/// Build a [`DrawCapture`] for one draw. Best-effort: when the vertex window +/// can't be resolved, `has_real_vertices` is `false` and the UI falls back to +/// procedural geometry (never fabricated pixels). +#[allow(clippy::too_many_arguments)] +pub fn build( + draw_index: u32, + primitive: PrimitiveType, + host_vertex_count: u32, + _index_source: IndexSource, + _index_size: IndexSize, + vs_key: u32, + ps_key: u32, + parsed_vs: Option<&crate::ucode::ParsedShader>, + rf: &RegisterFile, + mem: &dyn MemoryAccess, +) -> DrawCapture { + let (vertex_dwords, window_base_dwords, has_real) = match parsed_vs + .and_then(|vs| resolve_vertex_window(vs, rf, mem)) + { + Some((d, base)) => (d, base, true), + None => (Vec::new(), 0, false), + }; + DrawCapture { + draw_index, + prim_code: prim_code(primitive), + host_vertex_count, + vs_key, + ps_key, + vertex_dwords, + window_base_dwords, + has_real_vertices: has_real, + } +} diff --git a/crates/xenia-gpu/src/gpu_system.rs b/crates/xenia-gpu/src/gpu_system.rs index 28a92e5..9ab5768 100644 --- a/crates/xenia-gpu/src/gpu_system.rs +++ b/crates/xenia-gpu/src/gpu_system.rs @@ -436,6 +436,12 @@ pub struct GpuSystem { /// `GpuSystem::new` and lives for the whole GPU lifetime — no /// per-frame churn. pub edram: crate::edram::ShadowEdram, + /// UI-only: when `Some`, every `PM4_DRAW_INDX*` appends a + /// [`crate::draw_capture::DrawCapture`] here so the host UI can replay the + /// real guest geometry. `None` in headless/deterministic mode — the + /// `--gpu-inline` golden never enables this, so capture is entirely inert + /// for `check`. Drained (taken) by `vd_swap` at each present. + pub frame_captures: Option>, } impl GpuSystem { @@ -463,6 +469,15 @@ impl GpuSystem { texture_cache: crate::texture_cache::TextureCache::new(), last_draw_textures: Vec::new(), edram: crate::edram::ShadowEdram::new(), + frame_captures: None, + } + } + + /// Enable per-draw geometry capture for the host UI. Inert (and never + /// called) in headless/deterministic mode. Idempotent. + pub fn enable_frame_capture(&mut self) { + if self.frame_captures.is_none() { + self.frame_captures = Some(Vec::new()); } } @@ -1295,8 +1310,56 @@ impl GpuSystem { "gpu: DRAW_INDX captured" ); self.last_draw = Some(ds); + let host_vertex_count = processed.host_vertex_count; self.last_primitive = Some(processed); + // iterate-3O: UI-only per-draw geometry capture. Resolves the + // real guest vertex window behind this draw (from the active + // VS's vertex-fetch constant) so the host UI can replay the + // actual splash geometry instead of synthetic shapes. Entirely + // inert in headless/deterministic mode (`frame_captures` is + // `None`), so the `--gpu-inline` golden is unaffected. + if self.frame_captures.is_some() { + let vs_key = self.active_vs_key.unwrap_or(0); + let ps_key = self.active_ps_key.unwrap_or(0); + let parsed_vs = self + .active_vs_key + .and_then(|k| self.shader_blobs.get(&k)) + .map(|b| crate::ucode::parse_shader(&b.dwords)); + let (idx_src, idx_size) = match ds.index_source { + crate::draw_state::IndexSource::Dma { index_size, .. } => { + (ds.index_source, index_size) + } + crate::draw_state::IndexSource::Immediate { index_size } => { + (ds.index_source, index_size) + } + crate::draw_state::IndexSource::AutoIndex => { + (ds.index_source, crate::draw_state::IndexSize::Sixteen) + } + }; + let cap = crate::draw_capture::build( + self.stats.draws_seen as u32, + ds.primitive, + host_vertex_count, + idx_src, + idx_size, + vs_key, + ps_key, + parsed_vs.as_ref(), + &self.register_file, + mem, + ); + if let Some(caps) = self.frame_captures.as_mut() { + // Bound the per-frame list so a runaway frame can't grow + // host memory without limit; keep the most recent. + const MAX_CAPS: usize = 4096; + if caps.len() >= MAX_CAPS { + caps.remove(0); + } + caps.push(cap); + } + } + // P5b: decode the textures the *active pixel shader* actually // samples. Parse the bound PS, collect its `tfetch` // fetch-constant slots, read each 6-dword fetch constant from diff --git a/crates/xenia-gpu/src/lib.rs b/crates/xenia-gpu/src/lib.rs index a7539d9..b1dbace 100644 --- a/crates/xenia-gpu/src/lib.rs +++ b/crates/xenia-gpu/src/lib.rs @@ -12,6 +12,7 @@ //! [`gpu_system::GpuSystem`]. pub mod command_processor; +pub mod draw_capture; pub mod draw_state; pub mod edram; pub mod gpu_system; diff --git a/crates/xenia-gpu/src/shaders/xenos_interp.wgsl b/crates/xenia-gpu/src/shaders/xenos_interp.wgsl index 7b1d969..1eca656 100644 --- a/crates/xenia-gpu/src/shaders/xenos_interp.wgsl +++ b/crates/xenia-gpu/src/shaders/xenos_interp.wgsl @@ -20,7 +20,12 @@ struct XenosDrawConstants { draw_index: u32, vertex_count: u32, prim_kind: u32, - _pad: u32, + // iterate-3O: guest dword address that maps to index 0 of `vertex_buffer`. + // The CPU uploads a bounded guest-memory window starting at the active + // vertex-fetch base; the shader subtracts this base from the absolute + // fetch-constant address so it indexes the uploaded window. 0 means "no + // real vertex window" (procedural fallback path). + vertex_base_dwords: u32, }; struct XenosConstants { @@ -652,7 +657,15 @@ fn interpret_vertex_fetch(t: u32) { // dword 1 carries (endian[1:0], size[25:2]). let fc0 = xenos_consts.fetch[fetch_const * 2u + 0u]; let fc1 = xenos_consts.fetch[fetch_const * 2u + 1u]; - let base_dwords = (fc0 & 0xFFFFFFFCu) >> 2u; + // iterate-3O: the fetch constant holds an *absolute* guest dword address. + // The CPU uploaded a window of guest memory starting at + // `draw_ctx.vertex_base_dwords`, so rebase the absolute address into that + // window. When no real window was published (`vertex_base_dwords == 0`) + // keep the absolute value (the `addr < n` guards below then skip the read + // and the procedural fallback position is used). + let abs_base = (fc0 & 0xFFFFFFFCu) >> 2u; + let base_dwords = select(abs_base, abs_base - draw_ctx.vertex_base_dwords, + draw_ctx.vertex_base_dwords != 0u && abs_base >= draw_ctx.vertex_base_dwords); // GPUBUG-102: per-format endian byte-swap. Xbox 360 vertex data is // big-endian; the host is little-endian. Pre-fix every dword was // bitcast as-is — vertex positions were byte-reversed garbage. diff --git a/crates/xenia-gpu/src/translator.rs b/crates/xenia-gpu/src/translator.rs index afcbc20..843c219 100644 --- a/crates/xenia-gpu/src/translator.rs +++ b/crates/xenia-gpu/src/translator.rs @@ -94,7 +94,7 @@ struct XenosDrawConstants { draw_index: u32, vertex_count: u32, prim_kind: u32, - _pad: u32, + vertex_base_dwords: u32, }; struct XenosConstants { @@ -418,7 +418,9 @@ impl EmitCtx { "{{ let fc0 = xenos_consts.fetch[{fc0_idx}u]; \ let fc1 = xenos_consts.fetch[{fc1_idx}u]; \ let endian = fc1 & 0x3u; \ - let base = (fc0 & 0xFFFFFFFCu) >> 2u; \ + let abs_base = (fc0 & 0xFFFFFFFCu) >> 2u; \ + let base = select(abs_base, abs_base - draw_ctx.vertex_base_dwords, \ + draw_ctx.vertex_base_dwords != 0u && abs_base >= draw_ctx.vertex_base_dwords); \ let vidx = u32(r[{src_reg}u].x); \ let addr = base + vidx * 4u; \ let n = arrayLength(&vertex_buffer); \ diff --git a/crates/xenia-kernel/src/exports.rs b/crates/xenia-kernel/src/exports.rs index 344fa1b..f0355b0 100644 --- a/crates/xenia-kernel/src/exports.rs +++ b/crates/xenia-kernel/src/exports.rs @@ -3231,6 +3231,16 @@ fn vd_swap(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) { metrics::gauge!("gpu.texture_cache.entries") .set(gpu_inline.texture_cache.len() as f64); ui.publish_texture(published); + + // iterate-3O: publish this frame's captured per-draw geometry and + // reset the accumulator for the next frame. The UI replays these as + // real guest draws (real vertices + prim type) instead of synthetic + // placeholder shapes. `frame_captures` is `Some` only under `--ui`. + if let Some(caps) = gpu_inline.frame_captures.as_mut() { + let drained = std::mem::take(caps); + metrics::counter!("gpu.geometry.published").increment(drained.len() as u64); + ui.publish_geometry(drained); + } } // Notify the UI. if let Some(ui) = state.ui.clone() { diff --git a/crates/xenia-kernel/src/ui_bridge.rs b/crates/xenia-kernel/src/ui_bridge.rs index a4b2289..59798b3 100644 --- a/crates/xenia-kernel/src/ui_bridge.rs +++ b/crates/xenia-kernel/src/ui_bridge.rs @@ -14,6 +14,7 @@ use std::collections::HashMap; use std::sync::Arc; use std::sync::atomic::{AtomicBool, AtomicU64}; +use xenia_gpu::draw_capture::DrawCapture; use xenia_gpu::texture_cache::TextureKey; use xenia_gpu::xenos_constants::XenosConstantsBlock; use xenia_hid::GamepadState; @@ -133,6 +134,14 @@ pub struct UiBridge { /// reverts to its magenta stub. pub publish_texture: Arc)>) + Send + Sync>, + /// iterate-3O real-render slice: at each `VdSwap`, the kernel hands the + /// UI the per-draw geometry captured this frame (one [`DrawCapture`] per + /// `PM4_DRAW_INDX*`), including the real guest vertex window. The UI + /// replays them through the Xenos wgpu pipeline so the splash renders its + /// actual geometry instead of synthetic placeholder shapes. Empty in the + /// degenerate case (no draws or capture disabled). + pub publish_geometry: + Arc) + Send + Sync>, } impl UiBridge { @@ -182,4 +191,9 @@ impl UiBridge { pub fn publish_texture(&self, tex: Option<(TextureKey, Vec)>) { (self.publish_texture)(tex); } + + /// Hand this frame's captured per-draw geometry to the UI. + pub fn publish_geometry(&self, caps: Vec) { + (self.publish_geometry)(caps); + } } diff --git a/crates/xenia-ui/src/app.rs b/crates/xenia-ui/src/app.rs index e36e13f..a80b89b 100644 --- a/crates/xenia-ui/src/app.rs +++ b/crates/xenia-ui/src/app.rs @@ -181,10 +181,11 @@ impl App { y += line_h; let (fbw, fbh) = rs.frontbuffer_size(); let render_line = format!( - "Render: xdispatch: xlated={:>5} interp={:>5} xlated-pipelines={:>3} tex-cache={:>3} fb={}x{}", + "Render: xdispatch: xlated={:>5} interp={:>5} xlated-pipelines={:>3} real-geo={:>5} tex-cache={:>3} fb={}x{}", rs.xenos_dispatches_translator, rs.xenos_dispatches_interpreter, rs.translated_pipeline_count(), + rs.real_geometry_draws(), rs.host_texture_count(), fbw, fbh, @@ -372,49 +373,6 @@ impl ApplicationHandler for App { self.last_xenos_swap_frame = frame_idx; } let delta = (draws_total - already) as u32; - let (verts_hint, prim_kind, vs_key, ps_key) = self - .last_swap_info - .map(|s| { - ( - s.last_draw_vertex_count.max(3), - s.last_draw_prim, - s.vs_blob_key, - s.ps_blob_key, - ) - }) - .unwrap_or((3, 4, 0, 0)); - // Look up blobs + constants from the bridge and - // pack into the WGSL-interpreter layout. Empty - // slices produce zero-clause packed buffers — the - // WGSL walker short-circuits and the placeholder - // export path still renders. - let raw_vs: Vec = self - .handles - .shader_blobs - .lock() - .ok() - .and_then(|g| g.get(&vs_key).cloned()) - .unwrap_or_default(); - let raw_ps: Vec = self - .handles - .shader_blobs - .lock() - .ok() - .and_then(|g| g.get(&ps_key).cloned()) - .unwrap_or_default(); - let parsed_vs = xenia_gpu::ucode::parse_shader(&raw_vs); - let parsed_ps = xenia_gpu::ucode::parse_shader(&raw_ps); - // First time we see a blob key, run the static - // metrics analyzer. Keyed on (stage_tag, blob_key) - // because the guest can reuse a key across stages. - if self.seen_shader_blobs.insert((0u8, vs_key)) { - xenia_gpu::shader_metrics::emit_for(&parsed_vs, "vs"); - } - if self.seen_shader_blobs.insert((1u8, ps_key)) { - xenia_gpu::shader_metrics::emit_for(&parsed_ps, "ps"); - } - let vs_packed = xenia_gpu::ucode::pack_for_wgsl(&parsed_vs); - let ps_packed = xenia_gpu::ucode::pack_for_wgsl(&parsed_ps); let constants = self .handles .xenos_constants @@ -431,19 +389,72 @@ impl ApplicationHandler for App { .ok() .and_then(|g| g.clone()); rs.bind_primary_texture(tex_payload); - rs.dispatch_xenos_draws( - already, - delta, - verts_hint, - prim_kind, - vs_key, - ps_key, - &parsed_vs, - &parsed_ps, - &vs_packed, - &ps_packed, - &constants, - ); + + // iterate-3O real-render slice: prefer replaying the + // *real* captured guest geometry. The kernel publishes + // one `DrawCapture` per `PM4_DRAW_INDX*` this frame + // (real vertices + prim type + shader keys). Fall back + // to the legacy synthetic dispatch only when no capture + // is available (e.g. capture disabled), so we never + // regress to a blank screen. + let captures: Vec = self + .handles + .geometry + .lock() + .map(|g| g.clone()) + .unwrap_or_default(); + let blobs: std::collections::HashMap> = self + .handles + .shader_blobs + .lock() + .map(|g| g.clone()) + .unwrap_or_default(); + if !captures.is_empty() { + rs.dispatch_xenos_captures( + &captures, + &blobs, + &constants, + &mut self.seen_shader_blobs, + ); + } else { + // Legacy synthetic-geometry fallback (placeholder). + let (verts_hint, prim_kind, vs_key, ps_key) = self + .last_swap_info + .map(|s| { + ( + s.last_draw_vertex_count.max(3), + s.last_draw_prim, + s.vs_blob_key, + s.ps_blob_key, + ) + }) + .unwrap_or((3, 4, 0, 0)); + let raw_vs = blobs.get(&vs_key).cloned().unwrap_or_default(); + let raw_ps = blobs.get(&ps_key).cloned().unwrap_or_default(); + let parsed_vs = xenia_gpu::ucode::parse_shader(&raw_vs); + let parsed_ps = xenia_gpu::ucode::parse_shader(&raw_ps); + if self.seen_shader_blobs.insert((0u8, vs_key)) { + xenia_gpu::shader_metrics::emit_for(&parsed_vs, "vs"); + } + if self.seen_shader_blobs.insert((1u8, ps_key)) { + xenia_gpu::shader_metrics::emit_for(&parsed_ps, "ps"); + } + let vs_packed = xenia_gpu::ucode::pack_for_wgsl(&parsed_vs); + let ps_packed = xenia_gpu::ucode::pack_for_wgsl(&parsed_ps); + rs.dispatch_xenos_draws( + already, + delta, + verts_hint, + prim_kind, + vs_key, + ps_key, + &parsed_vs, + &parsed_ps, + &vs_packed, + &ps_packed, + &constants, + ); + } } } else { Self::ingest_frontbuffer( diff --git a/crates/xenia-ui/src/bridge.rs b/crates/xenia-ui/src/bridge.rs index 6bf3d1e..e27d47d 100644 --- a/crates/xenia-ui/src/bridge.rs +++ b/crates/xenia-ui/src/bridge.rs @@ -18,6 +18,7 @@ use std::sync::Mutex; use crossbeam_utils::atomic::AtomicCell; use winit::event_loop::EventLoopProxy; +use xenia_gpu::draw_capture::DrawCapture; use xenia_gpu::texture_cache::TextureKey; use xenia_gpu::xenos_constants::XenosConstantsBlock; use xenia_hid::GamepadState; @@ -66,6 +67,10 @@ pub struct UiHandles { /// fetch-constant slot 0 into linear bytes that the UI should /// upload into the host cache and bind at `@group(1) @binding(0)`. pub primary_texture: Arc)>>>, + /// iterate-3O: the most recent frame's captured per-draw geometry. The + /// redraw path drains this to replay real guest draws. Replaced wholesale + /// each `VdSwap`. + pub geometry: Arc>>, } /// Swap event posted by the CPU-side `VdSwap` handler via @@ -89,6 +94,7 @@ pub fn build(proxy: EventLoopProxy) -> (UiHandles, UiBridge) { let xenos_constants = Arc::new(Mutex::new(XenosConstantsBlock::default())); let primary_texture: Arc)>>> = Arc::new(Mutex::new(None)); + let geometry: Arc>> = Arc::new(Mutex::new(Vec::new())); let kernel_bridge = UiBridge { gamepad: { @@ -144,6 +150,14 @@ pub fn build(proxy: EventLoopProxy) -> (UiHandles, UiBridge) { } }) }, + publish_geometry: { + let geo = Arc::clone(&geometry); + Arc::new(move |caps| { + if let Ok(mut lock) = geo.lock() { + *lock = caps; + } + }) + }, }; let handles = UiHandles { @@ -155,6 +169,7 @@ pub fn build(proxy: EventLoopProxy) -> (UiHandles, UiBridge) { shader_blobs, xenos_constants, primary_texture, + geometry, }; (handles, kernel_bridge) } diff --git a/crates/xenia-ui/src/render.rs b/crates/xenia-ui/src/render.rs index ea19a49..979306f 100644 --- a/crates/xenia-ui/src/render.rs +++ b/crates/xenia-ui/src/render.rs @@ -84,6 +84,9 @@ pub struct RenderState { /// the shader, or (c) we're running the slow interpreter path. pub xenos_dispatches_translator: u64, pub xenos_dispatches_interpreter: u64, + /// iterate-3O: running total of replayed draws that carried a real guest + /// vertex window (vs. the procedural fallback). Surfaced on the HUD. + real_geometry_draws: u64, /// One-shot latch so we emit a tracing::info! on the **first** real /// draw dispatch rather than spamming every frame. Pairs with the /// "first translator compile" latch below. @@ -447,6 +450,7 @@ impl RenderState { fallback_rgb: [0.06, 0.06, 0.09], xenos_pipeline, xenos_draws_rendered: 0, + real_geometry_draws: 0, xenos_dispatches_translator: 0, xenos_dispatches_interpreter: 0, first_dispatch_logged: false, @@ -657,6 +661,8 @@ impl RenderState { draw_index: idx, vertex_count: vertex_count_hint.max(3), prim_kind, + // Synthetic fallback path: no real vertex window. + vertex_base_dwords: 0, }; if use_translated && let Some(p) = self.xenos_pipeline.translated_pipeline(vs_key, ps_key) { @@ -707,12 +713,135 @@ impl RenderState { } } + /// iterate-3O real-render slice: replay a batch of *real* captured guest + /// draws. Unlike [`dispatch_xenos_draws`] (synthetic placeholder geometry), + /// each [`DrawCapture`] carries the actual guest vertex window, primitive + /// type, host vertex count, and the real (vs, ps) keys. Per capture we: + /// 1. upload the captured guest vertex bytes into `vertex_buffer` (b4), + /// 2. upload the matching VS/PS microcode + per-frame constants, + /// 3. render through the translated (P7) pipeline if it compiled, else + /// the interpreter — with `vertex_base_dwords` set so the shader + /// rebases its absolute fetch address into the uploaded window. + /// + /// Returns the number of captures that had a real vertex window (vs. the + /// procedural fallback), for HUD reporting. `shader_blobs` / `constants` + /// come from the bridge; `seen` records which blobs have had static + /// metrics emitted (one-shot per blob, matching the legacy path). + pub fn dispatch_xenos_captures( + &mut self, + captures: &[xenia_gpu::draw_capture::DrawCapture], + shader_blobs: &std::collections::HashMap>, + constants: &xenia_gpu::xenos_constants::XenosConstantsBlock, + seen: &mut std::collections::HashSet<(u8, u32)>, + ) -> u32 { + if captures.is_empty() { + return 0; + } + let mut real_count = 0u32; + let mut encoder = self + .device + .create_command_encoder(&wgpu::CommandEncoderDescriptor { + label: Some("xenos capture replay"), + }); + for cap in captures { + let raw_vs = shader_blobs.get(&cap.vs_key).cloned().unwrap_or_default(); + let raw_ps = shader_blobs.get(&cap.ps_key).cloned().unwrap_or_default(); + let parsed_vs = xenia_gpu::ucode::parse_shader(&raw_vs); + let parsed_ps = xenia_gpu::ucode::parse_shader(&raw_ps); + if seen.insert((0u8, cap.vs_key)) { + xenia_gpu::shader_metrics::emit_for(&parsed_vs, "vs"); + } + if seen.insert((1u8, cap.ps_key)) { + xenia_gpu::shader_metrics::emit_for(&parsed_ps, "ps"); + } + let vs_packed = xenia_gpu::ucode::pack_for_wgsl(&parsed_vs); + let ps_packed = xenia_gpu::ucode::pack_for_wgsl(&parsed_ps); + // Upload this draw's shader + constants + real vertex window. + self.xenos_pipeline.upload_shader_and_constants( + &self.queue, + &vs_packed, + &ps_packed, + constants, + ); + if cap.has_real_vertices && !cap.vertex_dwords.is_empty() { + self.xenos_pipeline + .upload_vertex_data(&self.queue, &cap.vertex_dwords); + real_count += 1; + } + let use_translated = cap.vs_key != 0 + && cap.ps_key != 0 + && ensure_translated_pipeline( + &mut self.xenos_pipeline, + &self.device, + cap.vs_key, + cap.ps_key, + &parsed_vs, + &parsed_ps, + ); + let base = if cap.has_real_vertices { + cap.window_base_dwords + } else { + 0 + }; + let req = DrawRequest { + draw_index: cap.draw_index, + vertex_count: cap.host_vertex_count.max(3), + prim_kind: cap.prim_code, + vertex_base_dwords: base, + }; + if use_translated + && let Some(p) = self.xenos_pipeline.translated_pipeline(cap.vs_key, cap.ps_key) + { + self.xenos_pipeline.render_one_with_pipeline( + &self.queue, + &mut encoder, + &self.frontbuffer_view, + req, + p, + ); + self.xenos_dispatches_translator = + self.xenos_dispatches_translator.saturating_add(1); + } else { + self.xenos_pipeline.render_one( + &self.queue, + &mut encoder, + &self.frontbuffer_view, + req, + ); + self.xenos_dispatches_interpreter = + self.xenos_dispatches_interpreter.saturating_add(1); + } + } + self.queue.submit(std::iter::once(encoder.finish())); + self.xenos_draws_rendered = self + .xenos_draws_rendered + .saturating_add(captures.len() as u64); + self.real_geometry_draws = self + .real_geometry_draws + .saturating_add(real_count as u64); + if !self.first_dispatch_logged { + self.first_dispatch_logged = true; + tracing::info!( + captures = captures.len(), + real_vertex_draws = real_count, + "first Xenos capture batch replayed (real geometry)" + ); + } + real_count + } + /// Count of distinct translator pipelines compiled so far. Surfaced /// on the HUD as `xlated=N` to make "is P7 working?" observable. pub fn translated_pipeline_count(&self) -> usize { self.xenos_pipeline.translated_pipeline_count() } + /// Running count of captured draws that carried a real vertex window + /// (surfaced on the HUD). Updated by [`dispatch_xenos_captures`]. + pub fn real_geometry_draws(&self) -> u64 { + self.real_geometry_draws + } + /// Clear the frontbuffer to `[r,g,b,a]` in linear space. Matches the /// fallback clear the outer swapchain render does so the two stages /// agree on "no draws yet = dark navy". diff --git a/crates/xenia-ui/src/xenos_pipeline.rs b/crates/xenia-ui/src/xenos_pipeline.rs index b9fb647..d7bb247 100644 --- a/crates/xenia-ui/src/xenos_pipeline.rs +++ b/crates/xenia-ui/src/xenos_pipeline.rs @@ -36,7 +36,9 @@ struct DrawConstants { draw_index: u32, vertex_count: u32, prim_kind: u32, - _pad: u32, + /// iterate-3O: guest dword base of the uploaded `vertex_buffer` window. + /// The WGSL subtracts this from the absolute vertex-fetch address. + vertex_base_dwords: u32, } /// Submitted to [`XenosPipeline::render_one`] to render one captured draw. @@ -48,6 +50,9 @@ pub struct DrawRequest { pub vertex_count: u32, /// Xenos primitive-type code; shader may branch on it in P3b+. pub prim_kind: u32, + /// iterate-3O: guest dword base of the per-draw vertex window uploaded to + /// `vertex_buffer` (b4). 0 = no real vertex window (procedural fallback). + pub vertex_base_dwords: u32, } /// Reasonable upper bound on a single shader blob (dwords). Most Xbox 360 @@ -193,7 +198,7 @@ impl XenosPipeline { draw_index: 0, vertex_count: 3, prim_kind: 4, - _pad: 0, + vertex_base_dwords: 0, }; let draw_ctx_buffer = device.create_buffer_init(&wgpu::util::BufferInitDescriptor { label: Some("xenos draw ctx"), @@ -480,7 +485,7 @@ impl XenosPipeline { draw_index: req.draw_index, vertex_count: req.vertex_count.max(3), prim_kind: req.prim_kind, - _pad: 0, + vertex_base_dwords: req.vertex_base_dwords, }; queue.write_buffer(&self.draw_ctx_buffer, 0, bytemuck::bytes_of(&cb)); @@ -606,7 +611,7 @@ impl XenosPipeline { draw_index: req.draw_index, vertex_count: req.vertex_count.max(3), prim_kind: req.prim_kind, - _pad: 0, + vertex_base_dwords: req.vertex_base_dwords, }; queue.write_buffer(&self.draw_ctx_buffer, 0, bytemuck::bytes_of(&cb));