NOTE(review): this patch was re-flowed from a whitespace-collapsed copy.
Indentation is reconstructed from rustfmt conventions. Generic arguments the
copy had stripped are restored as `Vec<u8>` (decoded texture bytes and
fetch-constant slot indices); the slot element type is an assumption — confirm
it against the type of `TextureFetch.fetch_const` before applying.

diff --git a/crates/xenia-gpu/src/gpu_system.rs b/crates/xenia-gpu/src/gpu_system.rs
index e3b77e7..28a92e5 100644
--- a/crates/xenia-gpu/src/gpu_system.rs
+++ b/crates/xenia-gpu/src/gpu_system.rs
@@ -78,6 +78,30 @@ pub fn physical_to_backing(addr: u32) -> u32 {
     }
 }
 
+/// Max guest page-version over the `[base, base+len)` span, walking 4 KiB
+/// pages via the `MemoryAccess` trait's `page_version`.
+///
+/// The concrete heap exposes an inherent `max_page_version(base, len)`, but
+/// the draw handler only holds `&dyn MemoryAccess` (which carries the coarser
+/// `page_version(addr)` accessor). This is byte-equivalent to
+/// `heap::max_page_version` and stays a pure function of the per-page write
+/// counters (no wall-clock), so texture-decode timing remains deterministic.
+fn span_max_version(mem: &dyn MemoryAccess, base: u32, len: u32) -> u64 {
+    const PAGE: u32 = 0x1000;
+    let last = base.saturating_add(len.saturating_sub(1));
+    let mut page = base & !(PAGE - 1);
+    let last_page = last & !(PAGE - 1);
+    let mut max = 0u64;
+    loop {
+        max = max.max(mem.page_version(page));
+        if page >= last_page {
+            break;
+        }
+        page = page.wrapping_add(PAGE);
+    }
+    max
+}
+
 /// Cached Xenos microcode blob, produced by `PM4_IM_LOAD*` packets.
 #[derive(Debug, Clone)]
 pub struct ShaderBlob {
@@ -400,6 +424,12 @@ pub struct GpuSystem {
     /// on every texture-fetch resolution; the UI thread sees the decoded
     /// bytes via `UiBridge::publish_texture`.
     pub texture_cache: crate::texture_cache::TextureCache,
+    /// P5b: textures decoded at the most recent `PM4_DRAW_INDX*`, keyed off
+    /// the *active* pixel shader's real `tfetch` fetch-constant slots (not a
+    /// hardcoded slot). `vd_swap` publishes the first of these to the UI so
+    /// the replay binds the texture the draw actually samples. Cleared and
+    /// repopulated each draw; empty when the active PS issues no `tfetch`.
+    pub last_draw_textures: Vec<(crate::texture_cache::TextureKey, Vec<u8>)>,
     /// 10 MiB shadow of the Xenos EDRAM. Written by clear-resolves and
     /// (future) host-render-target readback; read by the resolve byte-copy
     /// path that writes tiled pixels into guest memory. Allocated once at
@@ -431,6 +461,7 @@ impl GpuSystem {
             rt_cache: crate::render_target_cache::RenderTargetCache::new(),
             last_resolve: None,
             texture_cache: crate::texture_cache::TextureCache::new(),
+            last_draw_textures: Vec::new(),
             edram: crate::edram::ShadowEdram::new(),
         }
     }
@@ -1265,6 +1296,60 @@ impl GpuSystem {
                 );
                 self.last_draw = Some(ds);
                 self.last_primitive = Some(processed);
+
+                // P5b: decode the textures the *active pixel shader* actually
+                // samples. Parse the bound PS, collect its `tfetch`
+                // fetch-constant slots, read each 6-dword fetch constant from
+                // the register file, and decode+cache it. `vd_swap` publishes
+                // the result. Empty for flat (no-tfetch) shaders — the
+                // dominant case on Sylpheed's current splash, where this stays
+                // inert until the textured logo draw is reached.
+                self.last_draw_textures.clear();
+                if let Some(ps_key) = self.active_ps_key {
+                    // Collect slots under an immutable borrow of `shader_blobs`,
+                    // then drop it before mutating `texture_cache`.
+                    let slots: Vec<u8> = match self.shader_blobs.get(&ps_key) {
+                        Some(blob) => {
+                            let parsed = crate::ucode::parse_shader(&blob.dwords);
+                            crate::shader_metrics::tfetch_slots(&parsed)
+                        }
+                        None => Vec::new(),
+                    };
+                    for slot in slots {
+                        let mut fetch6 = [0u32; 6];
+                        for (k, w) in fetch6.iter_mut().enumerate() {
+                            *w = self
+                                .register_file
+                                .read(CONST_BASE_FETCH + slot as u32 * 6 + k as u32);
+                        }
+                        let Some(key) = crate::texture_cache::decode_fetch_constant(fetch6) else {
+                            continue;
+                        };
+                        let bi = key.format.block_info();
+                        let span_bytes = (key.pitch_texels as u32)
+                            * (key.height as u32)
+                            * (bi.bytes_per_block as u32)
+                            / (bi.block_w as u32);
+                        let version = span_max_version(mem, key.base_address, span_bytes.max(4));
+                        match self.texture_cache.ensure_cached(key, version, mem) {
+                            Ok(entry) => {
+                                self.last_draw_textures.push((entry.key, entry.bytes.clone()));
+                                metrics::counter!(
+                                    "gpu.texture.decode",
+                                    "fmt" => format!("{:?}", key.format),
+                                )
+                                .increment(1);
+                            }
+                            Err(e) => {
+                                metrics::counter!(
+                                    "gpu.texture.reject",
+                                    "reason" => format!("{e:?}"),
+                                )
+                                .increment(1);
+                            }
+                        }
+                    }
+                }
             }
             pm4::PM4_SET_CONSTANT | pm4::PM4_SET_SHADER_CONSTANTS => {
                 // payload[0] = offset_type — bits[10:0] index, bits[23:16] type
diff --git a/crates/xenia-gpu/src/primitive.rs b/crates/xenia-gpu/src/primitive.rs
index 6bbafed..caffb99 100644
--- a/crates/xenia-gpu/src/primitive.rs
+++ b/crates/xenia-gpu/src/primitive.rs
@@ -5,9 +5,8 @@
 //! rectangles) we rewrite indices on the CPU side so the host just sees a
 //! triangle list. Ground truth: `xenia-canary/src/xenia/gpu/primitive_processor.h/cc`.
 //!
-//! P3 scope: only the shapes Sylpheed's UI + early gameplay paths need
-//! (list, strip, fan). Rectangle + quad expansions are stubs logged via
-//! `tracing::warn!` for later.
+//! Scope: list, strip, fan, quad, and rectangle expansions are all handled
+//! (rectangles via CPU triangle-list rewrite — see `expand_rectangles`).
 
 use crate::draw_state::{IndexSize, PrimitiveType};
 
@@ -138,18 +137,43 @@ fn expand_quads(indices: Option<&[u32]>, vertex_count: u32) -> ProcessedPrimitiv
 }
 
 /// Rectangle lists: a Xenos-specific primitive where each group of 3
-/// vertices defines a right-angle rectangle by its three non-repeated
-/// corners (the 4th is derived). The uber-shader doesn't support this yet;
-/// the ucode translator will emulate it as a geometry-stage fake. For P3
-/// we emit an empty draw.
-fn expand_rectangles(_indices: Option<&[u32]>, _vertex_count: u32) -> ProcessedPrimitive {
-    tracing::warn!("gpu: rectangle list primitive not yet implemented (P3 stub)");
-    metrics::counter!("gpu.primitive.rejected", "reason" => "rectangle_list").increment(1);
+/// vertices defines a rectangle; the 4th corner is extrapolated as
+/// `v3 = v0 + v2 - v1` (parallelogram completion). Canary expands this in a
+/// host vertex-shader variant (`kRectangleListAsTriangleStrip`,
+/// `primitive_processor.cc:389-456`): a 4-vertex triangle strip per rect with
+/// the 4th corner synthesized *in the VS* from the host-vertex index.
+///
+/// Our replay pipeline has no host-VS corner synthesis (and the procedural
+/// `vs_main` does not consume `rewritten_indices` yet), so we mirror the
+/// `expand_quads`/`expand_fan` CPU idiom and emit the 3 real vertices of each
+/// rect as one triangle list `(v0,v1,v2)` — the visible lower half of the
+/// rect. This un-rejects the draw and gives a faithful `host_vertex_count`.
+///
+/// TODO: once `vs_main` does real vertex fetch + interpolation, upgrade to the
+/// full quad — 6 indices `[v0,v1,v2, v2,v1,v3]` with a synthesized `v3` corner
+/// — mirroring canary's `kRectangleListAsTriangleStrip`.
+fn expand_rectangles(indices: Option<&[u32]>, vertex_count: u32) -> ProcessedPrimitive {
+    let rect_count = vertex_count / 3;
+    let mut out = Vec::with_capacity(3 * rect_count as usize);
+    let get = |i: u32| -> u32 {
+        match indices {
+            Some(buf) => buf[i as usize],
+            None => i,
+        }
+    };
+    for r in 0..rect_count {
+        let base = r * 3;
+        out.push(get(base));
+        out.push(get(base + 1));
+        out.push(get(base + 2));
+    }
+    let host_vertex_count = out.len() as u32;
+    metrics::counter!("gpu.primitive.expanded", "shape" => "rectangle_list").increment(1);
     ProcessedPrimitive {
         topology: HostTopology::TriangleList,
-        rewritten_indices: Some(Vec::new()),
-        host_vertex_count: 0,
-        rejected: true,
+        rewritten_indices: Some(out),
+        host_vertex_count,
+        rejected: false,
     }
 }
 
@@ -213,6 +237,17 @@ mod tests {
         assert_eq!(idx, vec![0, 1, 2, 0, 2, 3, 4, 5, 6, 4, 6, 7]);
     }
 
+    #[test]
+    fn rectangle_list_expansion() {
+        // 2 rects (6 verts) → one triangle (v0,v1,v2) per rect, not rejected.
+        let p = process(PrimitiveType::RectangleList, 6, None);
+        let idx = p.rewritten_indices.unwrap();
+        assert_eq!(idx, vec![0, 1, 2, 3, 4, 5]);
+        assert_eq!(p.topology, HostTopology::TriangleList);
+        assert_eq!(p.host_vertex_count, 6);
+        assert!(!p.rejected);
+    }
+
     #[test]
     fn widen_u16_indices_big_endian() {
         // 3 indices [1, 2, 0x1234] in BE u16.
diff --git a/crates/xenia-gpu/src/shader_metrics.rs b/crates/xenia-gpu/src/shader_metrics.rs
index 7aabe5e..c5d866b 100644
--- a/crates/xenia-gpu/src/shader_metrics.rs
+++ b/crates/xenia-gpu/src/shader_metrics.rs
@@ -174,6 +174,49 @@ pub fn emit_for(parsed: &ParsedShader, stage: &'static str) {
     }
 }
 
+/// Collect the unique texture-fetch-constant slot indices a shader samples.
+///
+/// Walks the same exec-clause / sequence-bitmap path as [`emit_for`] but only
+/// extracts `TextureFetch.fetch_const` slots, deduplicated and in first-seen
+/// order. The GPU draw handler uses this to decide which fetch constants to
+/// decode + cache at draw time (keyed off the *active* pixel shader's real
+/// `tfetch` instructions rather than a hardcoded slot).
+pub fn tfetch_slots(parsed: &ParsedShader) -> Vec<u8> {
+    let mut slots: Vec<u8> = Vec::new();
+    for clause in &parsed.cf {
+        if let ControlFlowInstruction::Exec {
+            address,
+            count,
+            sequence,
+            ..
+        } = clause
+        {
+            for i in 0..(*count as usize) {
+                let base = (*address as usize + i) * 3;
+                if base + 2 >= parsed.instructions.len() {
+                    break;
+                }
+                // sequence bit layout: 2 bits per triple, hi bit = is-fetch.
+                let is_fetch = ((sequence >> (i * 2 + 1)) & 1) != 0;
+                if !is_fetch {
+                    continue;
+                }
+                let words = [
+                    parsed.instructions[base],
+                    parsed.instructions[base + 1],
+                    parsed.instructions[base + 2],
+                ];
+                if let FetchInstruction::Texture(tf) = decode_fetch(words) {
+                    if !slots.contains(&tf.fetch_const) {
+                        slots.push(tf.fetch_const);
+                    }
+                }
+            }
+        }
+    }
+    slots
+}
+
 fn mark_feature(buf: &mut Vec<&'static str>, name: &'static str) {
     if !buf.contains(&name) {
         buf.push(name);
@@ -298,6 +341,46 @@ mod tests {
         emit_for(&shader, "vs");
     }
 
+    /// `tfetch_slots` should extract the fetch-constant slot of a texture
+    /// fetch (and dedup), and return empty for a flat ALU-only shader.
+    #[test]
+    fn tfetch_slots_extracts_texture_fetch_constants() {
+        // word0: opcode TEXTURE_FETCH (0x01) in low 5 bits, fetch_const=3 in
+        // bits[9:5] → 0x01 | (3 << 5) = 0x61.
+        let tfetch_w0: u32 = 0x01 | (3u32 << 5);
+        let shader = ParsedShader {
+            cf: vec![
+                ControlFlowInstruction::Exec {
+                    address: 0,
+                    count: 2,
+                    // triple 0 is a fetch (hi bit of its 2-bit field set),
+                    // triple 1 is ALU. is_fetch = (sequence >> (i*2+1)) & 1.
+                    sequence: 0b00_10,
+                    is_end: false,
+                    predicated: false,
+                    predicate_condition: false,
+                },
+                ControlFlowInstruction::Exit,
+            ],
+            instructions: vec![tfetch_w0, 0, 0, /* ALU triple */ 0, 0, 0],
+        };
+        assert_eq!(tfetch_slots(&shader), vec![3]);
+
+        // Flat shader: no fetch bits → no slots.
+        let flat = ParsedShader {
+            cf: vec![ControlFlowInstruction::Exec {
+                address: 0,
+                count: 1,
+                sequence: 0,
+                is_end: false,
+                predicated: false,
+                predicate_condition: false,
+            }],
+            instructions: vec![0, 0, 0],
+        };
+        assert!(tfetch_slots(&flat).is_empty());
+    }
+
     /// P8: a shader containing `LoopStart` should mark `cf_loop` as used
     /// so the HUD can surface which deferred feature a game triggers.
     #[test]
diff --git a/crates/xenia-kernel/src/exports.rs b/crates/xenia-kernel/src/exports.rs
index a211432..d0dcbd0 100644
--- a/crates/xenia-kernel/src/exports.rs
+++ b/crates/xenia-kernel/src/exports.rs
@@ -3116,27 +3116,27 @@ fn vd_swap(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
             );
             ui.publish_assets(blobs, constants);
 
-            // P5: try to decode the primary texture (fetch constant slot 0).
-            // Slot 0 is the convention most games use for their main bound
-            // texture at draw time; full N-slot binding waits for P6+. If the
-            // slot is unset or the format isn't supported (magenta stub kicks
-            // in host-side), we skip.
-            //
-            // Texture fetch constants live at `CONST_BASE_FETCH + slot*6` in
-            // the register file; we read the 6 dwords, decode the key, hit
-            // the CPU cache (with page-version freshness), and clone the
-            // decoded bytes across the bridge.
-            const TEX_SLOT: u32 = 0;
-            let mut fetch6 = [0u32; 6];
-            for (i, slot) in fetch6.iter_mut().enumerate() {
-                *slot = gpu_inline
-                    .register_file
-                    .read(xenia_gpu::gpu_system::CONST_BASE_FETCH + TEX_SLOT * 6 + i as u32);
-            }
-            let published = if let Some(key) = xenia_gpu::texture_cache::decode_fetch_constant(fetch6)
-            {
-                // Span over the entire tiled texture footprint to pick the
-                // max page version covering it.
+            // P5b: publish the texture the last draw's *active pixel shader*
+            // actually sampled. The GPU draw handler decodes the PS's real
+            // `tfetch` fetch-constant slots into `last_draw_textures`; we publish
+            // the first (the UI binds a single texture today). When the last draw
+            // used a flat (no-tfetch) shader the list is empty, so we fall back to
+            // the legacy slot-0 probe to preserve behavior on flat-only frames.
+            let published = gpu_inline.last_draw_textures.first().cloned().or_else(|| {
+                // Fallback: probe fetch constant slot 0 directly. Texture fetch
+                // constants live at `CONST_BASE_FETCH + slot*6` in the register
+                // file; read 6 dwords, decode the key, hit the CPU cache with
+                // page-version freshness, clone the bytes across the bridge.
+                const TEX_SLOT: u32 = 0;
+                let mut fetch6 = [0u32; 6];
+                for (i, slot) in fetch6.iter_mut().enumerate() {
+                    *slot = gpu_inline
+                        .register_file
+                        .read(xenia_gpu::gpu_system::CONST_BASE_FETCH + TEX_SLOT * 6 + i as u32);
+                }
+                let key = xenia_gpu::texture_cache::decode_fetch_constant(fetch6)?;
+                // Span over the entire tiled texture footprint to pick the max
+                // page version covering it.
                 let bi = key.format.block_info();
                 let span_bytes = (key.pitch_texels as u32)
                     * (key.height as u32)
@@ -3154,9 +3154,7 @@ fn vd_swap(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
                         None
                     }
                 }
-            } else {
-                None
-            };
+            });
             metrics::gauge!("gpu.texture_cache.entries")
                 .set(gpu_inline.texture_cache.len() as f64);
             ui.publish_texture(published);