xenia-gpu: end-to-end Xenos pipeline (PM4, ucode, EDRAM, resolve)

First real GPU implementation. Ring/PM4 frontend (ring_view, ring_drain, pm4) drains the command processor; gpu_system owns the threaded backend (DrainFence RPC + parker/fence helpers from M1) and the MMIO-mapped register block (mmio_region). Xenos shader frontend: ucode/{alu,control_flow,fetch,mod}.rs decode the Xbox 360 microcode, translator.rs lowers it onto the WGSL xenos_interp interpreter shader (shaders/xenos_interp.wgsl). shader_metrics.rs counts decode/translate work. Render state: draw_state, primitive, render_target_cache, texture_cache, tiled_address (Xenos's swizzled tiled-memory layout), xenos_constants (register field constants), edram (the 10 MiB EDRAM model with MSAA), and resolve.rs (TILE_FLUSH copy-out — clear-resolve plus bitwise-equivalent 32 bpp + 64 bpp paths landed). handle.rs owns the typed GPU-resource handles the kernel hands out. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-01 16:29:38 +02:00
parent 5f0d6487ea
commit 79eb52c378
24 changed files with 10984 additions and 18 deletions
--- a/crates/xenia-gpu/Cargo.toml
+++ b/crates/xenia-gpu/Cargo.toml
@@ -11,3 +11,11 @@ tracing = { workspace = true }
 thiserror = { workspace = true }
 anyhow = { workspace = true }
 byteorder = { workspace = true }
 metrics = { workspace = true }
 bytemuck = { workspace = true }
 crossbeam-channel = { workspace = true }
 [dev-dependencies]
 # Used to validate bundled WGSL placeholders compile cleanly. Matches the
 # wgpu-22 transitive dep so we don't pull in a second naga version.
 naga = { version = "22", features = ["wgsl-in"] }
--- a/crates/xenia-gpu/src/draw_state.rs
+++ b/crates/xenia-gpu/src/draw_state.rs
--- a/crates/xenia-gpu/src/edram.rs
+++ b/crates/xenia-gpu/src/edram.rs
@@ -0,0 +1,506 @@
 //! CPU-side shadow of the Xenos GPU's 10 MiB EDRAM.
 //!
 //! The real console has 10 MiB of embedded DRAM organised as 2048 tiles,
 //! each 80 × 16 samples at 32 bits per sample (`xenos.h:223-285`,
 //! `kEdramTileCount = 2048`). 64-bpp formats occupy two adjacent 32-bpp
 //! EDRAM tiles per logical tile.
 //!
 //! xenia-rs does not currently render through a real EDRAM (host draws go
 //! straight to wgpu attachments), but the resolve path still needs a
 //! concrete byte source. We keep a linear 10 MiB `Vec<u8>` here so:
 //!
 //! * clear-resolves can paint `RB_COLOR_CLEAR` / `RB_DEPTH_CLEAR` into the
 //!   source tiles, which the resolve loop then copies into guest memory
 //!   (this is the Sylpheed-first-pixels path);
 //! * future host→EDRAM readback code has a place to deposit pixels without
 //!   touching the resolve API.
 //!
 //! Byte layout inside one tile: row-major, `80 * 16 * bytes_per_sample`
 //! bytes. At 32bpp, offset `= y * 80 * 4 + x * 4` from the tile base.
 //! Samples are stored as little-endian `u32`s (`to_le_bytes` /
 //! `from_le_bytes`); any Xenon big-endian vs little-endian shuffling
 //! happens at the resolve write boundary, not inside EDRAM.
 //!
 //! Indexing wraps mod 2048 (`XE_GPU_REGISTER` `RB_COLOR_INFO.color_base` is
 //! 11-bit). Canary relies on this wraparound for tall surfaces that
 //! exceed the 10 MiB region.
 /// Number of tiles in EDRAM. `xenos::kEdramTileCount`.
 pub const EDRAM_TILE_COUNT: u32 = 2048;
 /// Samples per tile along X. `xenos::kEdramTileWidthSamples`.
 pub const EDRAM_TILE_WIDTH_SAMPLES: u32 = 80;
 /// Samples per tile along Y. `xenos::kEdramTileHeightSamples`.
 pub const EDRAM_TILE_HEIGHT_SAMPLES: u32 = 16;
 /// Bytes per tile at 32bpp: 80 × 16 × 4 = 5120.
 pub const EDRAM_TILE_BYTES_32BPP: u32 =
    EDRAM_TILE_WIDTH_SAMPLES * EDRAM_TILE_HEIGHT_SAMPLES * 4;
 /// Bytes per tile at 64bpp: 80 × 16 × 8 = 10_240 (two adjacent 32bpp tiles).
 pub const EDRAM_TILE_BYTES_64BPP: u32 = EDRAM_TILE_BYTES_32BPP * 2;
 /// Total EDRAM size in bytes: 2048 × 5120 = 10_485_760 (exactly 10 MiB).
 pub const EDRAM_SIZE_BYTES: usize = (EDRAM_TILE_COUNT * EDRAM_TILE_BYTES_32BPP) as usize;
 /// 10 MiB shadow of the console's EDRAM. Owned by `GpuSystem` and lives for
 /// the lifetime of the GPU; no per-frame allocation.
 pub struct ShadowEdram {
    bytes: Vec<u8>,
 }
 impl Default for ShadowEdram {
    fn default() -> Self {
        Self::new()
    }
 }
 impl ShadowEdram {
    /// Allocate the full `EDRAM_SIZE_BYTES` shadow buffer, zero-filled.
    pub fn new() -> Self {
        Self {
            bytes: vec![0u8; EDRAM_SIZE_BYTES],
        }
    }
    /// Raw byte offset of a tile within the shadow buffer, wrapped mod 2048.
    #[inline]
    fn tile_byte_offset(tile_index: u32) -> usize {
        ((tile_index % EDRAM_TILE_COUNT) * EDRAM_TILE_BYTES_32BPP) as usize
    }
    /// The whole shadow buffer as raw bytes, tile 0 first.
    pub fn as_bytes(&self) -> &[u8] {
        &self.bytes
    }
    /// The `EDRAM_TILE_BYTES_32BPP` bytes of one 32bpp tile. `tile_index`
    /// wraps mod 2048, so the slice is always in bounds.
    pub fn tile(&self, tile_index: u32) -> &[u8] {
        let off = Self::tile_byte_offset(tile_index);
        &self.bytes[off..off + EDRAM_TILE_BYTES_32BPP as usize]
    }
    /// Mutable counterpart of [`Self::tile`].
    pub fn tile_mut(&mut self, tile_index: u32) -> &mut [u8] {
        let off = Self::tile_byte_offset(tile_index);
        &mut self.bytes[off..off + EDRAM_TILE_BYTES_32BPP as usize]
    }
    /// Sample-space byte offset within the shadow buffer for one 32bpp
    /// sample at `(x, y)` in a surface whose EDRAM origin is `base_tiles`
    /// and whose row pitch is `pitch_tiles` 32bpp tiles. Returns `None` for
    /// a zero pitch.
    ///
    /// Tile layout: a surface of pitch `P` tiles is laid out as a row of
    /// `P` tiles followed by the next 16-sample-tall row, etc. Sample
    /// `(x, y)` lives in tile `(y/16)*P + (x/80)`, at row `y % 16` and
    /// column `x % 80` within that tile.
    #[inline]
    fn sample_offset_32bpp(base_tiles: u16, pitch_tiles: u32, x: u32, y: u32) -> Option<usize> {
        if pitch_tiles == 0 {
            return None;
        }
        let tile_row = y / EDRAM_TILE_HEIGHT_SAMPLES;
        let tile_col = x / EDRAM_TILE_WIDTH_SAMPLES;
        let within_y = y % EDRAM_TILE_HEIGHT_SAMPLES;
        let within_x = x % EDRAM_TILE_WIDTH_SAMPLES;
        // Every step wraps. The index is reduced mod 2048 afterwards and
        // 2^32 is a multiple of 2048, so wrapping u32 arithmetic selects the
        // same tile as exact arithmetic would — and large register values
        // can no longer trip the overflow check in debug builds.
        let tile_index = tile_row
            .wrapping_mul(pitch_tiles)
            .wrapping_add(tile_col)
            .wrapping_add(u32::from(base_tiles));
        let off = Self::tile_byte_offset(tile_index)
            + (within_y * EDRAM_TILE_WIDTH_SAMPLES * 4 + within_x * 4) as usize;
        Some(off)
    }
    /// Map 64bpp addressing onto the 32bpp helpers: returns
    /// `(base, pitch, x_lo, x_hi)` with base and pitch doubled and the two
    /// 32bpp columns that hold the sample's low and high words. `None` when
    /// the doubled column does not fit in a `u32`.
    #[inline]
    fn doubled_addressing_64bpp(
        base_tiles: u16,
        pitch_tiles_32bpp: u32,
        x: u32,
    ) -> Option<(u16, u32, u32, u32)> {
        let x_lo = x.checked_mul(2)?;
        // `x_lo` is even, so setting bit 0 is `+ 1` without any overflow.
        let x_hi = x_lo | 1;
        Some((
            // Same value as `(base as u32 * 2) as u16`.
            base_tiles.wrapping_mul(2),
            pitch_tiles_32bpp.saturating_mul(2),
            x_lo,
            x_hi,
        ))
    }
    /// Fill a `(w × h)`-sample rectangle at `(x, y)` with a constant 32bpp
    /// pattern. Coordinates are in *sample space* (already scaled through
    /// `sample_count_log2_x/y` for MSAA). Wraps mod 2048 tiles via
    /// `tile_byte_offset`. Empty rectangles and a zero pitch are no-ops;
    /// samples whose coordinate would overflow `u32` are skipped.
    ///
    /// The pattern is written as little-endian bytes — the endian swap in
    /// [`crate::resolve::apply_endian_128`] converts to the byte order
    /// expected by the destination.
    #[allow(clippy::too_many_arguments)]
    pub fn fill_rect_32bpp(
        &mut self,
        base_tiles: u16,
        pitch_tiles: u32,
        x: u32,
        y: u32,
        w: u32,
        h: u32,
        pattern: u32,
    ) {
        for dy in 0..h {
            // Once a coordinate overflows, every later one does too.
            let Some(sy) = y.checked_add(dy) else {
                break;
            };
            for dx in 0..w {
                let Some(sx) = x.checked_add(dx) else {
                    break;
                };
                self.write_sample_32bpp(base_tiles, pitch_tiles, sx, sy, pattern);
            }
        }
    }
    /// Read one 32bpp sample at `(x, y)` in sample coordinates. Returns 0
    /// if the surface pitch is zero (degenerate; caller should skip the
    /// resolve).
    pub fn read_sample_32bpp(
        &self,
        base_tiles: u16,
        pitch_tiles: u32,
        x: u32,
        y: u32,
    ) -> u32 {
        Self::sample_offset_32bpp(base_tiles, pitch_tiles, x, y)
            .and_then(|off| self.bytes.get(off..off + 4))
            .map_or(0, |b| u32::from_le_bytes([b[0], b[1], b[2], b[3]]))
    }
    /// Write one 32bpp sample at `(x, y)` in sample coordinates. Mirror of
    /// [`Self::read_sample_32bpp`]: a zero pitch makes this a no-op. Used by
    /// the wgpu→ShadowEdram readback retile path and unit tests.
    pub fn write_sample_32bpp(
        &mut self,
        base_tiles: u16,
        pitch_tiles: u32,
        x: u32,
        y: u32,
        sample: u32,
    ) {
        let Some(off) = Self::sample_offset_32bpp(base_tiles, pitch_tiles, x, y) else {
            return;
        };
        if let Some(dst) = self.bytes.get_mut(off..off + 4) {
            dst.copy_from_slice(&sample.to_le_bytes());
        }
    }
    /// Bulk write a `(w × h)`-sample rectangle at `(x, y)` from a row-major
    /// linear `samples` buffer. The buffer length must be at least `w * h`
    /// (checked with a `debug_assert!`; in release the write stops at the
    /// first missing sample); extra entries are ignored. Order:
    /// `samples[dy * w + dx]` lands at (x + dx, y + dy). This is the format
    /// the wgpu→ShadowEdram readback path uses after stripping wgpu's
    /// 256-byte row alignment.
    #[allow(clippy::too_many_arguments)]
    pub fn write_rect_32bpp(
        &mut self,
        base_tiles: u16,
        pitch_tiles: u32,
        x: u32,
        y: u32,
        w: u32,
        h: u32,
        samples: &[u32],
    ) {
        let needed = (w as usize).saturating_mul(h as usize);
        debug_assert!(samples.len() >= needed, "write_rect_32bpp: samples too short");
        for dy in 0..h {
            let Some(sy) = y.checked_add(dy) else {
                break;
            };
            let row_base = (dy as usize).saturating_mul(w as usize);
            for dx in 0..w {
                let Some(&sample) = samples.get(row_base.saturating_add(dx as usize)) else {
                    return;
                };
                let Some(sx) = x.checked_add(dx) else {
                    break;
                };
                self.write_sample_32bpp(base_tiles, pitch_tiles, sx, sy, sample);
            }
        }
    }
    // --- 64bpp helpers ----------------------------------------------------
    //
    // 64bpp formats (`k_16_16_16_16`, `k_16_16_16_16_FLOAT`, `k_32_32_FLOAT`)
    // occupy two adjacent EDRAM tiles per logical tile, doubling the row
    // pitch in tiles. Per Canary `xenos.h:321-325 IsColorRenderTargetFormat64bpp`
    // and `draw_util.cc:1260-1262` (`pitch_tiles = surface_pitch_tiles << is_64bpp`).
    //
    // Convention: callers pass the *32bpp-equivalent* `base_tiles` and
    // `pitch_tiles_32bpp` (i.e. the `RB_COLOR_INFO.color_base` and
    // `surface_pitch_tiles` decoded from registers). The 64bpp helpers
    // multiply both by 2 internally so the lo/hi pair lands in adjacent
    // 32bpp samples. `lo` is the lower-addressed 32bpp word; `hi` is the upper.
    /// Read one 64bpp sample as `(lo, hi)` u32 pair. Doubled-tile addressing
    /// per Canary's `is_64bpp` convention. Returns `(0, 0)` when `x` is too
    /// large to address or the pitch is zero.
    pub fn read_sample_64bpp(
        &self,
        base_tiles: u16,
        pitch_tiles_32bpp: u32,
        x: u32,
        y: u32,
    ) -> (u32, u32) {
        match Self::doubled_addressing_64bpp(base_tiles, pitch_tiles_32bpp, x) {
            Some((base, pitch, x_lo, x_hi)) => (
                self.read_sample_32bpp(base, pitch, x_lo, y),
                self.read_sample_32bpp(base, pitch, x_hi, y),
            ),
            None => (0, 0),
        }
    }
    /// Write one 64bpp sample as `(lo, hi)` u32 pair. A no-op when `x` is
    /// too large to address or the pitch is zero.
    pub fn write_sample_64bpp(
        &mut self,
        base_tiles: u16,
        pitch_tiles_32bpp: u32,
        x: u32,
        y: u32,
        lo: u32,
        hi: u32,
    ) {
        if let Some((base, pitch, x_lo, x_hi)) =
            Self::doubled_addressing_64bpp(base_tiles, pitch_tiles_32bpp, x)
        {
            self.write_sample_32bpp(base, pitch, x_lo, y, lo);
            self.write_sample_32bpp(base, pitch, x_hi, y, hi);
        }
    }
    /// Bulk write a 64bpp rectangle from a row-major `(lo, hi)` linear
    /// buffer: `samples[dy * w + dx]` lands at (x + dx, y + dy). The write
    /// stops at the first sample missing from `samples`.
    #[allow(clippy::too_many_arguments)]
    pub fn write_rect_64bpp(
        &mut self,
        base_tiles: u16,
        pitch_tiles_32bpp: u32,
        x: u32,
        y: u32,
        w: u32,
        h: u32,
        samples: &[(u32, u32)],
    ) {
        for dy in 0..h {
            let Some(sy) = y.checked_add(dy) else {
                break;
            };
            let row_base = (dy as usize).saturating_mul(w as usize);
            for dx in 0..w {
                let Some(&(lo, hi)) = samples.get(row_base.saturating_add(dx as usize)) else {
                    return;
                };
                let Some(sx) = x.checked_add(dx) else {
                    break;
                };
                self.write_sample_64bpp(base_tiles, pitch_tiles_32bpp, sx, sy, lo, hi);
            }
        }
    }
    /// Fill a `(w × h)`-sample rectangle with a constant 64bpp pattern.
    /// `lo` lands at the low-addressed 32bpp word, `hi` at the high one
    /// — i.e. for clears, callers pass `(lo = RB_COLOR_CLEAR_LO,
    /// hi = RB_COLOR_CLEAR)` per Canary `draw_util.cc:1302-1303`.
    #[allow(clippy::too_many_arguments)]
    pub fn fill_rect_64bpp(
        &mut self,
        base_tiles: u16,
        pitch_tiles_32bpp: u32,
        x: u32,
        y: u32,
        w: u32,
        h: u32,
        lo: u32,
        hi: u32,
    ) {
        for dy in 0..h {
            let Some(sy) = y.checked_add(dy) else {
                break;
            };
            for dx in 0..w {
                let Some(sx) = x.checked_add(dx) else {
                    break;
                };
                self.write_sample_64bpp(base_tiles, pitch_tiles_32bpp, sx, sy, lo, hi);
            }
        }
    }
 }
 #[cfg(test)]
 mod tests {
    use super::*;

    /// True when every 32bpp sample of tile `tile_index` holds `pattern`
    /// (stored little-endian).
    fn tile_is_filled_with(e: &ShadowEdram, tile_index: u32, pattern: u32) -> bool {
        let want = pattern.to_le_bytes();
        e.tile(tile_index).chunks_exact(4).all(|sample| sample == want)
    }

    /// True when tile `tile_index` is still all zero bytes.
    fn tile_is_untouched(e: &ShadowEdram, tile_index: u32) -> bool {
        e.tile(tile_index).iter().all(|&byte| byte == 0)
    }

    #[test]
    fn shadow_edram_is_exactly_10_mib() {
        const TEN_MIB: usize = 10 * 1024 * 1024;
        assert_eq!(EDRAM_SIZE_BYTES, TEN_MIB);
        assert_eq!(ShadowEdram::new().as_bytes().len(), TEN_MIB);
    }

    #[test]
    fn fill_rect_writes_the_whole_first_tile() {
        let mut edram = ShadowEdram::new();
        // One full 80x16 tile at base 0, pitch 1.
        edram.fill_rect_32bpp(0, 1, 0, 0, 80, 16, 0x11223344);
        assert!(tile_is_filled_with(&edram, 0, 0x11223344));
    }

    #[test]
    fn fill_rect_respects_pitch_and_base() {
        let mut edram = ShadowEdram::new();
        // base=5, pitch=2: a 160x16 fill covers exactly tiles 5 and 6 and
        // must leave their neighbours (and tile 0) alone.
        edram.fill_rect_32bpp(5, 2, 0, 0, 160, 16, 0xAABBCCDD);
        for filled in [5, 6] {
            assert!(tile_is_filled_with(&edram, filled, 0xAABBCCDD));
        }
        for clean in [4, 7, 0] {
            assert!(tile_is_untouched(&edram, clean));
        }
    }

    #[test]
    fn fill_rect_wraps_mod_2048() {
        let mut edram = ShadowEdram::new();
        // base=2047, pitch=2: the second tile of the row wraps around to 0.
        edram.fill_rect_32bpp(2047, 2, 0, 0, 160, 16, 0xDEAD_BEEF);
        assert!(tile_is_filled_with(&edram, 2047, 0xDEAD_BEEF));
        assert!(tile_is_filled_with(&edram, 0, 0xDEAD_BEEF));
    }

    #[test]
    fn read_sample_roundtrips_fill_rect() {
        let mut edram = ShadowEdram::new();
        edram.fill_rect_32bpp(3, 1, 0, 0, 80, 16, 0xCAFE_F00D);
        // Both corners of the filled tile read back the pattern.
        for (x, y) in [(0, 0), (79, 15)] {
            assert_eq!(edram.read_sample_32bpp(3, 1, x, y), 0xCAFE_F00D);
        }
        // The next tile over was never written.
        assert_eq!(edram.read_sample_32bpp(4, 1, 0, 0), 0);
    }

    #[test]
    fn zero_pitch_is_a_noop_read() {
        assert_eq!(ShadowEdram::new().read_sample_32bpp(0, 0, 10, 10), 0);
    }

    /// A value stored with `write_sample_32bpp` comes back unchanged from
    /// `read_sample_32bpp` for every sample of a tile.
    #[test]
    fn write_sample_32bpp_round_trips() {
        let tag = |x: u32, y: u32| 0xABCD_0000 | (y << 8) | x;
        let mut edram = ShadowEdram::new();
        for x in 0..80u32 {
            for y in 0..16u32 {
                edram.write_sample_32bpp(0, 1, x, y, tag(x, y));
            }
        }
        for x in 0..80u32 {
            for y in 0..16u32 {
                assert_eq!(
                    edram.read_sample_32bpp(0, 1, x, y),
                    tag(x, y),
                    "round-trip mismatch at ({x},{y})"
                );
            }
        }
    }

    /// `write_rect_32bpp` places row-major samples at the right sample
    /// offsets even when the rectangle straddles a tile boundary.
    #[test]
    fn write_rect_32bpp_crosses_tile_boundary() {
        let mut edram = ShadowEdram::new();
        // pitch = 2 tiles, so x spans [0, 160). A 100x4 rect starting at
        // (40, 4) crosses the tile boundary at x = 80.
        let (w, h) = (100u32, 4u32);
        let samples: Vec<u32> = (0..h)
            .flat_map(|dy| (0..w).map(move |dx| 0x10000 | (dy << 8) | dx))
            .collect();
        edram.write_rect_32bpp(0, 2, 40, 4, w, h, &samples);
        // First sample (40, 4) is in tile 0; last sample (139, 7) is in tile 1.
        assert_eq!(edram.read_sample_32bpp(0, 2, 40, 4), 0x1_0000);
        assert_eq!(
            edram.read_sample_32bpp(0, 2, 139, 7),
            0x10000 | (3 << 8) | 99
        );
    }

    /// `write_sample_64bpp` / `read_sample_64bpp` round-trip: the doubled
    /// pitch keeps each lo/hi pair adjacent in EDRAM.
    #[test]
    fn write_read_sample_64bpp_roundtrips() {
        let mut edram = ShadowEdram::new();
        // 32bpp pitch=1, base=0 is doubled internally to pitch=2, base=0.
        // A 16x4 block of logical 64bpp samples occupies 32bpp columns
        // 0..32, comfortably inside the first tile.
        for x in 0..16u32 {
            for y in 0..4u32 {
                edram.write_sample_64bpp(0, 1, x, y, 0xAAAA_0000 | x, 0xBBBB_0000 | y);
            }
        }
        for x in 0..16u32 {
            for y in 0..4u32 {
                let (lo, hi) = edram.read_sample_64bpp(0, 1, x, y);
                assert_eq!(lo, 0xAAAA_0000 | x);
                assert_eq!(hi, 0xBBBB_0000 | y);
            }
        }
    }

    /// `fill_rect_64bpp` stores both clear words for every sample, in the
    /// `RB_COLOR_CLEAR_LO` / `RB_COLOR_CLEAR` order.
    #[test]
    fn fill_rect_64bpp_writes_both_words() {
        let mut edram = ShadowEdram::new();
        // 16x4 logical 64bpp samples on a pitch=1 (32bpp) surface.
        edram.fill_rect_64bpp(0, 1, 0, 0, 16, 4, 0xCAFE_F00D, 0xDEAD_BEEF);
        for x in 0..16u32 {
            for y in 0..4u32 {
                let (lo, hi) = edram.read_sample_64bpp(0, 1, x, y);
                assert_eq!(lo, 0xCAFE_F00D);
                assert_eq!(hi, 0xDEAD_BEEF);
            }
        }
    }

    /// The 64bpp helpers address through a doubled pitch: logical sample
    /// `x` occupies 32bpp samples `2x` (lo) and `2x + 1` (hi).
    #[test]
    fn sixty_four_bpp_uses_doubled_pitch() {
        let mut edram = ShadowEdram::new();
        edram.write_sample_64bpp(0, 1, 5, 0, 0x1111_1111, 0x2222_2222);
        // Viewed as a 32bpp surface with base=0, pitch=2: lo sits at
        // x = 10 (5 << 1) and hi right after it at x = 11.
        assert_eq!(edram.read_sample_32bpp(0, 2, 10, 0), 0x1111_1111);
        assert_eq!(edram.read_sample_32bpp(0, 2, 11, 0), 0x2222_2222);
    }

    /// Zero-width or zero-height rectangles must not touch EDRAM.
    #[test]
    fn write_rect_empty_is_noop() {
        let mut edram = ShadowEdram::new();
        edram.write_rect_32bpp(0, 1, 0, 0, 0, 5, &[1, 2, 3]);
        edram.write_rect_32bpp(0, 1, 0, 0, 5, 0, &[1, 2, 3]);
        edram.fill_rect_64bpp(0, 1, 0, 0, 0, 5, 1, 2);
        edram.fill_rect_64bpp(0, 1, 0, 0, 5, 0, 1, 2);
        // The buffer is still in its freshly-zeroed state.
        assert!(edram.as_bytes().iter().all(|&byte| byte == 0));
    }
 }
--- a/crates/xenia-gpu/src/gpu_system.rs
+++ b/crates/xenia-gpu/src/gpu_system.rs
--- a/crates/xenia-gpu/src/handle.rs
+++ b/crates/xenia-gpu/src/handle.rs
--- a/crates/xenia-gpu/src/lib.rs
+++ b/crates/xenia-gpu/src/lib.rs
@@ -1,21 +1,49 @@
 //! Xenos GPU emulation for xenia-rs.
 //!
 //! Modules:
 //! - [`pm4`]: packet format decoder + Type-3 opcode set.
 //! - [`ring_view`]: ring-buffer bookkeeping (base/size/read/write pointers).
 //! - [`register_file`]: 0x6000-entry register array backing the CP + state.
 //! - [`gpu_system`]: top-level `GpuSystem` + PM4 executor running one packet
 //!   per call (see the plan's P2 for the design rationale).
 //!
 //! Legacy modules `ring_drain` and `command_processor` are retained while P3+
 //! migrations finish; they will be removed once every caller is on
 //! [`gpu_system::GpuSystem`].
 pub mod command_processor;
 pub mod draw_state;
 pub mod edram;
 pub mod gpu_system;
 pub mod handle;
 pub mod mmio_region;
 pub mod pm4;
 pub mod primitive;
 pub mod register_file;
 pub mod ring_drain;
 pub mod ring_view;
 pub mod render_target_cache;
 pub mod resolve;
 pub mod shader_metrics;
 pub mod shaders;
 pub mod texture_cache;
 pub mod tiled_address;
 pub mod translator;
 pub mod ucode;
 pub mod xenos_constants;
-/// Stub GPU system for initial implementation.
+pub use gpu_system::{
-pub struct GpuSystem {
+    ExecOutcome, GpuBlock, GpuMmio, GpuStats, GpuSystem, InterruptSource, PendingInterrupt,
-    pub register_file: register_file::RegisterFile,
+    ShaderBlob, SwapNotification, WaitCmp,
-}
+};
-
+pub use handle::{
-impl GpuSystem {
+    DrainReply, GpuBackend, GpuCommand, GpuDigestSnapshot, GpuHandle, GpuWorker,
-    pub fn new() -> Self {
+    shutdown_and_join_with_timeout, spawn_gpu_worker, spawn_noop_worker,
-        Self {
+};
-            register_file: register_file::RegisterFile::new(),
+pub use mmio_region::build_region as build_mmio_region;
-        }
+pub use pm4::{
-    }
+    PacketHeader, PacketKind, PM4_INTERRUPT, PM4_NOP, PM4_XE_SWAP, SWAP_SIGNATURE,
-}
+    type3_opcode_name,
-
+};
-impl Default for GpuSystem {
+pub use ring_drain::{DrainResult, drain};
-    fn default() -> Self {
+pub use ring_view::RingBufferView;
        Self::new()
    }
 }
--- a/crates/xenia-gpu/src/mmio_region.rs
+++ b/crates/xenia-gpu/src/mmio_region.rs
@@ -0,0 +1,217 @@
 //! Construct an `xenia_memory::MmioRegion` that backs the Xenos GPU register
 //! aperture at guest physical `0x7FC80000` (per canary
 //! `graphics_system.cc:141-144` — `memory_->AddVirtualMappedRange(0x7FC80000,
 //! 0xFFFF0000, 0x0000FFFF, …)`).
 //!
 //! Only a handful of registers need a round-trip over the bus; everything
 //! else (the ALU / fetch constants, the RBBM state machine, …) lives inside
 //! `GpuSystem::register_file` and is driven by PM4 packets from the CP on
 //! the same host thread.
 //!
 //! The read/write closures capture `Arc<AtomicU32>` mailboxes cloned from
 //! [`crate::GpuMmio`]; [`crate::GpuSystem::sync_with_mmio`] samples them
 //! each scheduler round.
 use std::sync::atomic::Ordering;
 use xenia_memory::MmioRegion;
 use crate::gpu_system::{reg, GpuMmio};
 /// Xenos GPU register aperture base (guest physical address). Matches
 /// canary's `graphics_system.cc:141`.
 pub const APERTURE_BASE: u32 = 0x7FC8_0000;
 /// Mask used by `MmioRegion::contains` so any `0x7FC8xxxx` address hits.
 /// Only the upper 16 address bits take part in the match.
 pub const APERTURE_MASK: u32 = 0xFFFF_0000;
 /// Total aperture size in bytes (enough for the low 16-bit register window).
 /// The callbacks built in this module turn the low 16 address bits into a
 /// dword register index with `(addr & 0xFFFF) / 4`.
 pub const APERTURE_SIZE: u32 = 0x0001_0000;
 /// Build the [`MmioRegion`] to install on the guest memory.
 ///
 /// The returned region's callbacks share the atomics held by `mmio`, so a
 /// guest access through the bus and a host access through [`GpuMmio`]
 /// observe the same values. Both callbacks decode the register as
 /// `(addr & 0xFFFF) / 4` and handle:
 ///
 /// * reads of `CP_RB_WPTR`, `CP_RB_RPTR`, `CP_INT_STATUS`, `CP_INT_ACK` and
 ///   `D1MODE_VBLANK_VLINE_STATUS` — return the current mailbox value;
 /// * writes of `CP_RB_WPTR` — store the value and wake the GPU worker;
 /// * writes of `CP_INT_ACK` — store the value;
 /// * writes of `D1MODE_VBLANK_VLINE_STATUS` — write-1-to-clear.
 ///
 /// Any other register reads as 0 and ignores writes; both cases are traced.
 pub fn build_region(mmio: &GpuMmio) -> MmioRegion {
    let read_wptr = mmio.cp_rb_wptr.clone();
    let read_rptr = mmio.cp_rb_rptr.clone();
    let read_int_status = mmio.cp_int_status.clone();
    let read_int_ack = mmio.cp_int_ack.clone();
    let read_vblank_status = mmio.d1mode_vblank_vline_status.clone();
    let write_wptr = mmio.cp_rb_wptr.clone();
    let write_int_ack = mmio.cp_int_ack.clone();
    let write_vblank_status = mmio.d1mode_vblank_vline_status.clone();
    // M1.7 parker — captured into the WPTR write closure to wake a
    // parked GPU worker on every guest WPTR write. In inline mode the
    // mutex holds `None`, so the unpark site is a brief lock + no-op.
    let wake_pending = mmio.wake_pending.clone();
    let worker_thread = mmio.worker_thread.clone();
    MmioRegion {
        base_address: APERTURE_BASE,
        mask: APERTURE_MASK,
        size: APERTURE_SIZE,
        read_callback: Box::new(move |addr: u32| {
            let reg_index = (addr & 0xFFFF) / 4;
            match reg_index {
                reg::CP_RB_WPTR => read_wptr.load(Ordering::Relaxed),
                reg::CP_RB_RPTR => read_rptr.load(Ordering::Relaxed),
                reg::CP_INT_STATUS => read_int_status.load(Ordering::Relaxed),
                // Games sometimes read-back the ack register to check interrupt ownership
                // — serve the last-written value.
                reg::CP_INT_ACK => read_int_ack.load(Ordering::Relaxed),
                reg::D1MODE_VBLANK_VLINE_STATUS => {
                    read_vblank_status.load(Ordering::Relaxed)
                }
                _ => {
                    tracing::trace!(
                        reg = format_args!("{reg_index:#x}"),
                        addr = format_args!("{addr:#010x}"),
                        "gpu mmio: unmapped read (returning 0)"
                    );
                    0
                }
            }
        }),
        write_callback: Box::new(move |addr: u32, value: u32| {
            let reg_index = (addr & 0xFFFF) / 4;
            match reg_index {
                reg::CP_RB_WPTR => {
                    // Release: any prior writes to ring memory the guest
                    // performed before bumping WPTR must be visible to
                    // the GPU consumer that Acquire-loads this atomic.
                    write_wptr.store(value, Ordering::Release);
                    // M1.7 parker wake: set the pending bit (Release) so
                    // a worker swapping it on its way to `park_timeout`
                    // sees `was_pending == true` and skips the park; AND
                    // unpark the worker if it's already parked. Both are
                    // necessary to defend against the race window between
                    // the worker's `swap(false)` and `park_timeout()`.
                    wake_pending.store(true, Ordering::Release);
                    // If the lock cannot be taken the unpark is skipped;
                    // the pending flag above has already been set.
                    if let Ok(g) = worker_thread.lock() {
                        if let Some(t) = g.as_ref() {
                            t.unpark();
                        }
                    }
                    tracing::trace!(
                        value,
                        addr = format_args!("{addr:#010x}"),
                        "gpu mmio: CP_RB_WPTR write"
                    );
                }
                // CP_INT_ACK clears interrupt bits; we just echo the value.
                reg::CP_INT_ACK => {
                    write_int_ack.store(value, Ordering::Relaxed);
                }
                // D1MODE_VBLANK_VLINE_STATUS is write-1-to-clear per the
                // AMD M56 display-controller ref. Clear any bit the guest
                // writes a 1 to (leaving other bits untouched).
                //
                // This must be one atomic read-modify-write: with a
                // separate load and store, a status bit set by another
                // thread in between would be overwritten and lost.
                reg::D1MODE_VBLANK_VLINE_STATUS => {
                    write_vblank_status.fetch_and(!value, Ordering::Relaxed);
                }
                _ => {
                    tracing::trace!(
                        reg = format_args!("{reg_index:#x}"),
                        addr = format_args!("{addr:#010x}"),
                        value = format_args!("{value:#x}"),
                        "gpu mmio: unmapped write (dropping)"
                    );
                }
            }
        }),
    }
 }
 #[cfg(test)]
 mod tests {
    use super::*;

    /// Fresh mailbox set plus the region wired to it.
    fn build() -> (GpuMmio, MmioRegion) {
        let mmio = GpuMmio::new();
        let region = build_region(&mmio);
        (mmio, region)
    }

    /// Guest byte address of the register with dword index `reg_index`.
    fn reg_addr(reg_index: u32) -> u32 {
        APERTURE_BASE + reg_index * 4
    }

    /// A read of `D1MODE_VBLANK_VLINE_STATUS` has to return whatever the
    /// atomic currently holds. Sylpheed's graphics-interrupt callback tests
    /// bit 0 to decide whether vblank fired; a constant 0 would make it
    /// skip every frame's work.
    #[test]
    fn vblank_status_read_returns_stored_value() {
        let (mmio, region) = build();
        mmio.d1mode_vblank_vline_status.store(0x1, Ordering::Relaxed);
        let value = (region.read_callback)(reg_addr(reg::D1MODE_VBLANK_VLINE_STATUS));
        assert_eq!(value, 0x1);
    }

    /// Writing a 1 clears that bit (write-1-to-clear, per the AMD M56
    /// display-controller ref and Canary). Bits written as 0 survive, so
    /// other status bits can stay set while bit 0 is acknowledged.
    #[test]
    fn vblank_status_write_1_to_clear() {
        let (mmio, region) = build();
        mmio.d1mode_vblank_vline_status.store(0b11, Ordering::Relaxed);
        (region.write_callback)(reg_addr(reg::D1MODE_VBLANK_VLINE_STATUS), 0b01);
        let remaining = mmio.d1mode_vblank_vline_status.load(Ordering::Relaxed);
        assert_eq!(remaining, 0b10, "bit 0 cleared, bit 1 preserved");
    }

    /// An all-zero write clears nothing — the other half of W1TC.
    #[test]
    fn vblank_status_write_0_is_noop() {
        let (mmio, region) = build();
        mmio.d1mode_vblank_vline_status.store(0b11, Ordering::Relaxed);
        (region.write_callback)(reg_addr(reg::D1MODE_VBLANK_VLINE_STATUS), 0x0);
        let remaining = mmio.d1mode_vblank_vline_status.load(Ordering::Relaxed);
        assert_eq!(remaining, 0b11);
    }

    /// Regression: `reg::CP_RB_WPTR` once held a byte offset (`0x0714`)
    /// while the match arm compared it with a *register index*
    /// (`(addr & 0xFFFF) / 4 == 0x01C5`). Guest MMIO writes to WPTR then
    /// fell into the "unmapped" arm and never reached the atomic; only the
    /// `VdInitializeRingBuffer` / `extend_write_ptr` paths worked.
    ///
    /// This and the following tests check that each CP register reaches its
    /// atomic through the canonical `APERTURE_BASE + index*4` byte address.
    #[test]
    fn cp_rb_wptr_write_via_mmio_bus_reaches_atomic() {
        let (mmio, region) = build();
        let addr = reg_addr(reg::CP_RB_WPTR);
        assert_eq!(addr, 0x7FC8_0714, "byte offset must match Canary CP_RB_WPTR");
        (region.write_callback)(addr, 0x1234_5678);
        assert_eq!(mmio.cp_rb_wptr.load(Ordering::Relaxed), 0x1234_5678);
    }

    #[test]
    fn cp_int_ack_write_via_mmio_bus_reaches_atomic() {
        let (mmio, region) = build();
        let addr = reg_addr(reg::CP_INT_ACK);
        assert_eq!(addr, 0x7FC8_07D0, "byte offset must match Canary CP_INT_ACK");
        (region.write_callback)(addr, 0xDEAD_BEEF);
        assert_eq!(mmio.cp_int_ack.load(Ordering::Relaxed), 0xDEAD_BEEF);
    }

    #[test]
    fn cp_rb_rptr_read_via_mmio_bus_returns_atomic() {
        let (mmio, region) = build();
        mmio.cp_rb_rptr.store(0xCAFE_F00D, Ordering::Relaxed);
        let value = (region.read_callback)(reg_addr(reg::CP_RB_RPTR));
        assert_eq!(value, 0xCAFE_F00D);
    }

    #[test]
    fn cp_int_status_read_via_mmio_bus_returns_atomic() {
        let (mmio, region) = build();
        mmio.cp_int_status.store(0x0000_0001, Ordering::Relaxed);
        let value = (region.read_callback)(reg_addr(reg::CP_INT_STATUS));
        assert_eq!(value, 0x0000_0001);
    }
 }
--- a/crates/xenia-gpu/src/pm4.rs
+++ b/crates/xenia-gpu/src/pm4.rs
@@ -0,0 +1,232 @@
 //! PM4 packet format — header decoding + Type-3 opcode set.
 //!
 //! Xenos PM4 packet layout mirrors `xenia-canary/src/xenia/gpu/packet_disassembler.cc`:
 //!
 //! - **Type 0** (`packet >> 30 == 0`): register-write run.
 //!   `count = ((packet >> 16) & 0x3FFF) + 1`. Total dwords = `1 + count`.
 //!   With `(packet >> 15) & 1 == 1`, all writes target the same register.
 //! - **Type 1** (`packet >> 30 == 1`): two-register write. Total dwords = 3.
 //! - **Type 2** (`packet >> 30 == 2`): NOP — a single skipped dword.
 //! - **Type 3** (`packet >> 30 == 3`): command.
 //!   `opcode = (packet >> 8) & 0x7F`,
 //!   `count  = ((packet >> 16) & 0x3FFF) + 1`.
 //!   Total dwords = `1 + count`.
 /// The cookie canary writes alongside `PM4_XE_SWAP` so tooling can recognize
 /// swap packets (`kSwapSignature`).
 ///
 /// Byte order: from most- to least-significant byte the value is
 /// `0x58 0x4E 0x45 0x58`, i.e. `'X','N','E','X'`. It only spells
 /// `'X','E','N','X'` when the bytes are read least-significant first
 /// (little-endian storage order), not big-endian.
 /// NOTE(review): confirm this constant against canary's `kSwapSignature`.
 pub const SWAP_SIGNATURE: u32 = 0x584E_4558;
 // ── Named Type-3 opcodes (from xenia-canary/src/xenia/gpu/xenos.h:1617-1679) ──
 // Each value is the opcode field that `decode` extracts from a Type-3 header
 // with `(header >> 8) & 0x7F`; `type3_opcode_name` maps them back to names.
 pub const PM4_ME_INIT: u8 = 0x48;
 pub const PM4_NOP: u8 = 0x10;
 pub const PM4_INDIRECT_BUFFER: u8 = 0x3F;
 pub const PM4_INDIRECT_BUFFER_PFD: u8 = 0x37;
 pub const PM4_WAIT_FOR_IDLE: u8 = 0x26;
 pub const PM4_WAIT_REG_MEM: u8 = 0x3C;
 pub const PM4_REG_RMW: u8 = 0x21;
 pub const PM4_REG_TO_MEM: u8 = 0x3E;
 pub const PM4_MEM_WRITE: u8 = 0x3D;
 pub const PM4_COND_WRITE: u8 = 0x45;
 pub const PM4_EVENT_WRITE: u8 = 0x46;
 pub const PM4_EVENT_WRITE_SHD: u8 = 0x58;
 pub const PM4_EVENT_WRITE_EXT: u8 = 0x5A;
 pub const PM4_EVENT_WRITE_ZPD: u8 = 0x5B;
 pub const PM4_DRAW_INDX: u8 = 0x22;
 pub const PM4_DRAW_INDX_2: u8 = 0x36;
 pub const PM4_VIZ_QUERY: u8 = 0x23;
 pub const PM4_SET_CONSTANT: u8 = 0x2D;
 pub const PM4_SET_CONSTANT2: u8 = 0x55;
 pub const PM4_SET_SHADER_CONSTANTS: u8 = 0x56;
 pub const PM4_LOAD_ALU_CONSTANT: u8 = 0x2F;
 pub const PM4_IM_LOAD: u8 = 0x27;
 pub const PM4_IM_LOAD_IMMEDIATE: u8 = 0x2B;
 pub const PM4_LOAD_CONSTANT_CONTEXT: u8 = 0x2E;
 pub const PM4_INVALIDATE_STATE: u8 = 0x3B;
 pub const PM4_INTERRUPT: u8 = 0x54;
 pub const PM4_SET_SHADER_BASES: u8 = 0x4A;
 pub const PM4_SET_BIN_MASK_LO: u8 = 0x60;
 pub const PM4_SET_BIN_MASK_HI: u8 = 0x61;
 pub const PM4_SET_BIN_SELECT_LO: u8 = 0x62;
 pub const PM4_SET_BIN_SELECT_HI: u8 = 0x63;
 pub const PM4_SET_BIN_MASK: u8 = 0x50;
 pub const PM4_SET_BIN_SELECT: u8 = 0x51;
 pub const PM4_CONTEXT_UPDATE: u8 = 0x5E;
 /// Xenia-specific: `VdSwap` writes this to trigger a present.
 pub const PM4_XE_SWAP: u8 = 0x64;
 /// Human-readable name for a Type-3 opcode. Used for tracing spans.
 pub fn type3_opcode_name(op: u8) -> &'static str {
    match op {
        PM4_ME_INIT => "ME_INIT",
        PM4_NOP => "NOP",
        PM4_INDIRECT_BUFFER => "INDIRECT_BUFFER",
        PM4_INDIRECT_BUFFER_PFD => "INDIRECT_BUFFER_PFD",
        PM4_WAIT_FOR_IDLE => "WAIT_FOR_IDLE",
        PM4_WAIT_REG_MEM => "WAIT_REG_MEM",
        PM4_REG_RMW => "REG_RMW",
        PM4_REG_TO_MEM => "REG_TO_MEM",
        PM4_MEM_WRITE => "MEM_WRITE",
        PM4_COND_WRITE => "COND_WRITE",
        PM4_EVENT_WRITE => "EVENT_WRITE",
        PM4_EVENT_WRITE_SHD => "EVENT_WRITE_SHD",
        PM4_EVENT_WRITE_EXT => "EVENT_WRITE_EXT",
        PM4_EVENT_WRITE_ZPD => "EVENT_WRITE_ZPD",
        PM4_DRAW_INDX => "DRAW_INDX",
        PM4_DRAW_INDX_2 => "DRAW_INDX_2",
        PM4_VIZ_QUERY => "VIZ_QUERY",
        PM4_SET_CONSTANT => "SET_CONSTANT",
        PM4_SET_CONSTANT2 => "SET_CONSTANT2",
        PM4_SET_SHADER_CONSTANTS => "SET_SHADER_CONSTANTS",
        PM4_LOAD_ALU_CONSTANT => "LOAD_ALU_CONSTANT",
        PM4_LOAD_CONSTANT_CONTEXT => "LOAD_CONSTANT_CONTEXT",
        PM4_IM_LOAD => "IM_LOAD",
        PM4_IM_LOAD_IMMEDIATE => "IM_LOAD_IMMEDIATE",
        PM4_INVALIDATE_STATE => "INVALIDATE_STATE",
        PM4_INTERRUPT => "INTERRUPT",
        PM4_SET_SHADER_BASES => "SET_SHADER_BASES",
        PM4_SET_BIN_MASK_LO => "SET_BIN_MASK_LO",
        PM4_SET_BIN_MASK_HI => "SET_BIN_MASK_HI",
        PM4_SET_BIN_SELECT_LO => "SET_BIN_SELECT_LO",
        PM4_SET_BIN_SELECT_HI => "SET_BIN_SELECT_HI",
        PM4_SET_BIN_MASK => "SET_BIN_MASK",
        PM4_SET_BIN_SELECT => "SET_BIN_SELECT",
        PM4_CONTEXT_UPDATE => "CONTEXT_UPDATE",
        PM4_XE_SWAP => "XE_SWAP",
        _ => "UNKNOWN",
    }
 }
 /// Decoded single PM4 packet header, as produced by [`decode`].
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub struct PacketHeader {
    /// Packet type together with the fields extracted from the header word.
    pub kind: PacketKind,
    /// Total size of the packet (including header) in dwords.
    pub total_dwords: u32,
 }
 /// Classification of a PM4 packet, selected by bits 31:30 of the header.
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub enum PacketKind {
    /// Type-0 register-write run. `base_index` is the first register index
    /// (the register offset / 4). `write_one` is true if all `count` data
    /// dwords write to the same register. `count` is the number of data
    /// dwords that follow the header (header field + 1).
    Type0 {
        base_index: u32,
        count: u32,
        write_one: bool,
    },
    /// Type-1 two-register write. Both register indices are 11-bit fields
    /// of the header (bits 10:0 and 21:11).
    Type1 { reg_index_1: u32, reg_index_2: u32 },
    /// Type-2 NOP (a single skipped dword).
    Type2,
    /// Type-3 command. `opcode` is the 7-bit field at bits 14:8, `count` is
    /// the number of payload dwords after the header (header field + 1), and
    /// `predicated` mirrors bit 0 of the header.
    Type3 {
        opcode: u8,
        count: u32,
        predicated: bool,
    },
 }
 /// Decode a single PM4 packet header.
 ///
 /// The packet type lives in bits 31:30 of `header`. Every 32-bit value
 /// decodes to some packet, so this function never fails; `total_dwords`
 /// counts the header itself plus the payload.
 pub fn decode(header: u32) -> PacketHeader {
    // Type 0 and Type 3 share the same 14-bit "count minus one" field.
    let payload_dwords = ((header >> 16) & 0x3FFF) + 1;
    let (kind, total_dwords) = match header >> 30 {
        0 => (
            PacketKind::Type0 {
                base_index: header & 0x7FFF,
                count: payload_dwords,
                write_one: (header >> 15) & 1 != 0,
            },
            1 + payload_dwords,
        ),
        1 => (
            PacketKind::Type1 {
                reg_index_1: header & 0x7FF,
                reg_index_2: (header >> 11) & 0x7FF,
            },
            3,
        ),
        2 => (PacketKind::Type2, 1),
        // A u32 shifted right by 30 is at most 3, so this arm is Type 3.
        _ => (
            PacketKind::Type3 {
                opcode: ((header >> 8) & 0x7F) as u8,
                count: payload_dwords,
                predicated: (header & 1) != 0,
            },
            1 + payload_dwords,
        ),
    };
    PacketHeader { kind, total_dwords }
 }
 #[cfg(test)]
 mod tests {
    use super::*;

    /// A Type-2 header is a lone NOP dword.
    #[test]
    fn type2_is_one_dword() {
        // 0x80000000 == type 2 header (bits 31:30 = 10)
        let PacketHeader { kind, total_dwords } = decode(0x8000_0000);
        assert_eq!(kind, PacketKind::Type2);
        assert_eq!(total_dwords, 1);
    }

    /// The Type-0 count field stores "data dwords minus one".
    #[test]
    fn type0_count_is_inclusive() {
        // count field (bits 29:16) = 5 → 6 data dwords. base_index = 0x100.
        // write_one = 0.
        let hdr = decode((5 << 16) | 0x100);
        assert_eq!(
            hdr.kind,
            PacketKind::Type0 {
                base_index: 0x100,
                count: 6,
                write_one: false,
            }
        );
        assert_eq!(hdr.total_dwords, 7);
    }

    /// The swap packet decodes to opcode `PM4_XE_SWAP` with 4 payload dwords.
    #[test]
    fn type3_swap_packet() {
        // Build the exact header canary's VdSwap emits:
        //   MakePacketType3(PM4_XE_SWAP, 4) → ((3<<30) | ((4-1)<<16) | (0x64<<8))
        let swap_header = 0xC000_0000 | (3 << 16) | (u32::from(PM4_XE_SWAP) << 8);
        let hdr = decode(swap_header);
        assert_eq!(
            hdr.kind,
            PacketKind::Type3 {
                opcode: PM4_XE_SWAP,
                count: 4,
                predicated: false,
            }
        );
        assert_eq!(hdr.total_dwords, 5);
    }

    /// Common opcodes resolve to their names; unknown ones to "UNKNOWN".
    #[test]
    fn opcode_names_are_present_for_common_ops() {
        let expectations = [
            (PM4_NOP, "NOP"),
            (PM4_DRAW_INDX, "DRAW_INDX"),
            (PM4_XE_SWAP, "XE_SWAP"),
            (PM4_WAIT_REG_MEM, "WAIT_REG_MEM"),
            (0xFE, "UNKNOWN"),
        ];
        for &(opcode, expected) in &expectations {
            assert_eq!(type3_opcode_name(opcode), expected);
        }
    }
 }
--- a/crates/xenia-gpu/src/primitive.rs
+++ b/crates/xenia-gpu/src/primitive.rs
@@ -0,0 +1,229 @@
 //! Primitive processor — normalize Xenos primitives into host-GPU forms.
 //!
 //! wgpu only exposes `PrimitiveTopology::{PointList, LineList, LineStrip,
 //! TriangleList, TriangleStrip}`. For everything else (fans, quads,
 //! rectangles) we rewrite indices on the CPU side so the host just sees a
 //! triangle list. Ground truth: `xenia-canary/src/xenia/gpu/primitive_processor.h/cc`.
 //!
 //! P3 scope: only the shapes Sylpheed's UI + early gameplay paths need
 //! (list, strip, fan, quad). Rectangle-list expansion is still a stub: it
 //! logs via `tracing::warn!` and rejects the draw.
 use crate::draw_state::{IndexSize, PrimitiveType};
 /// Host primitive topology — a subset of wgpu's that we commit to.
 ///
 /// Point, line and triangle list/strip primitives keep their own topology;
 /// every primitive that is rewritten on the CPU (fans, quads) or rejected is
 /// reported as `TriangleList`.
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub enum HostTopology {
    PointList,
    LineList,
    LineStrip,
    TriangleList,
    TriangleStrip,
 }
 /// Result of primitive processing.
 #[derive(Debug, Clone)]
 pub struct ProcessedPrimitive {
    /// Topology the host should draw with.
    pub topology: HostTopology,
    /// When the Xenos primitive needed client-side rewriting (fans, quads),
    /// this buffer holds the rewritten index sequence, always as 32-bit
    /// indices. `None` means the input index buffer is usable as-is.
    pub rewritten_indices: Option<Vec<u32>>,
    /// Post-processing vertex count — equals the input count when indices
    /// pass through unchanged, and the number of rewritten indices otherwise.
    pub host_vertex_count: u32,
    /// `true` if we rejected the primitive (unsupported shape) and the
    /// caller should skip this draw. Logged via `tracing::warn!`.
    pub rejected: bool,
 }
 /// Normalize a draw into a form the host can render.
 ///
 /// `indices` is `None` for `AutoIndex` draws; otherwise it's the decoded
 /// index stream (already endian-converted / widened to u32 by the caller).
 ///
 /// Point, line and triangle list/strip primitives pass through untouched.
 /// Fans and quads are rewritten into triangle lists, rectangle lists go to
 /// the stub expander, and `None` / unknown primitives are rejected with a
 /// warning and a `gpu.primitive.rejected` counter increment.
 pub fn process(
    primitive: PrimitiveType,
    vertex_count: u32,
    indices: Option<&[u32]>,
 ) -> ProcessedPrimitive {
    // Shapes the host draws directly map 1:1 onto a host topology; every
    // other arm returns early with its own result.
    let topology = match primitive {
        PrimitiveType::PointList => HostTopology::PointList,
        PrimitiveType::LineList => HostTopology::LineList,
        PrimitiveType::LineStrip => HostTopology::LineStrip,
        PrimitiveType::TriangleList => HostTopology::TriangleList,
        PrimitiveType::TriangleStrip => HostTopology::TriangleStrip,
        PrimitiveType::TriangleFan => return expand_fan(indices, vertex_count),
        PrimitiveType::RectangleList => return expand_rectangles(indices, vertex_count),
        PrimitiveType::QuadList => return expand_quads(indices, vertex_count),
        PrimitiveType::None | PrimitiveType::Unknown(_) => {
            tracing::warn!(?primitive, "gpu: rejecting unsupported primitive");
            metrics::counter!("gpu.primitive.rejected").increment(1);
            return ProcessedPrimitive {
                topology: HostTopology::TriangleList,
                rewritten_indices: None,
                host_vertex_count: 0,
                rejected: true,
            };
        }
    };
    pass_through(topology, vertex_count)
 }
 /// Build the result for a primitive the host can draw as-is: no index
 /// rewrite, unchanged vertex count, not rejected.
 fn pass_through(topology: HostTopology, vertex_count: u32) -> ProcessedPrimitive {
    let host_vertex_count = vertex_count;
    ProcessedPrimitive {
        rejected: false,
        rewritten_indices: None,
        host_vertex_count,
        topology,
    }
 }
 /// Convert a triangle fan to a triangle list. Fan indices `[0, 1, 2, 3, 4]`
 /// expand to triangles `(0,1,2), (0,2,3), (0,3,4)` — 3 × (n-2) host indices.
 ///
 /// `indices` is the widened index stream, or `None` for auto-index draws
 /// (vertex `i` is then its own index). If a stream is supplied but holds
 /// fewer than `vertex_count` entries, only the entries that exist are
 /// expanded; the function never indexes past the end of the stream.
 /// Fewer than three usable vertices yields an empty, non-rejected draw.
 fn expand_fan(indices: Option<&[u32]>, vertex_count: u32) -> ProcessedPrimitive {
    // Clamp to what the index stream really contains: the stream can be
    // shorter than the draw's declared count (e.g. a truncated guest
    // buffer), and indexing past it would panic.
    let usable = match indices {
        // The minimum is <= vertex_count, so narrowing back to u32 is lossless.
        Some(buf) => (vertex_count as usize).min(buf.len()) as u32,
        None => vertex_count,
    };
    if usable < 3 {
        return ProcessedPrimitive {
            topology: HostTopology::TriangleList,
            rewritten_indices: Some(Vec::new()),
            host_vertex_count: 0,
            rejected: false,
        };
    }
    let fetch = |i: u32| -> u32 {
        match indices {
            Some(buf) => buf[i as usize],
            None => i,
        }
    };
    let apex = fetch(0);
    let mut out = Vec::with_capacity(3 * (usable as usize - 2));
    // Every triangle shares the apex and uses one consecutive pair of the
    // remaining vertices.
    for i in 1..usable - 1 {
        out.extend_from_slice(&[apex, fetch(i), fetch(i + 1)]);
    }
    let host_vertex_count = out.len() as u32;
    ProcessedPrimitive {
        topology: HostTopology::TriangleList,
        rewritten_indices: Some(out),
        host_vertex_count,
        rejected: false,
    }
 }
 /// Convert a quad list (groups of 4) to a triangle list (groups of 6).
 ///
 /// Each quad `(a, b, c, d)` becomes the triangles `(a, b, c)` and
 /// `(a, c, d)`. Trailing vertices that do not form a complete quad are
 /// dropped.
 ///
 /// `indices` is the widened index stream, or `None` for auto-index draws
 /// (vertex `i` is then its own index). If a stream is supplied but holds
 /// fewer than `vertex_count` entries, only the complete quads present in
 /// the stream are expanded; the function never indexes past its end.
 fn expand_quads(indices: Option<&[u32]>, vertex_count: u32) -> ProcessedPrimitive {
    // Clamp to what the index stream really contains: the stream can be
    // shorter than the draw's declared count (e.g. a truncated guest
    // buffer), and indexing past it would panic.
    let usable = match indices {
        // The minimum is <= vertex_count, so narrowing back to u32 is lossless.
        Some(buf) => (vertex_count as usize).min(buf.len()) as u32,
        None => vertex_count,
    };
    let quad_count = usable / 4;
    let mut out = Vec::with_capacity(6 * quad_count as usize);
    let fetch = |i: u32| -> u32 {
        match indices {
            Some(buf) => buf[i as usize],
            None => i,
        }
    };
    for q in 0..quad_count {
        let base = q * 4;
        let a = fetch(base);
        let b = fetch(base + 1);
        let c = fetch(base + 2);
        let d = fetch(base + 3);
        out.extend_from_slice(&[a, b, c, a, c, d]);
    }
    let host_vertex_count = out.len() as u32;
    ProcessedPrimitive {
        topology: HostTopology::TriangleList,
        rewritten_indices: Some(out),
        host_vertex_count,
        rejected: false,
    }
 }
 /// Rectangle lists: a Xenos-specific primitive where each group of 3
 /// vertices defines a right-angle rectangle by its three non-repeated
 /// corners (the 4th is derived).
 ///
 /// Not implemented yet (P3 stub): both arguments are ignored, a warning is
 /// logged, the `gpu.primitive.rejected` counter is bumped with
 /// `reason = rectangle_list`, and an empty, rejected draw is returned.
 fn expand_rectangles(_indices: Option<&[u32]>, _vertex_count: u32) -> ProcessedPrimitive {
    tracing::warn!("gpu: rectangle list primitive not yet implemented (P3 stub)");
    metrics::counter!("gpu.primitive.rejected", "reason" => "rectangle_list").increment(1);
    let nothing_to_draw = Vec::new();
    ProcessedPrimitive {
        rejected: true,
        host_vertex_count: 0,
        rewritten_indices: Some(nothing_to_draw),
        topology: HostTopology::TriangleList,
    }
 }
 /// Decode a raw big-endian index buffer into `u32` indices.
 ///
 /// `size` selects 16-bit or 32-bit source indices; 16-bit values are
 /// zero-extended. The primitive processor normalizes to u32 so downstream
 /// wgpu pipeline descriptors stay simple.
 ///
 /// At most `count` indices are produced. If `raw` ends before `count`
 /// complete indices have been read, decoding stops early and the shorter
 /// vector is returned (a trailing partial index is ignored).
 pub fn widen_indices(raw: &[u8], size: IndexSize, count: u32) -> Vec<u32> {
    let wanted = count as usize;
    let mut widened = Vec::with_capacity(wanted);
    // Xenos indices are big-endian on the wire.
    match size {
        IndexSize::Sixteen => widened.extend(
            raw.chunks_exact(2)
                .take(wanted)
                .map(|pair| u32::from(u16::from_be_bytes([pair[0], pair[1]]))),
        ),
        IndexSize::ThirtyTwo => widened.extend(
            raw.chunks_exact(4)
                .take(wanted)
                .map(|quad| u32::from_be_bytes([quad[0], quad[1], quad[2], quad[3]])),
        ),
    }
    widened
 }
 #[cfg(test)]
 mod tests {
    use super::*;

    /// Natively supported shapes come back untouched.
    #[test]
    fn triangle_list_passes_through() {
        let ProcessedPrimitive {
            topology,
            rewritten_indices,
            host_vertex_count,
            rejected,
        } = process(PrimitiveType::TriangleList, 6, None);
        assert_eq!(topology, HostTopology::TriangleList);
        assert!(rewritten_indices.is_none());
        assert_eq!(host_vertex_count, 6);
        assert!(!rejected);
    }

    /// An auto-indexed fan is rewritten around its first vertex.
    #[test]
    fn fan_to_list_expands_correctly() {
        // Fan of 5 vertices → triangles (0,1,2), (0,2,3), (0,3,4)
        let fan = process(PrimitiveType::TriangleFan, 5, None);
        let expected: &[u32] = &[0, 1, 2, 0, 2, 3, 0, 3, 4];
        assert_eq!(fan.rewritten_indices.as_deref(), Some(expected));
        assert_eq!(fan.topology, HostTopology::TriangleList);
        assert_eq!(fan.host_vertex_count, 9);
    }

    /// Two auto-indexed quads become four triangles.
    #[test]
    fn quad_list_expansion() {
        let quads = process(PrimitiveType::QuadList, 8, None);
        let expected: &[u32] = &[0, 1, 2, 0, 2, 3, 4, 5, 6, 4, 6, 7];
        assert_eq!(quads.rewritten_indices.as_deref(), Some(expected));
    }

    /// 16-bit indices are read big-endian and zero-extended.
    #[test]
    fn widen_u16_indices_big_endian() {
        // 3 indices [1, 2, 0x1234] in BE u16.
        let raw = [0, 1, 0, 2, 0x12, 0x34];
        let widened = widen_indices(&raw, IndexSize::Sixteen, 3);
        assert_eq!(widened, vec![1, 2, 0x1234]);
    }

    /// Primitive types we do not recognise are flagged as rejected.
    #[test]
    fn rejects_unknown_primitive() {
        let outcome = process(PrimitiveType::Unknown(0x2A), 3, None);
        assert!(outcome.rejected);
    }
 }
--- a/crates/xenia-gpu/src/render_target_cache.rs
+++ b/crates/xenia-gpu/src/render_target_cache.rs
@@ -0,0 +1,384 @@
 //! EDRAM tile book + render-target key bookkeeping.
 //!
 //! Mirrors `xenia-canary/src/xenia/gpu/render_target_cache.h` at the data-
 //! structure level. Xenos's 10 MiB EDRAM is divided into 2048 "tiles" of
 //! 80×16 samples each; render targets claim a contiguous range of those
 //! tiles based on `(base_tiles, pitch_tiles_at_32bpp, msaa_samples, format,
 //! is_depth)`. Two render targets with overlapping tile ranges share the
 //! underlying EDRAM — canary tracks this with per-tile "Host vs Shared"
 //! ownership, which is what this module's `TileOwner` captures.
 //!
 //! P4 ships the **bookkeeping**. Actual host texture allocation per key (so
 //! the host can draw into a wgpu texture matching the guest's RT) is left to
 //! a future host-side cache built on top of this module; the same for
 //! format-conversion compute shaders (the plan's P5 territory).
 use std::collections::HashMap;
 /// Number of EDRAM tiles on Xenos. Matches canary's `xenos::kEdramTileCount`.
 pub const EDRAM_TILE_COUNT: usize = 2048;
 /// MSAA sample count encoded into [`RenderTargetKey`]. Canary uses this as
 /// `xenos::MsaaSamples` (1×/2×/4×). The discriminant is log2 of the sample
 /// count.
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 pub enum MsaaSamples {
    X1 = 0,
    X2 = 1,
    X4 = 2,
 }
 impl MsaaSamples {
    /// Decode the low two bits of `raw`. The unassigned encoding `3` falls
    /// back to `X1`, as does `0`.
    pub fn from_raw(raw: u32) -> Self {
        match raw & 0x3 {
            1 => MsaaSamples::X2,
            2 => MsaaSamples::X4,
            _ => MsaaSamples::X1,
        }
    }
    /// Number of samples per pixel: 1, 2 or 4.
    pub fn count(self) -> u32 {
        1u32 << (self as u32)
    }
 }
 /// The packed EDRAM render-target identity. Bit layout matches
 /// `render_target_cache.h:251-321`'s `RenderTargetKey` union (26 bits used,
 /// stored as a single `u32` so it hashes cheaply). `pitch_tiles_at_32bpp`
 /// is always the 32bpp-equivalent pitch — a 64bpp target stores twice as
 /// many bytes per sample, so its real pitch in EDRAM tiles is double that
 /// value (canary's `GetPitchTiles()`; see [`RenderTargetKey::tile_pitch`]).
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 pub struct RenderTargetKey {
    /// First EDRAM tile of the surface; 11 bits when packed, `[0..2048)`.
    pub base_tiles: u16,
    /// Row pitch in tiles as if the surface were 32bpp; 8 bits when packed.
    pub pitch_tiles_at_32bpp: u16,
    pub msaa_samples: MsaaSamples,
    pub is_depth: bool,
    /// Color format: `xenos::ColorRenderTargetFormat` when !is_depth.
    /// Depth format: `xenos::DepthRenderTargetFormat` when is_depth.
    /// 4 bits when packed.
    pub resource_format: u8,
 }
 impl RenderTargetKey {
    /// Pack into canary's 26-bit layout. Useful for compact storage /
    /// hashing when we add a LRU cache later on.
    ///
    /// Layout, low to high: `base_tiles` (11 bits), `pitch_tiles_at_32bpp`
    /// (8 bits), `msaa_samples` (2 bits), `is_depth` (1 bit),
    /// `resource_format` (4 bits). Values wider than their slot are
    /// truncated to the slot width.
    pub fn pack(&self) -> u32 {
        (self.base_tiles as u32 & 0x7FF)
            | (((self.pitch_tiles_at_32bpp as u32) & 0xFF) << 11)
            | (((self.msaa_samples as u32) & 0x3) << 19)
            | ((self.is_depth as u32) << 21)
            | (((self.resource_format as u32) & 0xF) << 22)
    }
    /// Inverse of [`RenderTargetKey::pack`]. Bits above the 26-bit layout
    /// are ignored; an MSAA field of `3` decodes as `X1`.
    pub fn unpack(raw: u32) -> Self {
        Self {
            base_tiles: (raw & 0x7FF) as u16,
            pitch_tiles_at_32bpp: ((raw >> 11) & 0xFF) as u16,
            msaa_samples: MsaaSamples::from_raw((raw >> 19) & 0x3),
            is_depth: ((raw >> 21) & 1) != 0,
            resource_format: ((raw >> 22) & 0xF) as u8,
        }
    }
    /// Row pitch of the surface in real EDRAM tiles.
    ///
    /// An EDRAM tile holds 80 × 16 samples at 32 bits per sample, so a
    /// 64bpp surface of the same sample width needs two tiles where a 32bpp
    /// surface needs one: the pitch doubles (saturating at `u16::MAX`).
    /// The key carries no height; callers that know it can use
    /// [`RenderTargetKey::tile_footprint_with_height`] for the full extent.
    pub fn tile_pitch(&self) -> u16 {
        if self.is_64bpp() {
            self.pitch_tiles_at_32bpp.saturating_mul(2)
        } else {
            self.pitch_tiles_at_32bpp
        }
    }
    /// Whether each sample occupies 64 bits. Depth targets are never 64bpp.
    pub fn is_64bpp(&self) -> bool {
        if self.is_depth {
            false
        } else {
            // Canary's `IsColorRenderTargetFormat64bpp`:
            // `ColorRenderTargetFormat::k_16_16_16_16` (5),
            // `k_16_16_16_16_FLOAT` (7) and `k_32_32_FLOAT` (15).
            // Value 4 is `k_16_16`, which is a 32bpp format.
            matches!(self.resource_format, 5 | 7 | 15)
        }
    }
    /// Tiles claimed by this RT if its surface height is `rows_of_tiles`
    /// (i.e. `ceil(height_in_samples / 16)`). Saturates at `u16::MAX`.
    pub fn tile_footprint_with_height(&self, rows_of_tiles: u16) -> u16 {
        self.tile_pitch().saturating_mul(rows_of_tiles)
    }
 }
 /// Current owner of a single EDRAM tile.
 ///
 /// - `None`: untouched; free to claim. This is the default.
 /// - `Host(idx)`: a single RT has exclusive ownership.
 /// - `Shared(idx)`: two+ RT keys map to the same tile (usually after a
 ///   format change without an intervening clear); the named RT is the most
 ///   recent owner whose format should be honored for readback.
 #[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
 pub enum TileOwner {
    #[default]
    None,
    Host(u32),
    Shared(u32),
 }
 /// Ownership ledger for the 2048 EDRAM tiles. Not a GPU resource by itself —
 /// it only records which render target (by index) currently owns each tile.
 pub struct EdramTileBook {
    tiles: Vec<TileOwner>,
 }
 impl Default for EdramTileBook {
    fn default() -> Self {
        Self::new()
    }
 }
 impl EdramTileBook {
    /// Create a book in which every tile is unowned.
    pub fn new() -> Self {
        let tiles = vec![TileOwner::None; EDRAM_TILE_COUNT];
        Self { tiles }
    }
    /// Owner of `tile`; out-of-range tile numbers report `TileOwner::None`.
    pub fn who_owns(&self, tile: u16) -> TileOwner {
        match self.tiles.get(usize::from(tile)) {
            Some(&owner) => owner,
            None => TileOwner::None,
        }
    }
    /// Mark `[base, base+count)` as owned by `rt_idx`, clipping the range to
    /// the end of EDRAM. A tile held by a different RT becomes
    /// `Shared(rt_idx)` (format reinterpretation); a tile already held by
    /// `rt_idx` keeps its current state.
    /// Returns the number of tiles newly claimed (not previously the same
    /// owner).
    pub fn claim(&mut self, base: u16, count: u16, rt_idx: u32) -> u32 {
        let total = self.tiles.len();
        let first = usize::from(base).min(total);
        let last = (usize::from(base) + usize::from(count)).min(total);
        let mut fresh = 0u32;
        for slot in &mut self.tiles[first..last] {
            let (next, was_ours) = match *slot {
                TileOwner::None => (TileOwner::Host(rt_idx), false),
                // Re-claim by the same RT: state is left as it was.
                TileOwner::Host(idx) if idx == rt_idx => (TileOwner::Host(rt_idx), true),
                TileOwner::Shared(idx) if idx == rt_idx => (TileOwner::Shared(rt_idx), true),
                // Someone else held the tile: format change / shared range.
                TileOwner::Host(_) | TileOwner::Shared(_) => (TileOwner::Shared(rt_idx), false),
            };
            *slot = next;
            if !was_ours {
                fresh += 1;
            }
        }
        fresh
    }
    /// Drop `rt_idx` from every tile it is recorded on, whether `Host` or
    /// `Shared`; those tiles revert to `None`. For `Shared(rt_idx)` tiles
    /// the other sharer's ownership is lost as well — `release` is a coarse
    /// "this RT is gone" operation.
    pub fn release(&mut self, rt_idx: u32) {
        let gone = |owner: &TileOwner| match *owner {
            TileOwner::Host(idx) | TileOwner::Shared(idx) => idx == rt_idx,
            TileOwner::None => false,
        };
        self.tiles
            .iter_mut()
            .filter(|owner| gone(owner))
            .for_each(|owner| *owner = TileOwner::None);
    }
    /// Count tiles currently assigned to any RT (Host or Shared).
    pub fn occupied_count(&self) -> u32 {
        let occupied = self
            .tiles
            .iter()
            .filter(|owner| **owner != TileOwner::None)
            .count();
        occupied as u32
    }
 }
 /// Minimal per-RT descriptor stored alongside the tile book. P5's texture
 /// cache will expand this with the actual wgpu texture handle.
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub struct RtDescriptor {
    /// The render-target key this descriptor was created for.
    pub key: RenderTargetKey,
    /// Number of times this key has been bound since creation. Rough
    /// proxy for activity / hot-RT identification. Starts at 1 on the
    /// first `bind` and is incremented by every later `bind` of the key.
    pub bind_count: u32,
    /// Draw index on first bind — handy for debugging divergence.
    pub first_draw_index: u32,
 }
 /// Top-level cache: maps packed keys to small descriptors + the tile book.
 ///
 /// `by_key` resolves a packed [`RenderTargetKey`] to its RT index and
 /// `descriptors` resolves that index to its [`RtDescriptor`]. Indices are
 /// handed out sequentially starting at 0.
 pub struct RenderTargetCache {
    next_idx: u32,
    by_key: HashMap<u32, u32>,
    descriptors: HashMap<u32, RtDescriptor>,
    pub tiles: EdramTileBook,
 }
 impl Default for RenderTargetCache {
    fn default() -> Self {
        Self::new()
    }
 }
 impl RenderTargetCache {
    /// Create an empty cache with a fresh, fully unowned tile book.
    pub fn new() -> Self {
        Self {
            tiles: EdramTileBook::new(),
            descriptors: HashMap::new(),
            by_key: HashMap::new(),
            next_idx: 0,
        }
    }
    /// Look up or allocate an RT descriptor for `key` and return its index.
    /// `draw_index` is the current monotonic draw counter — recorded on
    /// first insert for provenance. Binding a known key bumps its
    /// `bind_count` and returns the existing index.
    pub fn bind(&mut self, key: RenderTargetKey, draw_index: u32) -> u32 {
        let packed = key.pack();
        match self.by_key.get(&packed).copied() {
            Some(existing) => {
                if let Some(desc) = self.descriptors.get_mut(&existing) {
                    desc.bind_count += 1;
                }
                existing
            }
            None => {
                let fresh = self.next_idx;
                self.next_idx += 1;
                self.by_key.insert(packed, fresh);
                let desc = RtDescriptor {
                    key,
                    bind_count: 1,
                    first_draw_index: draw_index,
                };
                self.descriptors.insert(fresh, desc);
                fresh
            }
        }
    }
    /// Descriptor registered under `idx`, if any.
    pub fn descriptor(&self, idx: u32) -> Option<&RtDescriptor> {
        self.descriptors.get(&idx)
    }
    /// Number of distinct render targets bound so far.
    pub fn len(&self) -> usize {
        self.descriptors.len()
    }
    /// `true` when no render target has been bound yet.
    pub fn is_empty(&self) -> bool {
        self.descriptors.is_empty()
    }
    /// Claim tiles for the descriptor at `rt_idx`. `height_tiles` is
    /// `ceil(viewport_height_samples / 16)` — callers supply it because
    /// the key itself doesn't carry height. Returns the number of newly
    /// claimed tiles, or 0 when `rt_idx` is unknown.
    pub fn claim_tiles(&mut self, rt_idx: u32, height_tiles: u16) -> u32 {
        let key = match self.descriptors.get(&rt_idx) {
            Some(desc) => desc.key,
            None => return 0,
        };
        let footprint = key.tile_footprint_with_height(height_tiles);
        self.tiles.claim(key.base_tiles, footprint, rt_idx)
    }
 }
 #[cfg(test)]
 mod tests {
    use super::*;

    /// A 32bpp, single-sampled colour target at tile 0 with an 80-tile
    /// pitch (a 1280-sample-wide surface).
    fn plain_color_key() -> RenderTargetKey {
        RenderTargetKey {
            base_tiles: 0,
            pitch_tiles_at_32bpp: 80,
            msaa_samples: MsaaSamples::X1,
            is_depth: false,
            resource_format: 0,
        }
    }

    /// Packing then unpacking a key must give the key back.
    #[test]
    fn render_target_key_pack_roundtrip() {
        let original = RenderTargetKey {
            base_tiles: 1600,
            pitch_tiles_at_32bpp: 80,
            msaa_samples: MsaaSamples::X4,
            is_depth: true,
            resource_format: 0b1010,
        };
        assert_eq!(RenderTargetKey::unpack(original.pack()), original);
    }

    /// Claiming fresh tiles makes the claimant their exclusive owner.
    #[test]
    fn tile_book_claim_marks_owners() {
        let mut book = EdramTileBook::new();
        assert_eq!(book.occupied_count(), 0);
        assert_eq!(book.claim(100, 10, 42), 10);
        for &(tile, owner) in &[
            (100, TileOwner::Host(42)),
            (109, TileOwner::Host(42)),
            (110, TileOwner::None),
        ] {
            assert_eq!(book.who_owns(tile), owner);
        }
    }

    /// Claiming over another RT's tiles turns the overlap into `Shared`.
    #[test]
    fn tile_book_claim_demotes_to_shared() {
        let mut book = EdramTileBook::new();
        book.claim(100, 10, 1);
        book.claim(105, 10, 2);
        // Overlap: tiles 105..110 should be Shared(2); 100..105 stay Host(1);
        // tiles 110..115 are fresh Host(2).
        for &(tile, owner) in &[
            (104, TileOwner::Host(1)),
            (105, TileOwner::Shared(2)),
            (110, TileOwner::Host(2)),
        ] {
            assert_eq!(book.who_owns(tile), owner);
        }
    }

    /// Releasing an RT frees every tile it held.
    #[test]
    fn tile_book_release_frees_all() {
        let mut book = EdramTileBook::new();
        book.claim(0, 50, 7);
        book.release(7);
        assert_eq!(book.occupied_count(), 0);
    }

    /// Binding the same key twice yields one descriptor with two binds.
    #[test]
    fn rt_cache_bind_is_idempotent_by_key() {
        let mut cache = RenderTargetCache::new();
        let key = plain_color_key();
        let first = cache.bind(key, 0);
        let second = cache.bind(key, 1);
        assert_eq!(first, second);
        let desc = cache.descriptor(first).unwrap();
        assert_eq!(desc.bind_count, 2);
        assert_eq!(desc.first_draw_index, 0);
    }

    /// A footprint larger than EDRAM is clipped to the 2048 available tiles.
    #[test]
    fn rt_cache_claim_tiles_tracks_footprint() {
        let mut cache = RenderTargetCache::new();
        let idx = cache.bind(plain_color_key(), 0);
        // 720 samples tall / 16 per tile = 45 rows → 80 * 45 = 3600 tiles;
        // caps out at 2048. Verify clamping.
        assert_eq!(cache.claim_tiles(idx, 45), 2048);
        assert_eq!(cache.tiles.occupied_count(), 2048);
    }
 }
--- a/crates/xenia-gpu/src/resolve.rs
+++ b/crates/xenia-gpu/src/resolve.rs
--- a/crates/xenia-gpu/src/ring_drain.rs
+++ b/crates/xenia-gpu/src/ring_drain.rs
@@ -0,0 +1,169 @@
 //! Ring-buffer drainer.
 //!
 //! Walks a guest PM4 ring buffer from `start_offset` forward, classifying each
 //! packet via [`crate::pm4`] and stopping when it either exhausts the packet
 //! budget it was given, consumes a `PM4_XE_SWAP`, or hits a header whose
 //! declared length exceeds the whole ring.
 //!
 //! It does **not** execute draws — that's deferred to a later phase. Its job
 //! is to (a) advance the read pointer far enough that games keep making
 //! progress, and (b) surface `PM4_XE_SWAP` packets so `VdSwap` can forward
 //! them to the host UI.
 use xenia_memory::MemoryAccess;
 use crate::pm4::{self, PacketKind};
 /// Outcome of a [`drain`] call.
 ///
 /// The `Default` value (all counters zero, `swap_seen == false`) is what
 /// [`drain`] returns when the ring size or the ring base is zero.
 #[derive(Default, Debug, Clone, Copy)]
 pub struct DrainResult {
    /// Dword offset reached, relative to the start of the ring buffer.
    /// Always reduced modulo the ring size by [`drain`].
    pub new_offset: u32,
    /// How many packets were walked in this call.
    pub packets_walked: u32,
    /// True if we saw `PM4_XE_SWAP` during the walk.
    pub swap_seen: bool,
    /// If `swap_seen`, the guest frontbuffer *physical* address written next
    /// to `PM4_XE_SWAP` (dword 2 of the 4-payload packet). Left at `0` when
    /// no swap was seen.
    pub swap_frontbuffer_phys: u32,
    /// If `swap_seen`, the width written at dword 3. Left at `0` otherwise.
    pub swap_width: u32,
    /// If `swap_seen`, the height written at dword 4. Left at `0` otherwise.
    pub swap_height: u32,
 }
 /// Walk up to `max_packets` packets, starting at dword offset `start_offset`
 /// of the ring buffer at guest address `ring_base`, whose length is
 /// `ring_size_dwords`.
 ///
 /// `start_offset` is reduced modulo `ring_size_dwords` before the walk and
 /// every advance wraps the same way. The walk ends as soon as one of these
 /// holds:
 /// - `max_packets` packets have been walked,
 /// - a `PM4_XE_SWAP` has been consumed (its payload is reported and we stop
 ///   so the UI sees the frame boundary before further drain),
 /// - a header declares a packet longer than the whole ring; that packet is
 ///   neither consumed nor counted.
 ///
 /// When `ring_size_dwords` or `ring_base` is zero the drainer is a no-op and
 /// returns `DrainResult::default()`.
 pub fn drain<M: MemoryAccess + ?Sized>(
    mem: &M,
    ring_base: u32,
    ring_size_dwords: u32,
    start_offset: u32,
    max_packets: u32,
 ) -> DrainResult {
    if ring_base == 0 || ring_size_dwords == 0 {
        return DrainResult::default();
    }
    let mut offset = start_offset % ring_size_dwords;
    let mut result = DrainResult {
        new_offset: offset,
        ..DrainResult::default()
    };
    for _ in 0..max_packets {
        let header = mem.read_u32(ring_base.wrapping_add(offset.wrapping_mul(4)));
        let packet = pm4::decode(header);
        // Refuse to walk past the ring in a single packet.
        if packet.total_dwords > ring_size_dwords {
            break;
        }
        let is_swap = matches!(
            packet.kind,
            PacketKind::Type3 { opcode, .. } if opcode == pm4::PM4_XE_SWAP
        );
        if is_swap {
            // Payload layout (from canary VdSwap_entry), relative to the
            // packet header:
            //   [0] XE_SWAP header
            //   [1] kSwapSignature ("XNEX" = 0x584E4558)
            //   [2] frontbuffer physical address
            //   [3] width
            //   [4] height
            // Each read wraps independently so a packet straddling the end
            // of the ring is still decoded.
            let dword_at = |rel: u32| {
                let wrapped = (offset + rel) % ring_size_dwords;
                mem.read_u32(ring_base.wrapping_add(wrapped.wrapping_mul(4)))
            };
            result.swap_seen = true;
            result.swap_frontbuffer_phys = dword_at(2);
            result.swap_width = dword_at(3);
            result.swap_height = dword_at(4);
        }
        // Consume the packet: step past it and account for it.
        offset = (offset + packet.total_dwords) % ring_size_dwords;
        result.new_offset = offset;
        result.packets_walked += 1;
        // A swap marks a frame boundary; report it before draining further.
        if is_swap {
            break;
        }
    }
    result
 }
 #[cfg(test)]
 mod tests {
    use super::*;
    use xenia_memory::GuestMemory;
    use xenia_memory::page_table::MemoryProtect;
    /// Guest address of the scratch page every test uses as its ring.
    const RING_BASE: u32 = 0x4000_0000;
    /// Header value these tests use as a one-dword Type-2 NOP.
    const NOP: u32 = 0x8000_0000;
    /// Allocates one read/write page (0x1000 bytes) at [`RING_BASE`].
    fn build_mem() -> GuestMemory {
        let mut mem = GuestMemory::new().unwrap();
        let rw = MemoryProtect::READ | MemoryProtect::WRITE;
        mem.alloc(RING_BASE, 0x1000, rw).unwrap();
        mem
    }
    /// Stores `val` at guest address `addr`. Writing only needs a shared
    /// borrow of the memory, so callers keep their binding immutable.
    fn write_dword(mem: &GuestMemory, addr: u32, val: u32) {
        mem.write_u32(addr, val);
    }
    #[test]
    fn walks_nops_until_budget_exhausted() {
        let mem = build_mem();
        // Fill 10 dwords with Type-2 NOPs.
        for i in 0..10 {
            write_dword(&mem, RING_BASE + i * 4, NOP);
        }
        // A budget of 5 packets must stop the walk half-way through the fill.
        let r = drain(&mem, RING_BASE, 0x400, 0, 5);
        assert_eq!(r.packets_walked, 5);
        assert_eq!(r.new_offset, 5);
        assert!(!r.swap_seen);
    }
    #[test]
    fn stops_at_swap_and_reports_payload() {
        let mem = build_mem();
        // MakePacketType3(PM4_XE_SWAP, 4) → (3<<30) | (3<<16) | (0x64<<8)
        let swap_hdr = (3u32 << 30) | (3u32 << 16) | ((pm4::PM4_XE_SWAP as u32) << 8);
        // Two NOPs, then a PM4_XE_SWAP packet with its four payload dwords.
        let ring = [
            NOP,
            NOP,
            swap_hdr,
            pm4::SWAP_SIGNATURE,
            0xDEAD_F000, // frontbuffer phys
            1280,
            720,
        ];
        for (slot, dword) in (0u32..).zip(ring) {
            write_dword(&mem, RING_BASE + slot * 4, dword);
        }
        let r = drain(&mem, RING_BASE, 0x400, 0, 16);
        assert!(r.swap_seen);
        assert_eq!(r.swap_frontbuffer_phys, 0xDEAD_F000);
        assert_eq!(r.swap_width, 1280);
        assert_eq!(r.swap_height, 720);
        assert_eq!(r.packets_walked, 3);
        assert_eq!(r.new_offset, 7); // 2 NOPs (1 dword each) + 5-dword swap = 7
    }
    #[test]
    fn wraps_around_ring() {
        let mem = build_mem();
        // Ring size = 4 dwords. Start at offset 3 (last dword). Write a NOP
        // there, then the walker should wrap to offset 0.
        write_dword(&mem, RING_BASE + 3 * 4, NOP);
        write_dword(&mem, RING_BASE, NOP);
        let r = drain(&mem, RING_BASE, 4, 3, 2);
        assert_eq!(r.packets_walked, 2);
        assert_eq!(r.new_offset, 1);
    }
    #[test]
    fn zero_ring_size_is_noop() {
        let mem = build_mem();
        let r = drain(&mem, RING_BASE, 0, 0, 10);
        assert_eq!(r.packets_walked, 0);
        assert_eq!(r.new_offset, 0);
        assert!(!r.swap_seen);
    }
 }
--- a/crates/xenia-gpu/src/ring_view.rs
+++ b/crates/xenia-gpu/src/ring_view.rs
@@ -0,0 +1,123 @@
 //! Primary ring buffer view.
 //!
 //! Games allocate a ring buffer in physical memory (via
 //! `MmAllocatePhysicalMemoryEx` with WRITE_COMBINE), then hand the base
 //! address + log2(size) to `VdInitializeRingBuffer`. They subsequently push
 //! PM4 packets into it, advancing the write-pointer by writing to a GPU
 //! register (`CP_RB_WPTR`) or via kernel-call shims.
 //!
 //! The GPU consumes packets from `read_offset_dwords` up to (but not past)
 //! the write pointer. After consuming enough bytes it writes `read_offset`
 //! into the guest-memory address registered by `VdEnableRingBufferRPtrWriteBack`
 //! so the game can know how much of the ring has been consumed.
 /// Tracks the primary ring buffer as set up by the guest.
 ///
 /// Every offset is measured in dwords relative to `base`. The fields are
 /// public and can be written directly, so the methods below tolerate
 /// out-of-range values instead of overflowing.
 #[derive(Debug, Clone, Copy, Default)]
 pub struct RingBufferView {
    /// Guest physical/virtual base address. `0` means uninitialized.
    pub base: u32,
    /// Size of the ring in dwords. `0` means uninitialized.
    pub size_dwords: u32,
    /// Dword offset the GPU has consumed up to (relative to `base`).
    pub read_offset_dwords: u32,
    /// Dword offset the guest has last written into (relative to `base`).
    /// Updated either by an MMIO write to `CP_RB_WPTR` or by the kernel
    /// (`VdSwap` is a hint — the game reserves a 64-dword slot in the ring
    /// for it).
    pub write_offset_dwords: u32,
    /// Guest address where we mirror `read_offset_dwords` each time we make
    /// progress. `0` if the game never called `VdEnableRingBufferRPtrWriteBack`.
    pub rptr_writeback_addr: u32,
    /// Write-back block granularity in dwords (from the `log2` arg to
    /// `VdEnableRingBufferRPtrWriteBack`). We always write back eagerly, so
    /// we don't actually use this for scheduling — kept for observability.
    pub rptr_writeback_block_dwords: u32,
 }
 impl RingBufferView {
    /// Creates an uninitialized view: every field is zero.
    pub fn new() -> Self {
        Self::default()
    }
    /// True if the guest has provided a base + size.
    pub fn is_initialized(&self) -> bool {
        self.base != 0 && self.size_dwords != 0
    }
    /// True if there is pending unread data to consume.
    pub fn has_pending(&self) -> bool {
        self.is_initialized() && self.read_offset_dwords != self.write_offset_dwords
    }
    /// Number of dwords we can consume in one contiguous chunk: up to the
    /// write pointer, or up to the end of the ring when the write pointer has
    /// already wrapped behind the read pointer.
    ///
    /// Returns `0` for an uninitialized view, and also when the read offset
    /// lies at or beyond the end of the ring (an inconsistent state that
    /// would otherwise underflow).
    pub fn pending_dwords(&self) -> u32 {
        if !self.is_initialized() {
            return 0;
        }
        let chunk_end = if self.write_offset_dwords >= self.read_offset_dwords {
            self.write_offset_dwords
        } else {
            // write has wrapped — we can read up to the end of the ring.
            self.size_dwords
        };
        chunk_end.saturating_sub(self.read_offset_dwords)
    }
    /// Advance the read pointer by `dwords`, wrapping at `size_dwords`.
    ///
    /// Does nothing while `size_dwords` is zero. Any `dwords` value is
    /// accepted; the sum is formed in 64 bits so it cannot overflow.
    pub fn advance_read(&mut self, dwords: u32) {
        if let Some(next) = self.offset_from_read(dwords) {
            self.read_offset_dwords = next;
        }
    }
    /// Guest address for the dword at relative offset `offset_dwords` from
    /// the current read pointer, wrapping at the end of the ring. `None` if
    /// uninitialized.
    pub fn addr_at_offset(&self, offset_dwords: u32) -> Option<u32> {
        if !self.is_initialized() {
            return None;
        }
        let off = self.offset_from_read(offset_dwords)?;
        Some(self.base.wrapping_add(off.wrapping_mul(4)))
    }
    /// Ring-relative offset `delta_dwords` past the read pointer, reduced
    /// modulo `size_dwords`. `None` when `size_dwords` is zero.
    fn offset_from_read(&self, delta_dwords: u32) -> Option<u32> {
        // Two u32 values always fit in a u64, so this addition cannot wrap.
        let target = u64::from(self.read_offset_dwords) + u64::from(delta_dwords);
        // `checked_rem` yields `None` for a zero-sized ring. The remainder is
        // strictly below `size_dwords`, so narrowing back to u32 is lossless.
        target
            .checked_rem(u64::from(self.size_dwords))
            .map(|wrapped| wrapped as u32)
    }
 }
 #[cfg(test)]
 mod tests {
    use super::*;
    #[test]
    fn uninitialized_view_reports_empty() {
        // A freshly constructed view has no base and no size, so it must
        // report nothing to consume.
        let view = RingBufferView::new();
        assert!(!view.is_initialized());
        assert!(!view.has_pending());
        assert_eq!(view.pending_dwords(), 0);
    }
    #[test]
    fn wrap_around_arithmetic() {
        // 16-dword ring with the reader at 14 and the writer already wrapped
        // round to 2.
        let mut view = RingBufferView {
            base: 0x4000_0000,
            size_dwords: 16,
            read_offset_dwords: 14,
            write_offset_dwords: 2,
            ..RingBufferView::default()
        };
        // Only the tail of the ring (dwords 14 and 15) is readable in one
        // chunk.
        assert_eq!(view.pending_dwords(), 2);
        view.advance_read(2);
        assert_eq!(view.read_offset_dwords, 0);
        // The reader has wrapped too; the 2 dwords up to the writer remain.
        assert_eq!(view.pending_dwords(), 2);
    }
    #[test]
    fn addr_at_offset_wraps() {
        // 4-dword ring with the reader parked on the last dword.
        let view = RingBufferView {
            base: 0x4000_0000,
            size_dwords: 4,
            read_offset_dwords: 3,
            ..RingBufferView::default()
        };
        // Offset 0 is the last dword; offsets 1 and 2 wrap to the ring start.
        let expected = [0x4000_000Cu32, 0x4000_0000, 0x4000_0004];
        for (offset, addr) in (0u32..).zip(expected) {
            assert_eq!(view.addr_at_offset(offset), Some(addr));
        }
    }
 }
--- a/crates/xenia-gpu/src/shader_metrics.rs
+++ b/crates/xenia-gpu/src/shader_metrics.rs
@@ -0,0 +1,350 @@
 //! Host-side static analysis over a [`ParsedShader`], emitted once per unique
 //! shader blob. Produces the observability the plan's P3b/P3c sections call
 //! for (`gpu.shader.interpret{stage,kind}` + `gpu.shader.reject{reason}`), so
 //! the HUD can show when a game is reaching ops the WGSL interpreter falls
 //! back on.
 //!
 //! Analysis is intentionally cheap: it scans each exec clause's instruction
 //! triples, classifies them as ALU / vertex-fetch / texture-fetch using the
 //! owning clause's sequence bitmap, and bumps counters accordingly. No GPU
 //! readback is required — `reject` reasons are inferred from opcode values
 //! alone.
 use metrics::counter;
 use crate::ucode::alu::{decode_alu, sop, vop};
 use crate::ucode::control_flow::ControlFlowInstruction;
 use crate::ucode::fetch::{FetchInstruction, decode_fetch};
 use crate::ucode::ParsedShader;
 /// Walk `parsed` once and emit `gpu.shader.interpret` + `gpu.shader.reject`
 /// counters. `stage` should be `"vs"` or `"ps"`.
 pub fn emit_for(parsed: &ParsedShader, stage: &'static str) {
    let mut alu_count: u64 = 0;
    let mut vfetch_count: u64 = 0;
    let mut tfetch_count: u64 = 0;
    let mut rejects: Vec<(&'static str, u64)> = Vec::new();
    let mut features: Vec<&'static str> = Vec::new();
    for clause in &parsed.cf {
        match clause {
            ControlFlowInstruction::Exec {
                address,
                count,
                sequence,
                ..
            } => {
                for i in 0..(*count as usize) {
                    let triple_idx = *address as usize + i;
                    let base = triple_idx * 3;
                    if base + 2 >= parsed.instructions.len() {
                        break;
                    }
                    let words = [
                        parsed.instructions[base],
                        parsed.instructions[base + 1],
                        parsed.instructions[base + 2],
                    ];
                    // sequence bit layout: 2 bits per triple, hi bit = is-fetch.
                    let is_fetch = ((sequence >> (i * 2 + 1)) & 1) != 0;
                    if is_fetch {
                        match decode_fetch(words) {
                            FetchInstruction::Vertex(_) => vfetch_count += 1,
                            FetchInstruction::Texture(tf) => {
                                tfetch_count += 1;
                                match tf.dimension {
                                    0 => mark_feature(&mut features, "tfetch_1d"),
                                    2 => mark_feature(&mut features, "tfetch_3d"),
                                    3 => mark_feature(&mut features, "tfetch_cube"),
                                    _ => {}
                                }
                                if tf.dimension != 1 {
                                    bump(&mut rejects, "texfetch_dimension");
                                }
                            }
                            FetchInstruction::Unknown { .. } => {
                                bump(&mut rejects, "fetch_unknown");
                            }
                        }
                    } else {
                        alu_count += 1;
                        let alu = decode_alu(words);
                        if !vec_op_supported(alu.vector_opcode) {
                            bump(&mut rejects, "alu_vec_unsupported");
                        }
                        if !scl_op_supported(alu.scalar_opcode) {
                            bump(&mut rejects, "alu_scl_unsupported");
                        }
                        // Feature-of-interest detection for future phases.
                        // Transcendentals + kill + setp + cube/max4 are the
                        // high-value signals: they tell us which of the
                        // deferred capabilities Sylpheed actually exercises.
                        match alu.vector_opcode {
                            v if v == vop::CUBE => mark_feature(&mut features, "vec_cube"),
                            v if v == vop::MAX4 => mark_feature(&mut features, "vec_max4"),
                            v if v == vop::KILL_EQ
                                || v == vop::KILL_GT
                                || v == vop::KILL_GE
                                || v == vop::KILL_NE =>
                            {
                                mark_feature(&mut features, "vec_kill");
                            }
                            v if v == vop::CND_EQ
                                || v == vop::CND_GE
                                || v == vop::CND_GT =>
                            {
                                mark_feature(&mut features, "vec_cnd");
                            }
                            _ => {}
                        }
                        match alu.scalar_opcode {
                            s if s == sop::EXP
                                || s == sop::LOG
                                || s == sop::LOGC
                                || s == sop::SIN
                                || s == sop::COS =>
                            {
                                mark_feature(&mut features, "scl_transcendental");
                            }
                            s if s == sop::RSQ
                                || s == sop::RSQC
                                || s == sop::RSQF
                                || s == sop::SQRT =>
                            {
                                mark_feature(&mut features, "scl_sqrt_family");
                            }
                            s if s == sop::SETP_EQ
                                || s == sop::SETP_NE
                                || s == sop::SETP_GT
                                || s == sop::SETP_GE
                                || s == sop::SETP_INV
                                || s == sop::SETP_POP
                                || s == sop::SETP_CLR
                                || s == sop::SETP_RSTR =>
                            {
                                mark_feature(&mut features, "scl_setp");
                            }
                            s if s == sop::KILLS_EQ
                                || s == sop::KILLS_GT
                                || s == sop::KILLS_GE
                                || s == sop::KILLS_NE
                                || s == sop::KILLS_ONE =>
                            {
                                mark_feature(&mut features, "scl_kills");
                            }
                            _ => {}
                        }
                        if alu.predicated {
                            mark_feature(&mut features, "alu_predicated");
                        }
                    }
                }
            }
            ControlFlowInstruction::LoopStart { .. }
            | ControlFlowInstruction::LoopEnd { .. } => {
                mark_feature(&mut features, "cf_loop");
                bump(&mut rejects, "cf_loop");
            }
            ControlFlowInstruction::CondJmp { .. } => {
                mark_feature(&mut features, "cf_cond_jmp");
                bump(&mut rejects, "cf_cond_jmp");
            }
            ControlFlowInstruction::CondCall { .. } | ControlFlowInstruction::Return => {
                mark_feature(&mut features, "cf_call_return");
                bump(&mut rejects, "cf_call_return");
            }
            ControlFlowInstruction::Unknown { .. } => {
                bump(&mut rejects, "cf_unknown");
            }
            _ => {}
        }
    }
    counter!("gpu.shader.interpret", "stage" => stage, "kind" => "alu")
        .increment(alu_count);
    counter!("gpu.shader.interpret", "stage" => stage, "kind" => "vfetch")
        .increment(vfetch_count);
    counter!("gpu.shader.interpret", "stage" => stage, "kind" => "tfetch")
        .increment(tfetch_count);
    for (reason, n) in rejects {
        counter!("gpu.shader.reject", "stage" => stage, "reason" => reason).increment(n);
    }
    for name in features {
        counter!("gpu.feature.used", "stage" => stage, "name" => name).increment(1);
    }
 }
 /// Records `name` in `buf` unless it is already present, so each feature
 /// appears at most once and in first-seen order.
 fn mark_feature(buf: &mut Vec<&'static str>, name: &'static str) {
    let not_yet_recorded = buf.iter().all(|&seen| seen != name);
    if not_yet_recorded {
        buf.push(name);
    }
 }
 /// Adds one to the tally kept for `reason` in `buf`, appending a new entry
 /// with a count of 1 the first time a reason is seen.
 fn bump(buf: &mut Vec<(&'static str, u64)>, reason: &'static str) {
    match buf.iter_mut().find(|entry| entry.0 == reason) {
        Some(entry) => entry.1 += 1,
        None => buf.push((reason, 1)),
    }
 }
 /// True when vector opcode `op` is in the static allow-list below. Anything
 /// outside the list is counted by `emit_for` as an `alu_vec_unsupported`
 /// reject.
 fn vec_op_supported(op: u8) -> bool {
    const SUPPORTED: &[u8] = &[
        vop::ADD,
        vop::MUL,
        vop::MAX,
        vop::MIN,
        vop::SEQ,
        vop::SGT,
        vop::SGE,
        vop::SNE,
        vop::FRC,
        vop::TRUNC,
        vop::FLOOR,
        vop::MAD,
        vop::CND_EQ,
        vop::CND_GE,
        vop::CND_GT,
        vop::DOT4,
        vop::DOT3,
        vop::DOT2_ADD,
        vop::MAX4,
        vop::KILL_EQ,
        vop::KILL_GT,
        vop::KILL_GE,
        vop::KILL_NE,
        vop::DST,
    ];
    SUPPORTED.contains(&op)
 }
 /// True when scalar opcode `op` is in the static allow-list below. Anything
 /// outside the list is counted by `emit_for` as an `alu_scl_unsupported`
 /// reject.
 fn scl_op_supported(op: u8) -> bool {
    const SUPPORTED: &[u8] = &[
        sop::ADDS,
        sop::ADDS_PREV,
        sop::MULS,
        sop::MULS_PREV,
        sop::MAXS,
        sop::MINS,
        sop::SEQS,
        sop::SGTS,
        sop::SGES,
        sop::SNES,
        sop::FRCS,
        sop::TRUNCS,
        sop::FLOORS,
        sop::EXP,
        sop::LOG,
        sop::LOGC,
        sop::RCP,
        sop::RCPC,
        sop::RCPF,
        sop::RSQ,
        sop::RSQC,
        sop::RSQF,
        sop::SQRT,
        sop::SUBS,
        sop::SUBS_PREV,
        sop::SETP_EQ,
        sop::SETP_NE,
        sop::SETP_GT,
        sop::SETP_GE,
        sop::SETP_INV,
        sop::SETP_POP,
        sop::SETP_CLR,
        sop::SETP_RSTR,
        sop::KILLS_EQ,
        sop::KILLS_GT,
        sop::KILLS_GE,
        sop::KILLS_NE,
        sop::KILLS_ONE,
        sop::SIN,
        sop::COS,
        sop::RETAIN_PREV,
    ];
    SUPPORTED.contains(&op)
 }
 #[cfg(test)]
 mod tests {
    use super::*;
    use crate::ucode::alu::{sop, vop};
    use crate::ucode::control_flow::ControlFlowInstruction;
    /// Packs the third dword of an ALU triple as these tests need it: vector
    /// opcode in the low bits, scalar opcode shifted left by 6, and bits
    /// 12..16 all set (presumably the write mask — confirm against
    /// `decode_alu`).
    fn alu_word2(vector_op: u8, scalar_op: u8) -> u32 {
        u32::from(vector_op) | (u32::from(scalar_op) << 6) | (0xF << 12)
    }
    /// Smoke test: a well-formed shader with one two-triple, all-ALU `Exec`
    /// clause followed by `Exit` must be analysed without panicking.
    #[test]
    fn emit_for_runs_on_synthetic_shader() {
        let add_adds = alu_word2(vop::ADD, sop::ADDS);
        let cf = vec![
            ControlFlowInstruction::Exec {
                address: 0,
                count: 2,
                sequence: 0, // all ALU (no is-fetch bits)
                is_end: false,
                predicated: false,
                predicate_condition: false,
            },
            ControlFlowInstruction::Exit,
        ];
        let instructions = vec![0, 0, add_adds, 0, 0, add_adds];
        // Counters are validated via metrics exporters elsewhere; the only
        // assertion here is that the call returns.
        emit_for(&ParsedShader { cf, instructions }, "vs");
    }
    /// P8: a shader with a `LoopStart` clause plus a vector kill op walks the
    /// `cf_loop` and `vec_kill` paths; this only checks that doing so emits
    /// cleanly.
    #[test]
    fn feature_detection_flags_loops_and_kills() {
        let kill_retain = alu_word2(vop::KILL_EQ, sop::RETAIN_PREV);
        let cf = vec![
            ControlFlowInstruction::LoopStart {
                address: 0,
                loop_id: 0,
            },
            ControlFlowInstruction::Exec {
                address: 0,
                count: 1,
                sequence: 0,
                is_end: true,
                predicated: false,
                predicate_condition: false,
            },
        ];
        let instructions = vec![0, 0, kill_retain];
        emit_for(&ParsedShader { cf, instructions }, "ps");
    }
    /// Opcode 63 is outside the supported set of both pipes: the static
    /// tables must say so, and analysing a shader that uses it must not panic.
    #[test]
    fn unsupported_ops_classified_as_rejects() {
        assert!(!vec_op_supported(63));
        assert!(!scl_op_supported(63));
        let cf = vec![ControlFlowInstruction::Exec {
            address: 0,
            count: 1,
            sequence: 0,
            is_end: true,
            predicated: false,
            predicate_condition: false,
        }];
        let instructions = vec![0, 0, alu_word2(63, 63)];
        emit_for(&ParsedShader { cf, instructions }, "ps");
    }
 }
--- a/crates/xenia-gpu/src/shaders/mod.rs
+++ b/crates/xenia-gpu/src/shaders/mod.rs
@@ -0,0 +1,36 @@
 //! Embedded WGSL shader sources used by the host pipeline.
 /// Source text of the Xenos uber-shader scaffold (P3), embedded at compile
 /// time from `xenos_interp.wgsl`. See the header comment at the top of that
 /// file for scope/limits and the plan's P3b target state.
 pub const XENOS_INTERP_WGSL: &str = include_str!("xenos_interp.wgsl");
 #[cfg(test)]
 mod tests {
    use super::*;
    /// Runs the embedded shader through naga's WGSL front end, which is
    /// enough to catch syntax errors and typos at `cargo test` time without
    /// building a pipeline.
    ///
    /// NOTE(review): only `parse_str` is called here; naga's separate
    /// validator pass is not run, so layout rules it enforces are presumably
    /// not covered by this test — confirm.
    #[test]
    fn xenos_interp_wgsl_parses() {
        let module = naga::front::wgsl::parse_str(XENOS_INTERP_WGSL)
            .expect("xenos_interp.wgsl must parse cleanly");
        let has_entry = |name: &str, stage: naga::ShaderStage| {
            module
                .entry_points
                .iter()
                .any(|entry| entry.name == name && entry.stage == stage)
        };
        // Sanity: at least one entry point exists, and both expected ones are
        // present with the right stage.
        assert!(!module.entry_points.is_empty());
        assert!(
            has_entry("vs_main", naga::ShaderStage::Vertex),
            "missing vs_main entry"
        );
        assert!(
            has_entry("fs_main", naga::ShaderStage::Fragment),
            "missing fs_main entry"
        );
    }
 }
--- a/crates/xenia-gpu/src/shaders/xenos_interp.wgsl
+++ b/crates/xenia-gpu/src/shaders/xenos_interp.wgsl
@@ -0,0 +1,974 @@
 // xenia-rs Xenos runtime microcode interpreter — P3b WGSL.
 //
 // Bindings (stable across P3b milestones):
 //   @group(0) @binding(0) draw_ctx      (uniform, 16 B — XenosDrawConstants)
 //   @group(0) @binding(1) xenos_consts  (storage<read>, ~9.2 KB — XenosConstants)
 //   @group(0) @binding(2) vs_ucode      (storage<read>, packed VS shader)
 //   @group(0) @binding(3) ps_ucode      (storage<read>, packed PS shader)
 //   @group(0) @binding(4) vertex_buffer (storage<read>, raw guest VB dwords)
 //
 // Packed shader layout (both vs_ucode & ps_ucode):
 //   [0]                            = cf_count
 //   [1 .. 1 + cf_count*3]          = CF table: (kind, primary, aux) × cf_count
 //   [1 + cf_count*3 ..]            = instruction triples (3 dwords each)
 //
 // M3 state (this file): CF walker + operand decode helpers + register file
 // scaffold are complete. ALU / fetch bodies are still stubs that fall back
 // to the procedural-circle visualisation; M4-M7 fill them in.
 // Per-draw block bound at @group(0) @binding(0) as a uniform. Four u32
 // fields = 16 bytes.
 struct XenosDrawConstants {
    draw_index: u32,
    vertex_count: u32,
    prim_kind: u32,
    // Explicit padding that rounds the struct up to 16 bytes.
    _pad: u32,
 };
 // Constant block bound at @group(0) @binding(1) as read-only storage.
 // Size with natural 4-byte u32 stride: 512 * 16 + 256 * 4 + 8 * 4 + 32 * 4
 // = 9376 bytes.
 struct XenosConstants {
    // 512 vec4<f32> slots (presumably the ALU float constants — confirm
    // against the host-side packer).
    alu:          array<vec4<f32>, 512>,
    // 256 raw dwords (presumably the fetch constants — confirm).
    fetch:        array<u32, 256>,
    // 8 dwords (presumably 256 packed boolean constants — confirm).
    bool_consts:  array<u32, 8>,
    // 32 dwords (presumably one per loop constant — confirm).
    loop_consts:  array<u32, 32>,
 };
@group(0) @binding(0) var<uniform>            draw_ctx      : XenosDrawConstants;
 // `xenos_consts` is a read-only storage buffer (not uniform) because the
 // block contains tightly-packed `array<u32, N>` fields — WGSL's uniform
 // address space requires 16-byte element stride, which would triple the
 // allocation; storage accepts the natural 4-byte stride.
@group(0) @binding(1) var<storage, read>      xenos_consts  : XenosConstants;
@group(0) @binding(2) var<storage, read>      vs_ucode      : array<u32>;
@group(0) @binding(3) var<storage, read>      ps_ucode      : array<u32>;
@group(0) @binding(4) var<storage, read>      vertex_buffer : array<u32>;
 // M6 — texture fetch stub. Group 1 is a single 1×1 magenta placeholder for
 // all texture slots; the P5 texture cache will replace this with per-slot
 // bindings. interpret_texture_fetch samples this pair for every texture
 // fetch, regardless of the fetch constant the instruction names.
@group(1) @binding(0) var xenos_tex  : texture_2d<f32>;
@group(1) @binding(1) var xenos_samp : sampler;
 // ── CF kind codes; must match `xenia_gpu::ucode::cf_kind`. ─────────────
 // These are the values found in the low 8 bits of the first dword of each
 // packed CF-table entry (see vs_cf_kind / ps_cf_kind).
 const CF_KIND_EXEC:        u32 = 0u;
 const CF_KIND_EXEC_END:    u32 = 1u;
 const CF_KIND_ALLOC:       u32 = 2u;
 const CF_KIND_EXIT:        u32 = 3u;
 const CF_KIND_LOOP_START:  u32 = 4u;
 const CF_KIND_LOOP_END:    u32 = 5u;
 const CF_KIND_COND_JMP:    u32 = 6u;
 const CF_KIND_COND_CALL:   u32 = 7u;
 const CF_KIND_RETURN:      u32 = 8u;
 // Not matched explicitly by the CF walkers; it falls into their `default`
 // arm together with any other unlisted kind value.
 const CF_KIND_UNKNOWN:     u32 = 15u;
 // ── Alloc-kind codes (mirrors `xenia_gpu::ucode::cf_alloc_kind`). ──────
 // The walkers store an Alloc clause's `primary` dword in `current_alloc`;
 // these names are not compared against it anywhere in the visible code.
 const ALLOC_KIND_POSITION:      u32 = 0u;
 const ALLOC_KIND_INTERPOLATORS: u32 = 1u;
 const ALLOC_KIND_COLORS:        u32 = 2u;
 // Per-invocation Xenos register file + scalar `ps` + predicate.
 // All of these are `var<private>`, i.e. one copy per shader invocation;
 // reset_state() re-initialises them at the top of each entry point.
 var<private> registers: array<vec4<f32>, 128>;
 var<private> ps: f32;
 var<private> predicate: bool;
 // Currently-active export alloc kind; set by Alloc clauses. Written by the
 // CF walkers but not read back anywhere in the visible code.
 var<private> current_alloc: u32;
 // P3c additions:
 //   `kill_flag` — set by vector/scalar kill ops; PS calls `discard` after the
 //                 interpreter exits. (`discard` inside a helper function is
 //                 allowed in WGSL, but keeping it at the entry level makes
 //                 control flow easier to read.)
 //   `loop_depth`/`loop_counters` — bookkeeping for LoopStart/LoopEnd CF
 //                 clauses. Xenos supports up to 4 nested loops.
 //   `loop_starts` — CF index of the LoopStart clause for each active loop
 //                 level; LoopEnd jumps back to the clause after it.
 //   `reject_mask` — bitfield of op categories we failed to interpret, so the
 //                 PS fallback color + host-side diagnostics can surface it.
 var<private> kill_flag: bool;
 var<private> loop_depth: u32;
 var<private> loop_counters: array<u32, 4>;
 var<private> loop_starts: array<u32, 4>;
 var<private> reject_mask: u32;
 // Bit values OR-ed into `reject_mask`. REJECT_TEX_NON2D is not set by any
 // code visible here; REJECT_CF_JUMP is set by the CF walkers' `default` arm
 // (unrecognised CF kind).
 const REJECT_ALU_VEC:      u32 = 1u;
 const REJECT_ALU_SCL:      u32 = 2u;
 const REJECT_TEX_NON2D:    u32 = 4u;
 const REJECT_VFETCH_FMT:   u32 = 8u;
 const REJECT_CF_JUMP:      u32 = 16u;
 const REJECT_CF_CALL:      u32 = 32u;
 const REJECT_LOOP_OVERFLOW:u32 = 64u;
 // Vertex-stage output / fragment-stage input: clip-space position plus a
 // single colour interpolator at location 0.
 struct VsOut {
     @builtin(position) position: vec4<f32>,
     @location(0) color: vec4<f32>,
 };
 // Fragment-stage output: one colour written to render target 0.
 struct FsOut {
     @location(0) color0: vec4<f32>,
 };
 // ── CF table accessors. Dword 0 of a packed program is the clause count;
 // clause `i` occupies the three dwords starting at `1 + i * 3`, laid out as
 // (kind, primary, aux). Only the low byte of the kind dword is significant.
 fn vs_cf_count() -> u32 {
     return vs_ucode[0u];
 }
 fn ps_cf_count() -> u32 {
     return ps_ucode[0u];
 }
 fn vs_cf_kind(i: u32) -> u32 {
     let entry = 1u + i * 3u;
     return vs_ucode[entry] & 0xFFu;
 }
 fn vs_cf_primary(i: u32) -> u32 {
     let entry = 1u + i * 3u;
     return vs_ucode[entry + 1u];
 }
 fn vs_cf_aux(i: u32) -> u32 {
     let entry = 1u + i * 3u;
     return vs_ucode[entry + 2u];
 }
 fn ps_cf_kind(i: u32) -> u32 {
     let entry = 1u + i * 3u;
     return ps_ucode[entry] & 0xFFu;
 }
 fn ps_cf_primary(i: u32) -> u32 {
     let entry = 1u + i * 3u;
     return ps_ucode[entry + 1u];
 }
 fn ps_cf_aux(i: u32) -> u32 {
     let entry = 1u + i * 3u;
     return ps_ucode[entry + 2u];
 }
 // First dword of the instruction-triple area: right after the CF table.
 fn vs_instr_base() -> u32 {
     let clauses = vs_ucode[0u];
     return 1u + clauses * 3u;
 }
 fn ps_instr_base() -> u32 {
     let clauses = ps_ucode[0u];
     return 1u + clauses * 3u;
 }
 // Return dword `which` (0, 1 or 2) of the instruction triple at triple
 // index `t`, counted from the start of the instruction area.
 fn vs_instr_dword(t: u32, which: u32) -> u32 {
     let first = vs_instr_base();
     let slot = first + t * 3u + which;
     return vs_ucode[slot];
 }
 fn ps_instr_dword(t: u32, which: u32) -> u32 {
     let first = ps_instr_base();
     let slot = first + t * 3u + which;
     return ps_ucode[slot];
 }
 // ── Operand helpers: M3 scaffold; M4 wires them into interpret_alu.
 // Rearrange the lanes of `v`. `swizzle` packs four 2-bit source-lane
 // selectors, lowest bits first, for output lanes x, y, z, w.
 fn apply_swizzle(v: vec4<f32>, swizzle: u32) -> vec4<f32> {
     let shifts = vec4<u32>(0u, 2u, 4u, 6u);
     let pick = (vec4<u32>(swizzle) >> shifts) & vec4<u32>(3u);
     return vec4<f32>(v[pick.x], v[pick.y], v[pick.z], v[pick.w]);
 }
 // Apply the optional source modifiers to `v`: absolute value first, then
 // negation, so both flags together yield -|v|.
 fn apply_modifiers(v: vec4<f32>, negate: bool, take_abs: bool) -> vec4<f32> {
     let magnitude = select(v, abs(v), take_abs);
     return select(magnitude, -magnitude, negate);
 }
 // Write `value` into register `idx`, touching only the lanes whose bit is
 // set in `mask` (bit 0 = x … bit 3 = w). Indices of 128 or more are ignored.
 fn write_reg_masked(idx: u32, mask: u32, value: vec4<f32>) {
     if idx < 128u {
         let lanes = vec4<bool>(
             (mask & 1u) != 0u,
             (mask & 2u) != 0u,
             (mask & 4u) != 0u,
             (mask & 8u) != 0u,
         );
         let kept = registers[idx];
         registers[idx] = select(kept, value, lanes);
     }
 }
 // ── Xenos ALU opcodes. Values match canary's `AluVectorOpcode` and
 // `AluScalarOpcode` enums in `ucode.h:1001,1354` (also mirrored in
 // `xenia_gpu::ucode::alu::{vop,sop}`).
 // Vector opcodes. Values 20-23 have no constant here, so exec_vector_op
 // sends them to its `default` (rejected) arm.
 const VOP_ADD:          u32 = 0u;
 const VOP_MUL:          u32 = 1u;
 const VOP_MAX:          u32 = 2u;
 const VOP_MIN:          u32 = 3u;
 const VOP_SEQ:          u32 = 4u;
 const VOP_SGT:          u32 = 5u;
 const VOP_SGE:          u32 = 6u;
 const VOP_SNE:          u32 = 7u;
 const VOP_FRC:          u32 = 8u;
 const VOP_TRUNC:        u32 = 9u;
 const VOP_FLOOR:        u32 = 10u;
 const VOP_MAD:          u32 = 11u;
 const VOP_CND_EQ:       u32 = 12u;
 const VOP_CND_GE:       u32 = 13u;
 const VOP_CND_GT:       u32 = 14u;
 const VOP_DP4:          u32 = 15u;
 const VOP_DP3:          u32 = 16u;
 const VOP_DP2_ADD:      u32 = 17u;
 const VOP_CUBE:         u32 = 18u;
 const VOP_MAX4:         u32 = 19u;
 const VOP_KILL_EQ:      u32 = 24u;
 const VOP_KILL_GT:      u32 = 25u;
 const VOP_KILL_GE:      u32 = 26u;
 const VOP_KILL_NE:      u32 = 27u;
 const VOP_DST:          u32 = 28u;
 const VOP_MAX_A:        u32 = 29u;
 // Scalar opcodes. Values 4, 23, 24 and 41-47 have no constant here, so
 // exec_scalar_op sends them to its `default` (rejected) arm.
 const SOP_ADDS:         u32 = 0u;
 const SOP_ADDS_PREV:    u32 = 1u;
 const SOP_MULS:         u32 = 2u;
 const SOP_MULS_PREV:    u32 = 3u;
 const SOP_MAXS:         u32 = 5u;
 const SOP_MINS:         u32 = 6u;
 const SOP_SEQS:         u32 = 7u;
 const SOP_SGTS:         u32 = 8u;
 const SOP_SGES:         u32 = 9u;
 const SOP_SNES:         u32 = 10u;
 const SOP_FRCS:         u32 = 11u;
 const SOP_TRUNCS:       u32 = 12u;
 const SOP_FLOORS:       u32 = 13u;
 const SOP_EXP:          u32 = 14u;
 const SOP_LOGC:         u32 = 15u;
 const SOP_LOG:          u32 = 16u;
 const SOP_RCPC:         u32 = 17u;
 const SOP_RCPF:         u32 = 18u;
 const SOP_RCP:          u32 = 19u;
 const SOP_RSQC:         u32 = 20u;
 const SOP_RSQF:         u32 = 21u;
 const SOP_RSQ:          u32 = 22u;
 const SOP_SUBS:         u32 = 25u;
 const SOP_SUBS_PREV:    u32 = 26u;
 const SOP_SETP_EQ:      u32 = 27u;
 const SOP_SETP_NE:      u32 = 28u;
 const SOP_SETP_GT:      u32 = 29u;
 const SOP_SETP_GE:      u32 = 30u;
 const SOP_SETP_INV:     u32 = 31u;
 const SOP_SETP_POP:     u32 = 32u;
 const SOP_SETP_CLR:     u32 = 33u;
 const SOP_SETP_RSTR:    u32 = 34u;
 const SOP_KILLS_EQ:     u32 = 35u;
 const SOP_KILLS_GT:     u32 = 36u;
 const SOP_KILLS_GE:     u32 = 37u;
 const SOP_KILLS_NE:     u32 = 38u;
 const SOP_KILLS_ONE:    u32 = 39u;
 const SOP_SQRT:         u32 = 40u;
 const SOP_SIN:          u32 = 48u;
 const SOP_COS:          u32 = 49u;
 const SOP_RETAIN_PREV:  u32 = 50u;
 // Fetch a vec4 ALU source. For the MVP the operand byte is treated purely
 // as a register number: only its low 7 bits are used, and the constant
 // selector, swizzle and modifiers are not decoded (M4+ will add them).
 fn read_src(idx: u32) -> vec4<f32> {
     let slot = idx & 0x7Fu;
     return registers[slot];
 }
 // Shared tail of the vector kill ops: latch `kill_flag` when `hit` is set
 // and return the all-ones / all-zeros result vector.
 fn vop_kill_result(hit: bool) -> vec4<f32> {
     if hit {
         kill_flag = true;
     }
     return select(vec4<f32>(0.0), vec4<f32>(1.0), hit);
 }
 // Vector-pipe executor. `a`, `b`, `c` are the three already-fetched source
 // operands; the return value is the unmasked vec4 result.
 // Side effects: the KILL_* ops may set `kill_flag`; CUBE, MAX_A and any
 // unknown opcode OR `REJECT_ALU_VEC` into `reject_mask`.
 fn exec_vector_op(op: u32, a: vec4<f32>, b: vec4<f32>, c: vec4<f32>) -> vec4<f32> {
     let zeros = vec4<f32>(0.0);
     let ones = vec4<f32>(1.0);
     switch op {
         case VOP_ADD:   { return a + b; }
         case VOP_MUL:   { return a * b; }
         case VOP_MAX:   { return max(a, b); }
         case VOP_MIN:   { return min(a, b); }
         // Per-lane set-on-compare: 1.0 where the relation holds, else 0.0.
         case VOP_SEQ:   { return select(zeros, ones, a == b); }
         case VOP_SGT:   { return select(zeros, ones, a > b); }
         case VOP_SGE:   { return select(zeros, ones, a >= b); }
         case VOP_SNE:   { return select(zeros, ones, a != b); }
         case VOP_FRC:   { return fract(a); }
         case VOP_TRUNC: { return trunc(a); }
         case VOP_FLOOR: { return floor(a); }
         case VOP_MAD:   { return a * b + c; }
         // Per-lane conditional move: src0 is tested against zero and picks
         // src1 when the test holds, src2 otherwise.
         case VOP_CND_EQ: { return select(c, b, a == zeros); }
         case VOP_CND_GE: { return select(c, b, a >= zeros); }
         case VOP_CND_GT: { return select(c, b, a > zeros); }
         // Dot products broadcast their scalar result to all four lanes.
         case VOP_DP4: {
             return vec4<f32>(dot(a, b));
         }
         case VOP_DP3: {
             return vec4<f32>(dot(a.xyz, b.xyz));
         }
         case VOP_DP2_ADD: {
             return vec4<f32>(a.x * b.x + a.y * b.y + c.x);
         }
         case VOP_MAX4: {
             return vec4<f32>(max(max(a.x, a.y), max(a.z, a.w)));
         }
         // Kill ops fire when the relation holds in ANY lane.
         case VOP_KILL_EQ: { return vop_kill_result(any(a == b)); }
         case VOP_KILL_GT: { return vop_kill_result(any(a > b)); }
         case VOP_KILL_GE: { return vop_kill_result(any(a >= b)); }
         case VOP_KILL_NE: { return vop_kill_result(any(a != b)); }
         case VOP_DST: {
             // dest = (1, src0.y * src1.y, src0.z, src1.w)
             return vec4<f32>(1.0, a.y * b.y, a.z, b.w);
         }
         case VOP_CUBE, VOP_MAX_A: {
             // Neither op is really implemented: both are flagged in the
             // reject mask and approximated with a plain per-lane max().
             reject_mask |= REJECT_ALU_VEC;
             return max(a, b);
         }
         default: {
             // Unknown opcode: flag it and produce an all-zero result.
             reject_mask |= REJECT_ALU_VEC;
             return zeros;
         }
     }
 }
 // Scalar-pipe executor; returns the new scalar result.
 //   `src_a` / `src_b` — scalar source components. Most ops look only at
 //                       `src_a`; ADDS / MULS / SUBS / MAXS / MINS use both.
 //   `prev`            — current value of the `ps` chain, used by the
 //                       *_PREV ops and RETAIN_PREV.
 // Side effects: SETP_* ops overwrite `predicate`, KILLS_* ops may set
 // `kill_flag`, and unknown opcodes OR `REJECT_ALU_SCL` into `reject_mask`.
 fn exec_scalar_op(op: u32, src_a: f32, src_b: f32, prev: f32) -> f32 {
     switch op {
         case SOP_ADDS:      { return src_a + src_b; }
         case SOP_ADDS_PREV: { return src_a + prev; }
         case SOP_MULS:      { return src_a * src_b; }
         case SOP_MULS_PREV: { return src_a * prev; }
         case SOP_SUBS:      { return src_a - src_b; }
         case SOP_SUBS_PREV: { return src_a - prev; }
         case SOP_MAXS:      { return max(src_a, src_b); }
         case SOP_MINS:      { return min(src_a, src_b); }
         // Set-on-compare against zero: 1.0 when the relation holds.
         case SOP_SEQS: {
             if src_a == 0.0 { return 1.0; }
             return 0.0;
         }
         case SOP_SGTS: {
             if src_a > 0.0 { return 1.0; }
             return 0.0;
         }
         case SOP_SGES: {
             if src_a >= 0.0 { return 1.0; }
             return 0.0;
         }
         case SOP_SNES: {
             if src_a != 0.0 { return 1.0; }
             return 0.0;
         }
         case SOP_FRCS:   { return fract(src_a); }
         case SOP_TRUNCS: { return trunc(src_a); }
         case SOP_FLOORS: { return floor(src_a); }
         case SOP_EXP: {
             // 2^src; exp2(0) is already 1.0, so no special case is needed.
             return exp2(src_a);
         }
         case SOP_LOG, SOP_LOGC: {
             // Both variants are plain log2 here, with src == 1 pinned to
             // exactly 0. LOGC's clamp of -INF is not modelled, and inputs
             // <= 0 are passed straight to log2.
             let lg = log2(src_a);
             return select(lg, 0.0, src_a == 1.0);
         }
         case SOP_RCP, SOP_RCPC, SOP_RCPF: {
             // Reciprocal with a zero guard (0 in → 0 out). The clamped
             // variants' INF/NaN handling is not modelled.
             let inv = 1.0 / src_a;
             return select(0.0, inv, src_a != 0.0);
         }
         case SOP_RSQ, SOP_RSQC, SOP_RSQF: {
             // Reciprocal square root; non-positive input yields 0.
             let rs = inverseSqrt(src_a);
             return select(0.0, rs, src_a > 0.0);
         }
         case SOP_SQRT: {
             // Negative input yields 0.
             let root = sqrt(src_a);
             return select(0.0, root, src_a >= 0.0);
         }
         case SOP_SIN: { return sin(src_a); }
         case SOP_COS: { return cos(src_a); }
         // Predicate writes: set `predicate` from a test of src_a against
         // zero and return 0.0 when the test passed, 1.0 otherwise.
         case SOP_SETP_EQ: {
             let pass = src_a == 0.0;
             predicate = pass;
             return select(1.0, 0.0, pass);
         }
         case SOP_SETP_NE: {
             let pass = src_a != 0.0;
             predicate = pass;
             return select(1.0, 0.0, pass);
         }
         case SOP_SETP_GT: {
             let pass = src_a > 0.0;
             predicate = pass;
             return select(1.0, 0.0, pass);
         }
         case SOP_SETP_GE: {
             let pass = src_a >= 0.0;
             predicate = pass;
             return select(1.0, 0.0, pass);
         }
         case SOP_SETP_INV: {
             // Predicate is true only for src == 1; otherwise the result is
             // src itself, with 0 mapped to 1.
             let is_one = src_a == 1.0;
             predicate = is_one;
             if is_one {
                 return 0.0;
             }
             return select(src_a, 1.0, src_a == 0.0);
         }
         case SOP_SETP_POP: {
             // Decrement; the predicate becomes true once the decremented
             // value is no longer positive.
             let popped = src_a - 1.0;
             let done = popped <= 0.0;
             predicate = done;
             return select(popped, 0.0, done);
         }
         case SOP_SETP_CLR: {
             predicate = false;
             // Bit pattern 0x7F7FFFFF is FLT_MAX.
             return bitcast<f32>(0x7F7FFFFFu);
         }
         case SOP_SETP_RSTR: {
             let is_zero = src_a == 0.0;
             predicate = is_zero;
             return select(src_a, 0.0, is_zero);
         }
         // Pixel kills: latch `kill_flag` when the test holds and return
         // 1.0 / 0.0 accordingly. The actual `discard` happens in fs_main.
         case SOP_KILLS_EQ: {
             let hit = src_a == 0.0;
             if hit { kill_flag = true; }
             return select(0.0, 1.0, hit);
         }
         case SOP_KILLS_GT: {
             let hit = src_a > 0.0;
             if hit { kill_flag = true; }
             return select(0.0, 1.0, hit);
         }
         case SOP_KILLS_GE: {
             let hit = src_a >= 0.0;
             if hit { kill_flag = true; }
             return select(0.0, 1.0, hit);
         }
         case SOP_KILLS_NE: {
             let hit = src_a != 0.0;
             if hit { kill_flag = true; }
             return select(0.0, 1.0, hit);
         }
         case SOP_KILLS_ONE: {
             let hit = src_a == 1.0;
             if hit { kill_flag = true; }
             return select(0.0, 1.0, hit);
         }
         case SOP_RETAIN_PREV: { return prev; }
         default: {
             // Unknown opcode: flag it and return 0.
             reject_mask |= REJECT_ALU_SCL;
             return 0.0;
         }
     }
 }
 // Execute the ALU triple at triple index `t` of the VS (`is_vertex`) or PS
 // program: one vector op and one scalar op, each with its own destination
 // register and write mask. Updates `registers` and `ps`, and through the
 // executors possibly `predicate`, `kill_flag` and `reject_mask`.
 fn interpret_alu(t: u32, is_vertex: bool) {
     // Load the three dwords of the triple from the right program.
     var w: vec3<u32>;
     if is_vertex {
         w = vec3<u32>(vs_instr_dword(t, 0u), vs_instr_dword(t, 1u), vs_instr_dword(t, 2u));
     } else {
         w = vec3<u32>(ps_instr_dword(t, 0u), ps_instr_dword(t, 1u), ps_instr_dword(t, 2u));
     }
     // Dword 1 carries per-operand swizzle + negate/abs/c-vs-r flags. The MVP
     // treats every source as a plain r# with no modifiers, so it is unused.
     _ = w.y;
     // Per-instruction predication: bit 27 enables it, bit 28 is the value
     // `predicate` must have for the instruction to run.
     let predicated          = ((w.x >> 27u) & 1u) != 0u;
     let predicate_condition = ((w.x >> 28u) & 1u) != 0u;
     if predicated && (predicate != predicate_condition) {
         return;
     }
     // Field extraction matches `xenia_gpu::ucode::alu::decode_alu`.
     // NOTE(review): the scalar write mask (dword 2 bits 11:8) overlaps the
     // scalar opcode (bits 11:6) — confirm against the host-side packer.
     let scalar_src_is_ps = ((w.x >> 26u) & 1u) != 0u;
     let src_a_idx = w.x & 0xFFu;
     let src_b_idx = (w.x >> 8u) & 0xFFu;
     let src_c_idx = (w.x >> 16u) & 0xFFu;
     let vec_op  = w.z & 0x3Fu;
     let scl_op  = (w.z >> 6u) & 0x3Fu;
     let scl_wm  = (w.z >> 8u) & 0xFu;
     let vec_wm  = (w.z >> 12u) & 0xFu;
     let vec_dst = (w.z >> 16u) & 0x7Fu;
     let scl_dst = (w.z >> 24u) & 0x7Fu;
     // All sources are fetched before either pipe writes anything back.
     let a = read_src(src_a_idx);
     let b = read_src(src_b_idx);
     let c = read_src(src_c_idx);
     // Vector pipe: always executed (kill ops have side effects); the
     // result is stored only when the write mask is non-empty.
     let vec_result = exec_vector_op(vec_op, a, b, c);
     if vec_wm != 0u {
         write_reg_masked(vec_dst, vec_wm, vec_result);
     }
     // Scalar pipe: the primary operand is src_a.x, or the running `ps`
     // when `scalar_src_is_ps` is set; the secondary operand is src_b.x.
     var scl_in = a.x;
     if scalar_src_is_ps {
         scl_in = ps;
     }
     ps = exec_scalar_op(scl_op, scl_in, b.x, ps);
     if scl_wm != 0u {
         write_reg_masked(scl_dst, scl_wm, vec4<f32>(ps));
     }
 }
 // Xenos VertexFormat values from `xenos.h:641`.
 // interpret_vertex_fetch compares the 6-bit format field of a vertex-fetch
 // instruction against these. Only some of them have a decode path there;
 // the rest (VFMT_10_11_11, VFMT_11_11_10, VFMT_32, VFMT_32_32,
 // VFMT_32_32_32_32) fall into its rejecting `default` arm.
 const VFMT_8_8_8_8:         u32 = 6u;
 const VFMT_2_10_10_10:      u32 = 7u;
 const VFMT_10_11_11:        u32 = 16u;
 const VFMT_11_11_10:        u32 = 17u;
 const VFMT_16_16:           u32 = 25u;
 const VFMT_16_16_16_16:     u32 = 26u;
 const VFMT_16_16_FLOAT:     u32 = 31u;
 const VFMT_16_16_16_16_FLOAT:u32 = 32u;
 const VFMT_32:              u32 = 33u;
 const VFMT_32_32:           u32 = 34u;
 const VFMT_32_32_32_32:     u32 = 35u;
 const VFMT_32_FLOAT:        u32 = 36u;
 const VFMT_32_32_FLOAT:     u32 = 37u;
 const VFMT_32_32_32_32_FLOAT:u32 = 38u;
 const VFMT_32_32_32_FLOAT:  u32 = 57u;
 // True when the `dwords`-long span starting at `addr` lies entirely inside
 // a buffer of `n` dwords. Written without `addr + dwords` so that addresses
 // near 2^32 cannot wrap around and pass the check.
 fn vb_span_in_bounds(addr: u32, dwords: u32, n: u32) -> bool {
     return addr < n && (n - addr) >= dwords;
 }
 // Execute the vertex-fetch triple at triple index `t` of the VS program and
 // store the fetched vec4 in the destination register.
 //
 // Fields as decoded below:
 //   w0 [9:5]    fetch-constant index
 //   w0 [16:10]  dst_reg
 //   w0 [23:17]  src_reg — register whose .x holds the vertex index
 //   w1 [21:16]  format (one of the VFMT_* codes)
 //   w2 [7:0]    stride, in dwords
 // NOTE(review): this header previously described canary's
 // VertexFetchInstruction (`ucode.h:690`) as src_reg at w0[10:5], dst_reg at
 // w0[17:11] and const_index at w0[21:17], which does not match the shifts
 // used here — confirm against the host-side packer which one is intended.
 //
 // Pragmatic subset: only format and stride are honoured. The offset,
 // exp_adjust, swizzle and sign/normalisation fields are not decoded; each
 // format uses one fixed interpretation (see the individual cases). A fetch
 // that would read outside `vertex_buffer` yields (0, 0, 0, 1). Unsupported
 // formats set `REJECT_VFETCH_FMT` and also yield (0, 0, 0, 1).
 fn interpret_vertex_fetch(t: u32) {
     let w0 = vs_instr_dword(t, 0u);
     let w1 = vs_instr_dword(t, 1u);
     let w2 = vs_instr_dword(t, 2u);
     let fetch_const = (w0 >> 5u) & 0x1Fu;
     let dst_reg = (w0 >> 10u) & 0x7Fu;
     let src_reg = (w0 >> 17u) & 0x7Fu;
     let format  = (w1 >> 16u) & 0x3Fu;
     let stride  = w2 & 0xFFu;
     // Vertex-fetch constant: dword 0 carries (type[1:0], address[31:2]);
     // dword 1 carries (endian[1:0], size[25:2]). Only dword 0 is used.
     let fc0 = xenos_consts.fetch[fetch_const * 2u + 0u];
     let base_dwords = (fc0 & 0xFFFFFFFCu) >> 2u;
     let vidx = u32(registers[src_reg & 0x7Fu].x);
     // Per-vertex step in dwords; a stride of 0 is treated as 4 dwords.
     let effective_stride = select(stride, 4u, stride == 0u);
     let addr = base_dwords + vidx * effective_stride;
     let n = arrayLength(&vertex_buffer);
     var result: vec4<f32> = vec4<f32>(0.0, 0.0, 0.0, 1.0);
     switch format {
         case VFMT_32_32_32_32_FLOAT: {
             if vb_span_in_bounds(addr, 4u, n) {
                 result = vec4<f32>(
                     bitcast<f32>(vertex_buffer[addr + 0u]),
                     bitcast<f32>(vertex_buffer[addr + 1u]),
                     bitcast<f32>(vertex_buffer[addr + 2u]),
                     bitcast<f32>(vertex_buffer[addr + 3u]),
                 );
             }
         }
         case VFMT_32_32_32_FLOAT: {
             if vb_span_in_bounds(addr, 3u, n) {
                 result = vec4<f32>(
                     bitcast<f32>(vertex_buffer[addr + 0u]),
                     bitcast<f32>(vertex_buffer[addr + 1u]),
                     bitcast<f32>(vertex_buffer[addr + 2u]),
                     1.0,
                 );
             }
         }
         case VFMT_32_32_FLOAT: {
             if vb_span_in_bounds(addr, 2u, n) {
                 result = vec4<f32>(
                     bitcast<f32>(vertex_buffer[addr + 0u]),
                     bitcast<f32>(vertex_buffer[addr + 1u]),
                     0.0,
                     1.0,
                 );
             }
         }
         case VFMT_32_FLOAT: {
             if vb_span_in_bounds(addr, 1u, n) {
                 result = vec4<f32>(bitcast<f32>(vertex_buffer[addr]), 0.0, 0.0, 1.0);
             }
         }
         case VFMT_8_8_8_8: {
             // Always decoded as unsigned normalised.
             if vb_span_in_bounds(addr, 1u, n) {
                 result = unpack4x8unorm(vertex_buffer[addr]);
             }
         }
         case VFMT_16_16_FLOAT: {
             if vb_span_in_bounds(addr, 1u, n) {
                 let h = unpack2x16float(vertex_buffer[addr]);
                 result = vec4<f32>(h.x, h.y, 0.0, 1.0);
             }
         }
         case VFMT_16_16_16_16_FLOAT: {
             if vb_span_in_bounds(addr, 2u, n) {
                 let h0 = unpack2x16float(vertex_buffer[addr]);
                 let h1 = unpack2x16float(vertex_buffer[addr + 1u]);
                 result = vec4<f32>(h0.x, h0.y, h1.x, h1.y);
             }
         }
         case VFMT_16_16: {
             if vb_span_in_bounds(addr, 1u, n) {
                 // Default to signed normalized; unsigned variants differ
                 // only for sign-extension and are less common on Xenos VBs.
                 let h = unpack2x16snorm(vertex_buffer[addr]);
                 result = vec4<f32>(h.x, h.y, 0.0, 1.0);
             }
         }
         case VFMT_16_16_16_16: {
             // Signed normalised, as for VFMT_16_16.
             if vb_span_in_bounds(addr, 2u, n) {
                 let h0 = unpack2x16snorm(vertex_buffer[addr]);
                 let h1 = unpack2x16snorm(vertex_buffer[addr + 1u]);
                 result = vec4<f32>(h0.x, h0.y, h1.x, h1.y);
             }
         }
         case VFMT_2_10_10_10: {
             // 10-bit R/G/B + 2-bit A, decoded as unsigned normalised. The
             // signed and unnormalised variants are not handled.
             if vb_span_in_bounds(addr, 1u, n) {
                 let packed = vertex_buffer[addr];
                 let r = f32(packed & 0x3FFu) / 1023.0;
                 let g = f32((packed >> 10u) & 0x3FFu) / 1023.0;
                 let b = f32((packed >> 20u) & 0x3FFu) / 1023.0;
                 let a = f32((packed >> 30u) & 0x3u) / 3.0;
                 result = vec4<f32>(r, g, b, a);
             }
         }
         default: {
             // Unsupported format: flag it and keep the (0, 0, 0, 1) default.
             // (The previous code multiplied the fetched dword by 0.0 here,
             // which always produced exactly this same value.)
             reject_mask |= REJECT_VFETCH_FMT;
         }
     }
     registers[dst_reg & 0x7Fu] = result;
 }
 // M6 — minimum-viable texture fetch for the triple at index `t` of the VS
 // (`is_vertex`) or PS program. The (u, v) coordinate comes from the source
 // register's .xy and the sample is written to the destination register.
 // Every fetch samples the single placeholder texture bound at group(1);
 // the real per-slot texture cache lands with P5. `textureSampleLevel` at an
 // explicit LOD of 0 needs no implicit derivatives, so the same code serves
 // both stages.
 fn interpret_texture_fetch(t: u32, is_vertex: bool) {
     var header: u32 = 0u;
     if is_vertex {
         header = vs_instr_dword(t, 0u);
     } else {
         header = ps_instr_dword(t, 0u);
     }
     // Register numbers use the same bit positions as the vertex fetch:
     // dst at bits 16:10, src at bits 23:17.
     let dst = (header >> 10u) & 0x7Fu;
     let src = (header >> 17u) & 0x7Fu;
     let coord = registers[src & 0x7Fu].xy;
     registers[dst & 0x7Fu] = textureSampleLevel(xenos_tex, xenos_samp, coord, 0.0);
 }
 // Walk the `count` instruction triples of a VS Exec clause, starting at
 // triple index `address`.
 //   sequence: 2-bit-per-triple bitmap. Bit 0 of a pair = serialize flag
 //             (ignored in the MVP); bit 1 = is-fetch.
 // A 32-bit `sequence` can describe at most 16 triples. Triples past that
 // are treated as ALU. WGSL reduces a shift amount modulo 32, so an
 // unguarded shift for i >= 16 would silently re-read the is-fetch bits of
 // the first triples.
 // Fetch triples are dispatched on the opcode in the low 5 bits of dword 0:
 // 0 = vertex fetch, 1 = texture fetch; other fetch opcodes are skipped.
 fn exec_vs(address: u32, count: u32, sequence: u32) {
     for (var i: u32 = 0u; i < count; i = i + 1u) {
         let t = address + i;
         let is_fetch = i < 16u && ((sequence >> (i * 2u + 1u)) & 1u) != 0u;
         if is_fetch {
             let opcode = vs_instr_dword(t, 0u) & 0x1Fu;
             // 0x00 = vertex fetch, 0x01 = texture fetch.
             if opcode == 0u {
                 interpret_vertex_fetch(t);
             } else if opcode == 1u {
                 interpret_texture_fetch(t, true);
             }
         } else {
             interpret_alu(t, true);
         }
     }
 }
 // Walk the `count` instruction triples of a PS Exec clause, starting at
 // triple index `address`. `sequence` is a 2-bit-per-triple bitmap whose
 // upper bit of each pair marks a fetch triple; the lower (serialize) bit is
 // ignored. Every fetch triple is handled as a texture fetch.
 // A 32-bit `sequence` can describe at most 16 triples. Triples past that
 // are treated as ALU. WGSL reduces a shift amount modulo 32, so an
 // unguarded shift for i >= 16 would silently re-read the is-fetch bits of
 // the first triples.
 fn exec_ps(address: u32, count: u32, sequence: u32) {
     for (var i: u32 = 0u; i < count; i = i + 1u) {
         let t = address + i;
         let is_fetch = i < 16u && ((sequence >> (i * 2u + 1u)) & 1u) != 0u;
         if is_fetch {
             interpret_texture_fetch(t, false);
         } else {
             interpret_alu(t, false);
         }
     }
 }
 // Put all per-invocation interpreter state back to its baseline: zeroed
 // register file, ps = 0, predicate / kill flag cleared, no active alloc,
 // empty loop stack and an empty reject mask.
 fn reset_state() {
     // Scalar state first.
     ps = 0.0;
     predicate = false;
     kill_flag = false;
     current_alloc = 0u;
     reject_mask = 0u;
     // Loop stack.
     loop_depth = 0u;
     loop_counters = array<u32, 4>(0u, 0u, 0u, 0u);
     loop_starts = array<u32, 4>(0u, 0u, 0u, 0u);
     // Register file.
     var r: u32 = 0u;
     loop {
         if r >= 128u { break; }
         registers[r] = vec4<f32>(0.0);
         r = r + 1u;
     }
 }
 // ── Stage entry points.
 // M7 register slots for exports. VS writes position at oPos (convention:
 // Xenos export 0 within an `Alloc(Position)` range lands in registers[32])
 // and a set of interpolators. We track both via `current_alloc`: writes
 // inside each alloc range are tagged and copied out at Exit.
 // NOTE(review): in the code visible here `current_alloc` is only assigned,
 // never read, and nothing redirects export writes to these slots; vs_main
 // simply outputs whatever registers[32] / registers[33] hold after the CF
 // walk. The tagging/copy-out described above appears to be future work —
 // confirm.
 const OPOS_REG:       u32 = 32u;  // synthetic slot used by the interpreter
 const OCOLOR_REG:     u32 = 33u;  // color0 scratch slot
@vertex
 fn vs_main(@builtin(vertex_index) vidx: u32) -> VsOut {
     // Vertex entry point: seed the register file, interpret the VS program,
     // then emit registers[OPOS_REG] / registers[OCOLOR_REG].
     reset_state();
     // r0.x carries the vertex index so simple shaders (or the procedural
     // fallback) can use it without a real vertex fetch.
     let index_f = f32(vidx);
     registers[0] = vec4<f32>(index_f, 0.0, 0.0, 1.0);
     // Procedural fallback position: vertex k of n sits on a circle of
     // radius 0.35 at angle 2*pi*k/n, so a shader that never writes oPos
     // still produces visible geometry instead of collapsing to (0,0).
     let total = max(draw_ctx.vertex_count, 1u);
     let t_param = index_f / f32(total);
     let angle = t_param * 6.2831853;
     let radius = 0.35;
     let px = cos(angle) * radius;
     let py = sin(angle) * radius;
     registers[OPOS_REG] = vec4<f32>(px, py, 0.0, 1.0);
     // Procedural fallback colour: varies with the draw index so successive
     // draws are distinguishable.
     let d = f32(draw_ctx.draw_index);
     let red   = 0.5 + 0.5 * sin(d * 0.37);
     let green = 0.5 + 0.5 * sin(d * 0.51 + 2.0);
     let blue  = 0.5 + 0.5 * sin(d * 0.73 + 4.0);
     registers[OCOLOR_REG] = vec4<f32>(red, green, blue, 1.0);
     // Dead-binding guard: touch vertex_buffer in the vertex stage. The
     // value is multiplied by 0.0 and only added to the colour below.
     let vb_live = f32(vertex_buffer[0]) * 0.0;
     // Interpret the VS control-flow table.
     walk_cf_vs();
     // Emit whatever the export slots hold now — the shader's values if it
     // wrote them, otherwise the procedural seeds from above.
     let pos = registers[OPOS_REG];
     let tint = registers[OCOLOR_REG];
     var out: VsOut;
     out.position = vec4<f32>(pos.xyz, pos.w);
     out.color = vec4<f32>(tint.rgb + vec3<f32>(vb_live), tint.a);
     return out;
 }
@fragment
 fn fs_main(in: VsOut) -> FsOut {
     // Fragment entry point: interpret the PS program for its side effects
     // (kill flag, reject mask), then pass the interpolated VS colour through.
     reset_state();
     walk_cf_ps();
     // Pixel-shader `kill*` / `kills_*` opcodes only latch `kill_flag`; the
     // actual `discard` is issued here at the entry level, after the
     // interpreter has finished.
     if kill_flag {
         discard;
     }
     return FsOut(in.color);
 }
 // ── CF walker per stage. Shared logic: LOOP_START/LOOP_END iterate up to 16×
 // from `xenos_consts.loop_consts[loop_id]`, COND_JMP honours the instruction's
 // predicate flags, COND_CALL / RETURN set the reject mask (needs a call stack
 // we don't have). A hard iteration cap keeps the GPU from hanging on
 // malformed or extreme shaders.
 // The cap counts CF clauses visited (including revisits caused by loops
 // and jumps), not individual ALU/fetch instructions.
 const CF_WALKER_MAX_ITER: u32 = 4096u;
 // Return the CF index just past the LOOP_END that closes the LOOP_START at
 // index `start` of the VS program, honouring nested loops. Returns `cf_n`
 // (which ends the walk) when no matching LOOP_END is found. The scan
 // distance is capped like the walker itself so a corrupt clause count
 // cannot hang the GPU.
 fn vs_loop_exit_index(start: u32, cf_n: u32) -> u32 {
     var nesting: u32 = 1u;
     var steps: u32 = 0u;
     for (var j: u32 = start + 1u; j < cf_n && steps < CF_WALKER_MAX_ITER; j = j + 1u) {
         steps = steps + 1u;
         let k = vs_cf_kind(j);
         if k == CF_KIND_LOOP_START {
             nesting = nesting + 1u;
         } else if k == CF_KIND_LOOP_END {
             nesting = nesting - 1u;
             if nesting == 0u {
                 return j + 1u;
             }
         }
     }
     return cf_n;
 }
 // Interpret the vertex shader's control-flow table from clause 0 until an
 // EXEC_END / EXIT clause, the end of the table, or CF_WALKER_MAX_ITER
 // visited clauses.
 //   EXEC / EXEC_END — run the clause's triples via exec_vs; aux packs the
 //                     triple count (bits 7:0) and the sequence bitmap above.
 //   ALLOC           — remember the alloc kind in `current_alloc`.
 //   LOOP_START/END  — counted loops driven by `loop_consts`, trip count
 //                     clamped to 16, at most 4 levels deep.
 //   COND_JMP        — jump to `primary`, optionally gated on `predicate`.
 //   COND_CALL / RETURN and unknown kinds only set reject-mask bits.
 fn walk_cf_vs() {
     let cf_n = vs_cf_count();
     var cf_i: u32 = 0u;
     var iter: u32 = 0u;
     loop {
         if cf_i >= cf_n || iter >= CF_WALKER_MAX_ITER { break; }
         iter = iter + 1u;
         let kind = vs_cf_kind(cf_i);
         let primary = vs_cf_primary(cf_i);
         let aux = vs_cf_aux(cf_i);
         // `advance` is cleared by clauses that set cf_i themselves.
         var advance: bool = true;
         var stop: bool = false;
         switch kind {
             case CF_KIND_EXEC, CF_KIND_EXEC_END: {
                 let count = aux & 0xFFu;
                 let sequence = aux >> 8u;
                 exec_vs(primary, count, sequence);
                 if kind == CF_KIND_EXEC_END { stop = true; }
             }
             case CF_KIND_ALLOC: { current_alloc = primary; }
             case CF_KIND_EXIT:  { stop = true; }
             case CF_KIND_LOOP_START: {
                 let loop_id = aux & 0x1Fu;
                 var loop_count = xenos_consts.loop_consts[loop_id] & 0xFFu;
                 if loop_count > 16u {
                     loop_count = 16u;
                     reject_mask |= REJECT_LOOP_OVERFLOW;
                 }
                 if loop_count > 0u && loop_depth < 4u {
                     loop_starts[loop_depth] = cf_i;
                     loop_counters[loop_depth] = loop_count;
                     loop_depth = loop_depth + 1u;
                 } else {
                     // No frame is pushed for a zero-trip loop or for nesting
                     // deeper than the 4-entry loop stack. Skip the whole
                     // body including its LOOP_END; otherwise that LOOP_END
                     // would pop or decrement the enclosing loop's frame.
                     if loop_count > 0u {
                         reject_mask |= REJECT_LOOP_OVERFLOW;
                     }
                     cf_i = vs_loop_exit_index(cf_i, cf_n);
                     advance = false;
                 }
             }
             case CF_KIND_LOOP_END: {
                 if loop_depth > 0u {
                     let d = loop_depth - 1u;
                     if loop_counters[d] > 1u {
                         // More iterations left: jump back to the clause
                         // right after the matching LOOP_START.
                         loop_counters[d] = loop_counters[d] - 1u;
                         cf_i = loop_starts[d] + 1u;
                         advance = false;
                     } else {
                         // Last iteration done: pop the frame, fall through.
                         loop_counters[d] = 0u;
                         loop_depth = d;
                     }
                 }
             }
             case CF_KIND_COND_JMP: {
                 // aux bit 0: jump is predicated; bit 1: required predicate.
                 let pred_bits = aux;
                 let is_pred  = (pred_bits & 1u) != 0u;
                 let pred_cnd = (pred_bits & 2u) != 0u;
                 if !is_pred || predicate == pred_cnd {
                     cf_i = primary;
                     advance = false;
                 }
             }
             case CF_KIND_COND_CALL, CF_KIND_RETURN: {
                 // No call stack — mark and continue.
                 reject_mask |= REJECT_CF_CALL;
             }
             default: { reject_mask |= REJECT_CF_JUMP; }
         }
         if stop { break; }
         if advance { cf_i = cf_i + 1u; }
     }
 }
 // Return the CF index just past the LOOP_END that closes the LOOP_START at
 // index `start` of the PS program, honouring nested loops. Returns `cf_n`
 // (which ends the walk) when no matching LOOP_END is found. The scan
 // distance is capped like the walker itself so a corrupt clause count
 // cannot hang the GPU.
 fn ps_loop_exit_index(start: u32, cf_n: u32) -> u32 {
     var nesting: u32 = 1u;
     var steps: u32 = 0u;
     for (var j: u32 = start + 1u; j < cf_n && steps < CF_WALKER_MAX_ITER; j = j + 1u) {
         steps = steps + 1u;
         let k = ps_cf_kind(j);
         if k == CF_KIND_LOOP_START {
             nesting = nesting + 1u;
         } else if k == CF_KIND_LOOP_END {
             nesting = nesting - 1u;
             if nesting == 0u {
                 return j + 1u;
             }
         }
     }
     return cf_n;
 }
 // Interpret the pixel shader's control-flow table from clause 0 until an
 // EXEC_END / EXIT clause, the end of the table, or CF_WALKER_MAX_ITER
 // visited clauses.
 //   EXEC / EXEC_END — run the clause's triples via exec_ps; aux packs the
 //                     triple count (bits 7:0) and the sequence bitmap above.
 //   ALLOC           — remember the alloc kind in `current_alloc`.
 //   LOOP_START/END  — counted loops driven by `loop_consts`, trip count
 //                     clamped to 16, at most 4 levels deep.
 //   COND_JMP        — jump to `primary`, optionally gated on `predicate`.
 //   COND_CALL / RETURN and unknown kinds only set reject-mask bits.
 fn walk_cf_ps() {
     let cf_n = ps_cf_count();
     var cf_i: u32 = 0u;
     var iter: u32 = 0u;
     loop {
         if cf_i >= cf_n || iter >= CF_WALKER_MAX_ITER { break; }
         iter = iter + 1u;
         let kind = ps_cf_kind(cf_i);
         let primary = ps_cf_primary(cf_i);
         let aux = ps_cf_aux(cf_i);
         // `advance` is cleared by clauses that set cf_i themselves.
         var advance: bool = true;
         var stop: bool = false;
         switch kind {
             case CF_KIND_EXEC, CF_KIND_EXEC_END: {
                 let count = aux & 0xFFu;
                 let sequence = aux >> 8u;
                 exec_ps(primary, count, sequence);
                 if kind == CF_KIND_EXEC_END { stop = true; }
             }
             case CF_KIND_ALLOC: { current_alloc = primary; }
             case CF_KIND_EXIT:  { stop = true; }
             case CF_KIND_LOOP_START: {
                 let loop_id = aux & 0x1Fu;
                 var loop_count = xenos_consts.loop_consts[loop_id] & 0xFFu;
                 if loop_count > 16u {
                     loop_count = 16u;
                     reject_mask |= REJECT_LOOP_OVERFLOW;
                 }
                 if loop_count > 0u && loop_depth < 4u {
                     loop_starts[loop_depth] = cf_i;
                     loop_counters[loop_depth] = loop_count;
                     loop_depth = loop_depth + 1u;
                 } else {
                     // No frame is pushed for a zero-trip loop or for nesting
                     // deeper than the 4-entry loop stack. Skip the whole
                     // body including its LOOP_END; otherwise that LOOP_END
                     // would pop or decrement the enclosing loop's frame.
                     if loop_count > 0u {
                         reject_mask |= REJECT_LOOP_OVERFLOW;
                     }
                     cf_i = ps_loop_exit_index(cf_i, cf_n);
                     advance = false;
                 }
             }
             case CF_KIND_LOOP_END: {
                 if loop_depth > 0u {
                     let d = loop_depth - 1u;
                     if loop_counters[d] > 1u {
                         // More iterations left: jump back to the clause
                         // right after the matching LOOP_START.
                         loop_counters[d] = loop_counters[d] - 1u;
                         cf_i = loop_starts[d] + 1u;
                         advance = false;
                     } else {
                         // Last iteration done: pop the frame, fall through.
                         loop_counters[d] = 0u;
                         loop_depth = d;
                     }
                 }
             }
             case CF_KIND_COND_JMP: {
                 // aux bit 0: jump is predicated; bit 1: required predicate.
                 let pred_bits = aux;
                 let is_pred  = (pred_bits & 1u) != 0u;
                 let pred_cnd = (pred_bits & 2u) != 0u;
                 if !is_pred || predicate == pred_cnd {
                     cf_i = primary;
                     advance = false;
                 }
             }
             case CF_KIND_COND_CALL, CF_KIND_RETURN: {
                 // No call stack — mark and continue.
                 reject_mask |= REJECT_CF_CALL;
             }
             default: { reject_mask |= REJECT_CF_JUMP; }
         }
         if stop { break; }
         if advance { cf_i = cf_i + 1u; }
     }
 }
--- a/crates/xenia-gpu/src/texture_cache.rs
+++ b/crates/xenia-gpu/src/texture_cache.rs
@@ -0,0 +1,970 @@
 //! Texture cache — P5.
 //!
 //! Two-layer design mirroring canary's `TextureCache`:
 //!
 //!  * **CPU layer** (this module): owns decoded, linear, host-endian texel
 //!    byte buffers keyed by [`TextureKey`]. `ensure_cached` consults the
 //!    guest memory's page-version counter to decide whether the cached
 //!    bytes are still fresh and re-decodes on miss or staleness.
 //!  * **GPU layer** (xenia-ui `texture_cache_host`): owns the
 //!    `wgpu::Texture` + `TextureView` for each cached key; pulls decoded
 //!    bytes from this CPU layer on upload.
 //!
 //! Canary references: `texture_cache.h/.cc`, `texture_info.cc`, and
 //! `texture_info_formats.inl` for the format table.
 use std::collections::HashMap;
 use crate::tiled_address;
 /// Texture formats of the Xenos GPU — `xenos::TextureFormat` at
 /// `xenos.h:489-579`. Each named variant's discriminant is the raw number
 /// the guest writes into `xe_gpu_texture_fetch_t.format`.
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 #[repr(u8)]
 pub enum TextureFormat {
    K1Reverse = 0,
    K1 = 1,
    K8 = 2,
    K1555 = 3,
    K565 = 4,
    K6_5_5 = 5,
    K8888 = 6,
    K1010102 = 7,
    K8_8 = 10,
    K4_4_4_4 = 15,
    K10_11_11 = 16,
    K11_11_10 = 17,
    Dxt1 = 18,
    Dxt2_3 = 19,
    Dxt4_5 = 20,
    K24_8 = 22,
    K24_8Float = 23,
    K16 = 24,
    K16_16 = 25,
    K16_16_16_16 = 26,
    K16Float = 30,
    K16_16Float = 31,
    K16_16_16_16Float = 32,
    K32 = 33,
    K32_32 = 34,
    K32_32_32_32 = 35,
    K32Float = 36,
    K32_32Float = 37,
    K32_32_32_32Float = 38,
    /// A format value without a named variant; `from_raw` stores the
    /// masked 6-bit value here.
    Unknown(u8),
 }
 impl TextureFormat {
    /// Map a raw format field to its variant. Only the low 6 bits of `v`
    /// are considered; values without a named variant come back as
    /// [`TextureFormat::Unknown`] carrying the masked value.
    pub fn from_raw(v: u8) -> Self {
        let raw = v & 0x3F;
        match raw {
            0 => Self::K1Reverse,
            1 => Self::K1,
            2 => Self::K8,
            3 => Self::K1555,
            4 => Self::K565,
            5 => Self::K6_5_5,
            6 => Self::K8888,
            7 => Self::K1010102,
            10 => Self::K8_8,
            15 => Self::K4_4_4_4,
            16 => Self::K10_11_11,
            17 => Self::K11_11_10,
            18 => Self::Dxt1,
            19 => Self::Dxt2_3,
            20 => Self::Dxt4_5,
            22 => Self::K24_8,
            23 => Self::K24_8Float,
            24 => Self::K16,
            25 => Self::K16_16,
            26 => Self::K16_16_16_16,
            30 => Self::K16Float,
            31 => Self::K16_16Float,
            32 => Self::K16_16_16_16Float,
            33 => Self::K32,
            34 => Self::K32_32,
            35 => Self::K32_32_32_32,
            36 => Self::K32Float,
            37 => Self::K32_32Float,
            38 => Self::K32_32_32_32Float,
            _ => Self::Unknown(raw),
        }
    }
    /// Block width/height in texels plus bytes per block. Uncompressed
    /// formats use a 1×1 block; DXT formats use one 4×4 compressed block.
    pub fn block_info(self) -> BlockInfo {
        // Only the compressed formats have a multi-texel block, so handle
        // them up front and treat everything else as "bytes per texel".
        let bytes_per_texel = match self {
            Self::Dxt1 => return BlockInfo::new(4, 4, 8),
            Self::Dxt2_3 | Self::Dxt4_5 => return BlockInfo::new(4, 4, 16),
            // The 1-bit formats are rounded up to one byte.
            Self::K1Reverse | Self::K1 | Self::K8 => 1,
            Self::K1555
            | Self::K565
            | Self::K6_5_5
            | Self::K4_4_4_4
            | Self::K8_8
            | Self::K16
            | Self::K16Float => 2,
            Self::K8888
            | Self::K1010102
            | Self::K10_11_11
            | Self::K11_11_10
            | Self::K24_8
            | Self::K24_8Float
            | Self::K16_16
            | Self::K16_16Float
            | Self::K32
            | Self::K32Float => 4,
            Self::K16_16_16_16
            | Self::K16_16_16_16Float
            | Self::K32_32
            | Self::K32_32Float => 8,
            Self::K32_32_32_32 | Self::K32_32_32_32Float => 16,
            // Safe-ish fallback for formats we cannot name.
            Self::Unknown(_) => 4,
        };
        BlockInfo::new(1, 1, bytes_per_texel)
    }
    /// True iff the host-cache wiring accepts this format. `K565` is
    /// expanded to `Rgba8Unorm` on the CPU during decode but still counts
    /// as supported; `Dxt2_3` and `Dxt4_5` correspond to BC2 and BC3.
    pub fn is_host_supported(self) -> bool {
        const HOST_FORMATS: [TextureFormat; 5] = [
            TextureFormat::K8888,
            TextureFormat::K565,
            TextureFormat::Dxt1,
            TextureFormat::Dxt2_3,
            TextureFormat::Dxt4_5,
        ];
        HOST_FORMATS.contains(&self)
    }
 }
 /// Footprint of one block of a texture format: its width and height in
 /// texels and the number of bytes it occupies.
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 pub struct BlockInfo {
    /// Block width in texels.
    pub block_w: u8,
    /// Block height in texels.
    pub block_h: u8,
    /// Size of one block in bytes.
    pub bytes_per_block: u8,
 }
 impl BlockInfo {
    /// Bundle the three block parameters into a `BlockInfo`.
    pub const fn new(block_w: u8, block_h: u8, bytes_per_block: u8) -> Self {
        Self { block_w, block_h, bytes_per_block }
    }
    /// Base-2 logarithm of `bytes_per_block` for the sizes 1, 2, 4, 8 and
    /// 16; every other size yields 0.
    pub fn log2_bpb(self) -> u32 {
        let bpb = self.bytes_per_block;
        if bpb.is_power_of_two() && bpb <= 16 {
            bpb.trailing_zeros()
        } else {
            0
        }
    }
 }
 /// Xenos `Endian` enum from `xenos.h:198-204` — the 2-bit byte-swap mode
 /// carried in fetch dword 1.
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 pub enum Endian {
    /// Leave the dword untouched.
    None = 0,
    /// Swap the two bytes inside each 16-bit half.
    Swap8In16 = 1,
    /// Reverse all four bytes of the dword.
    Swap8In32 = 2,
    /// Exchange the two 16-bit halves.
    Swap16In32 = 3,
 }
 impl Endian {
    /// Decode the swap mode from the low two bits of `v`.
    pub fn from_raw(v: u8) -> Self {
        const BY_RAW: [Endian; 4] = [
            Endian::None,
            Endian::Swap8In16,
            Endian::Swap8In32,
            Endian::Swap16In32,
        ];
        BY_RAW[usize::from(v & 0x3)]
    }
    /// Apply this mode's byte swap to one 32-bit unit. Matches canary's
    /// `shaders/endian.xesli:25-55` semantics.
    pub fn swap32(self, v: u32) -> u32 {
        match self {
            Self::None => v,
            Self::Swap8In16 => {
                let upper = ((v >> 16) as u16).swap_bytes();
                let lower = (v as u16).swap_bytes();
                (u32::from(upper) << 16) | u32::from(lower)
            }
            Self::Swap8In32 => u32::from_be_bytes(v.to_le_bytes()),
            Self::Swap16In32 => (v << 16) | (v >> 16),
        }
    }
 }
 /// Texture dimensionality (`xenos::DataDimension`), a 2-bit field.
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 pub enum Dimension {
    D1 = 0,
    D2 = 1,
    D3Stacked = 2,
    Cube = 3,
 }
 impl Dimension {
    /// Decode the dimensionality from the low two bits of `v`.
    pub fn from_raw(v: u8) -> Self {
        const BY_RAW: [Dimension; 4] = [
            Dimension::D1,
            Dimension::D2,
            Dimension::D3Stacked,
            Dimension::Cube,
        ];
        BY_RAW[usize::from(v & 0x3)]
    }
 }
 /// Identity of a cached texture. Matches canary's `TextureCache::TextureKey`
 /// at the semantic level — we exclude mip/border state for P5 since neither
 /// is populated yet.
 ///
 /// The key is `Copy + Eq + Hash`, so it is used directly as the hash-map
 /// key of the CPU-side cache.
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 pub struct TextureKey {
    /// Guest physical base (byte address — already shifted left by 12 from
    /// the fetch-constant `base_address` field).
    pub base_address: u32,
    /// Width in texels (the fetch constant stores width − 1).
    pub width: u16,
    /// Height in texels (the fetch constant stores height − 1); 1 for 1D.
    pub height: u16,
    /// Third size component of the fetch constant, plus one; 1 for 1D.
    pub depth_or_slices: u16,
    /// Texel format as decoded from fetch dword 1.
    pub format: TextureFormat,
    /// Byte-swap mode declared by the fetch constant.
    pub endian: Endian,
    /// Dimensionality declared by the fetch constant.
    pub dimension: Dimension,
    /// Whether the guest memory uses the tiled layout (bit 31 of dword 0).
    pub tiled: bool,
    /// Row pitch in texels. Canary stores pitch/32 in the fetch constant;
    /// we keep the raw texel count to avoid callers remembering to shift.
    /// `decode_fetch_constant` raises the value to at least `width`, so it
    /// is a multiple of 32 only when the stored pitch covers the width.
    pub pitch_texels: u16,
 }
 /// Decode a 6-dword texture fetch constant (layout at `xenos.h:1229-1329`).
 ///
 /// Returns `None` unless the low two bits of dword 0 carry the texture
 /// marker (2). An unset, all-zero constant reads as type 0 and is rejected
 /// by that same check.
 ///
 /// Every size field stores `dimension - 1`. The 1D width field is 24 bits
 /// wide while [`TextureKey`] keeps `u16` dimensions, so a decoded size that
 /// does not fit saturates at `u16::MAX` instead of overflowing.
 pub fn decode_fetch_constant(dwords: [u32; 6]) -> Option<TextureKey> {
    let [d0, d1, d2, _, _, d5] = dwords;
    // type: 0 = vertex, 2 = texture. Anything but 2 is not ours to decode.
    if d0 & 0x3 != 2 {
        return None;
    }
    let pitch_5 = (d0 >> 22) & 0x1FF; // pitch/32 in texels
    let tiled = (d0 >> 31) != 0;
    let format = TextureFormat::from_raw((d1 & 0x3F) as u8);
    let endian = Endian::from_raw(((d1 >> 6) & 0x3) as u8);
    // The field holds base >> 12; masking the low 12 bits yields the byte
    // address directly.
    let base_address = d1 & !0xFFF;
    let dimension = Dimension::from_raw(((d5 >> 9) & 0x3) as u8);
    // Turn a "size minus one" field into a size. The fields are at most
    // 24 bits wide, so `field + 1` cannot overflow `u32`; the clamp keeps
    // the narrowing cast lossless.
    let size = |field: u32| (field + 1).min(u32::from(u16::MAX)) as u16;
    // Size decode depends on dimension.
    // NOTE(review): cube maps share the 3D field layout here — confirm
    // against the reference implementation that this is the intended
    // layout for cubes.
    let (width, height, depth_or_slices) = match dimension {
        Dimension::D1 => (size(d2 & 0x00FF_FFFF), 1, 1),
        Dimension::D2 => (
            size(d2 & 0x1FFF),
            size((d2 >> 13) & 0x1FFF),
            size((d2 >> 26) & 0x3F),
        ),
        Dimension::D3Stacked | Dimension::Cube => (
            size(d2 & 0x7FF),
            size((d2 >> 11) & 0x7FF),
            size((d2 >> 22) & 0x3FF),
        ),
    };
    // pitch_5 <= 0x1FF, so pitch_5 * 32 <= 16352 always fits in u16.
    let pitch_texels = ((pitch_5 * 32) as u16).max(width);
    Some(TextureKey {
        base_address,
        width,
        height,
        depth_or_slices,
        format,
        endian,
        dimension,
        tiled,
        pitch_texels,
    })
 }
 /// Decoded, linear, host-endian texture bytes ready for wgpu upload.
 #[derive(Debug, Clone)]
 pub struct CachedTexture {
    /// Key this entry was decoded for.
    pub key: TextureKey,
    /// Version value the caller passed to `ensure_cached` when these bytes
    /// were decoded; compared against later versions to detect staleness.
    pub version_when_uploaded: u64,
    /// Tightly packed. Layout depends on `key.format`:
    /// - `K8888` → `width*height*4` bytes in Rgba8Unorm order.
    /// - `Dxt1`  → `ceil(w/4)*ceil(h/4)*8` bytes of raw BC1 blocks, after
    ///   block-level detile + dword-endian swap.
    /// - `K565` → `width*height*4` bytes, each texel expanded to
    ///   Rgba8Unorm with alpha 0xFF.
    /// - `Dxt2_3` / `Dxt4_5` → `ceil(w/4)*ceil(h/4)*16` bytes of raw
    ///   16-byte blocks.
    pub bytes: Vec<u8>,
 }
 impl CachedTexture {
    /// Number of decoded bytes held by this entry.
    pub fn byte_size(&self) -> usize {
        self.bytes.len()
    }
 }
 /// Errors that can happen during decode. The `ensure_cached` caller maps
 /// these to `gpu.texture.reject{reason}` metrics so the HUD surfaces when
 /// a texture fell back.
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub enum DecodeError {
    /// The key's format has no decoder in this module.
    UnsupportedFormat,
    /// The guest read came back short or the detile step reported an error.
    OutOfBounds,
    /// The key's width or height is zero.
    ZeroSize,
 }
 /// Read up to `len` bytes from guest memory starting at `addr` into a
 /// freshly-allocated buffer.
 ///
 /// The read stops at the top of the 32-bit guest address space: bytes
 /// whose address would wrap past `0xFFFF_FFFF` are never read, so the
 /// returned buffer is shorter than `len` in that case. Callers detect a
 /// truncated read by comparing the returned length against `len`.
 ///
 /// Bytes are fetched one at a time through `MemoryAccess::read_u8`.
 pub fn read_guest_bytes(
    mem: &dyn xenia_memory::MemoryAccess,
    addr: u32,
    len: usize,
 ) -> Vec<u8> {
    // Largest offset from `addr` that still names a valid 32-bit address.
    let max_offset = (u32::MAX - addr) as usize;
    let count = match len.checked_sub(1) {
        Some(last_offset) => last_offset.min(max_offset) + 1,
        None => 0,
    };
    let mut out = Vec::with_capacity(count);
    // `offset <= max_offset`, so neither the cast nor the addition can
    // overflow.
    out.extend((0..count).map(|offset| mem.read_u8(addr + offset as u32)));
    out
 }
 /// Byte-swap every complete 32-bit dword of `buf` in place according to
 /// `endian`. A trailing remainder of fewer than four bytes is left
 /// untouched, and `Endian::None` leaves the whole buffer as it is.
 pub fn apply_endian_32(buf: &mut [u8], endian: Endian) {
    if endian == Endian::None {
        return;
    }
    for dword in buf.chunks_exact_mut(4) {
        let value = u32::from_le_bytes([dword[0], dword[1], dword[2], dword[3]]);
        dword.copy_from_slice(&endian.swap32(value).to_le_bytes());
    }
 }
 /// Decode a k_8_8_8_8 texture out of guest memory into `Rgba8Unorm` bytes.
 ///
 /// Steps, in order: read `aligned_pitch * height * 4` bytes from
 /// `key.base_address`, apply the declared endian swap per dword, detile
 /// via the Xenos Tiled2D formula (or copy row by row honoring the pitch
 /// when the texture is linear), then swap the B and R channels so the
 /// result is RGBA for wgpu.
 ///
 /// # Errors
 /// * [`DecodeError::ZeroSize`] — width or height is zero.
 /// * [`DecodeError::OutOfBounds`] — the guest read came back short, the
 ///   detile step failed, a size computation overflowed, or a linear
 ///   texture's aligned pitch is narrower than its width.
 pub fn decode_k8888_tiled(
    key: &TextureKey,
    mem: &dyn xenia_memory::MemoryAccess,
 ) -> Result<Vec<u8>, DecodeError> {
    const BYTES_PER_TEXEL: usize = 4;
    if key.width == 0 || key.height == 0 {
        return Err(DecodeError::ZeroSize);
    }
    let w = u32::from(key.width);
    let h = u32::from(key.height);
    let rows = h as usize;
    let pitch_aligned = tiled_address::align_pitch_to_macro_tile(u32::from(key.pitch_texels));
    // Size math is done in `usize` with overflow checks: a hand-built key
    // can describe more bytes than a `u32` product can hold.
    let oob = DecodeError::OutOfBounds;
    let src_row_bytes = (pitch_aligned as usize)
        .checked_mul(BYTES_PER_TEXEL)
        .ok_or(oob)?;
    let dst_row_bytes = (w as usize) * BYTES_PER_TEXEL;
    let total_bytes = src_row_bytes.checked_mul(rows).ok_or(oob)?;
    let linear_bytes = dst_row_bytes.checked_mul(rows).ok_or(oob)?;
    // A linear row copy needs every source row to hold a full output row;
    // otherwise the last row would index past the end of the source.
    if !key.tiled && dst_row_bytes > src_row_bytes {
        return Err(DecodeError::OutOfBounds);
    }
    let mut raw = read_guest_bytes(mem, key.base_address, total_bytes);
    if raw.len() < total_bytes {
        return Err(DecodeError::OutOfBounds);
    }
    apply_endian_32(&mut raw, key.endian);
    let mut linear = vec![0u8; linear_bytes];
    if key.tiled {
        if tiled_address::detile_2d(&raw, &mut linear, w, h, pitch_aligned, 4).is_err() {
            return Err(DecodeError::OutOfBounds);
        }
    } else {
        // Non-tiled copy row-by-row honoring pitch.
        for (dst_row, src_row) in linear
            .chunks_exact_mut(dst_row_bytes)
            .zip(raw.chunks_exact(src_row_bytes))
        {
            dst_row.copy_from_slice(&src_row[..dst_row_bytes]);
        }
    }
    // Xenos stores `k_8_8_8_8` in ARGB byte order (high byte = A). After
    // endian.Swap8In32 guests' typical per-dword byte order becomes BGRA
    // in little-endian host bytes. Swap B↔R so we hand Rgba8Unorm to wgpu.
    for px in linear.chunks_exact_mut(BYTES_PER_TEXEL) {
        px.swap(0, 2);
    }
    Ok(linear)
 }
 /// Decode a DXT-compressed texture to raw block bytes (no format
 /// conversion — wgpu understands `Bc{1,2,3}RgbaUnorm` natively so the
 /// GPU does the actual decompression on upload).
 ///
 /// Xenos stores DXT blocks in 4×4 block-tiled order using the Tiled2D
 /// formula, with stride counted in blocks. `bytes_per_block` is 8 for
 /// BC1 (DXT1), 16 for BC2 (DXT2_3) and BC3 (DXT4_5).
 ///
 /// The output is `ceil(w/4) * ceil(h/4)` blocks, tightly packed in
 /// row-major block order.
 ///
 /// # Errors
 /// * [`DecodeError::ZeroSize`] — width or height is zero.
 /// * [`DecodeError::OutOfBounds`] — the guest read came back short, the
 ///   detile step failed, a size computation overflowed, or a linear
 ///   texture's aligned block pitch is narrower than its block width.
 pub fn decode_dxt_tiled(
    key: &TextureKey,
    mem: &dyn xenia_memory::MemoryAccess,
    bytes_per_block: u32,
 ) -> Result<Vec<u8>, DecodeError> {
    // DXT blocks cover 4×4 texels.
    const BLOCK_DIM: u32 = 4;
    if key.width == 0 || key.height == 0 {
        return Err(DecodeError::ZeroSize);
    }
    let w_blocks = u32::from(key.width).div_ceil(BLOCK_DIM);
    let h_blocks = u32::from(key.height).div_ceil(BLOCK_DIM);
    let pitch_blocks = tiled_address::align_pitch_to_macro_tile(
        u32::from(key.pitch_texels).div_ceil(BLOCK_DIM),
    );
    // Size math is done in `usize` with overflow checks so oversized keys
    // are rejected instead of overflowing a `u32` product.
    let oob = DecodeError::OutOfBounds;
    let block_bytes = bytes_per_block as usize;
    let block_rows = h_blocks as usize;
    let src_row_bytes = (pitch_blocks as usize).checked_mul(block_bytes).ok_or(oob)?;
    let dst_row_bytes = (w_blocks as usize).checked_mul(block_bytes).ok_or(oob)?;
    let total_bytes = src_row_bytes.checked_mul(block_rows).ok_or(oob)?;
    let out_bytes = dst_row_bytes.checked_mul(block_rows).ok_or(oob)?;
    // A linear row copy needs every source row to hold a full output row;
    // otherwise the last row would index past the end of the source.
    if !key.tiled && dst_row_bytes > src_row_bytes {
        return Err(DecodeError::OutOfBounds);
    }
    let mut raw = read_guest_bytes(mem, key.base_address, total_bytes);
    if raw.len() < total_bytes {
        return Err(DecodeError::OutOfBounds);
    }
    // DXT blocks are stored as 4×u16 + 4×u8-indices (BC1) or similar
    // u16/u32-width fields for BC2/BC3; the Xbox 360's big-endian word
    // order requires an endian swap at the u16/u32 level regardless of
    // which BC-family format.
    apply_endian_32(&mut raw, key.endian);
    let mut out = vec![0u8; out_bytes];
    if key.tiled {
        if tiled_address::detile_2d(
            &raw,
            &mut out,
            w_blocks,
            h_blocks,
            pitch_blocks,
            bytes_per_block,
        )
        .is_err()
        {
            return Err(DecodeError::OutOfBounds);
        }
    } else {
        // Linear source: copy the used part of each block row, skipping
        // the pitch padding.
        for row in 0..block_rows {
            let src = row * src_row_bytes;
            let dst = row * dst_row_bytes;
            out[dst..dst + dst_row_bytes].copy_from_slice(&raw[src..src + dst_row_bytes]);
        }
    }
    Ok(out)
 }
 /// BC1 / DXT1 — 8-byte blocks.
 ///
 /// Forwards to [`decode_dxt_tiled`] with `bytes_per_block = 8` and
 /// returns its result unchanged.
 pub fn decode_dxt1_tiled(
    key: &TextureKey,
    mem: &dyn xenia_memory::MemoryAccess,
 ) -> Result<Vec<u8>, DecodeError> {
    decode_dxt_tiled(key, mem, 8)
 }
 /// BC2 / DXT2_3 — 16-byte blocks.
 ///
 /// Forwards to [`decode_dxt_tiled`] with `bytes_per_block = 16` and
 /// returns its result unchanged.
 pub fn decode_dxt23_tiled(
    key: &TextureKey,
    mem: &dyn xenia_memory::MemoryAccess,
 ) -> Result<Vec<u8>, DecodeError> {
    decode_dxt_tiled(key, mem, 16)
 }
 /// BC3 / DXT4_5 — 16-byte blocks.
 ///
 /// Forwards to [`decode_dxt_tiled`] with `bytes_per_block = 16` and
 /// returns its result unchanged.
 pub fn decode_dxt45_tiled(
    key: &TextureKey,
    mem: &dyn xenia_memory::MemoryAccess,
 ) -> Result<Vec<u8>, DecodeError> {
    decode_dxt_tiled(key, mem, 16)
 }
 /// **k_5_6_5** — 16-bit R:5 G:6 B:5 per texel (Xbox stores R in the high
 /// 5 bits of the 16-bit word). We unpack each texel into 4 bytes of
 /// `Rgba8Unorm` (A = 0xFF). wgpu doesn't ship `R5G6B5Unorm` as a
 /// sampled texture format on every backend, so CPU-side conversion is
 /// the safe path even if it's 2× the texture memory.
 ///
 /// Tiling: Tiled2D at the **texel** level (block = 1 texel = 2 bytes),
 /// then we expand each 2-byte u16 into the 4-byte Rgba8 in the linear
 /// output buffer.
 ///
 /// # Errors
 /// * [`DecodeError::ZeroSize`] — width or height is zero.
 /// * [`DecodeError::OutOfBounds`] — the guest read came back short, the
 ///   detile step failed, a size computation overflowed, or a linear
 ///   texture's aligned pitch is narrower than its width.
 pub fn decode_k565_tiled(
    key: &TextureKey,
    mem: &dyn xenia_memory::MemoryAccess,
 ) -> Result<Vec<u8>, DecodeError> {
    const PACKED_BYTES_PER_TEXEL: usize = 2;
    const RGBA_BYTES_PER_TEXEL: usize = 4;
    if key.width == 0 || key.height == 0 {
        return Err(DecodeError::ZeroSize);
    }
    let w = u32::from(key.width);
    let h = u32::from(key.height);
    let rows = h as usize;
    // Pitch/block counts — block = 1 texel here, 2 bytes.
    let pitch_aligned = tiled_address::align_pitch_to_macro_tile(u32::from(key.pitch_texels));
    // Size math is done in `usize` with overflow checks so oversized keys
    // are rejected instead of overflowing a `u32` product.
    let oob = DecodeError::OutOfBounds;
    let src_row_bytes = (pitch_aligned as usize)
        .checked_mul(PACKED_BYTES_PER_TEXEL)
        .ok_or(oob)?;
    let packed_row_bytes = (w as usize) * PACKED_BYTES_PER_TEXEL;
    let total_bytes = src_row_bytes.checked_mul(rows).ok_or(oob)?;
    let packed_bytes = packed_row_bytes.checked_mul(rows).ok_or(oob)?;
    let rgba_bytes = packed_bytes
        .checked_mul(RGBA_BYTES_PER_TEXEL / PACKED_BYTES_PER_TEXEL)
        .ok_or(oob)?;
    // A linear row copy needs every source row to hold a full output row;
    // otherwise the last row would index past the end of the source.
    if !key.tiled && packed_row_bytes > src_row_bytes {
        return Err(DecodeError::OutOfBounds);
    }
    let mut raw = read_guest_bytes(mem, key.base_address, total_bytes);
    if raw.len() < total_bytes {
        return Err(DecodeError::OutOfBounds);
    }
    // 16-bit word order is endian-swap-sensitive.
    apply_endian_32(&mut raw, key.endian);
    // Step 1: detile (bytes_per_block=2, tile in blocks=texels).
    let mut linear_u16 = vec![0u8; packed_bytes];
    if key.tiled {
        if tiled_address::detile_2d(&raw, &mut linear_u16, w, h, pitch_aligned, 2).is_err() {
            return Err(DecodeError::OutOfBounds);
        }
    } else {
        for (dst_row, src_row) in linear_u16
            .chunks_exact_mut(packed_row_bytes)
            .zip(raw.chunks_exact(src_row_bytes))
        {
            dst_row.copy_from_slice(&src_row[..packed_row_bytes]);
        }
    }
    // Step 2: expand each 16-bit RGB565 to Rgba8Unorm. Each texel is read
    // as a little-endian u16 from the bytes left by `apply_endian_32`.
    let mut rgba = vec![0u8; rgba_bytes];
    for (texel, px) in linear_u16
        .chunks_exact(PACKED_BYTES_PER_TEXEL)
        .zip(rgba.chunks_exact_mut(RGBA_BYTES_PER_TEXEL))
    {
        let word = u16::from_le_bytes([texel[0], texel[1]]);
        // 5 bits R (bits 11-15), 6 bits G (5-10), 5 bits B (0-4).
        let r5 = ((word >> 11) & 0x1F) as u8;
        let g6 = ((word >> 5) & 0x3F) as u8;
        let b5 = (word & 0x1F) as u8;
        // Expand to full-range u8 by replicating the high bits into the
        // low bits (so 0b11111 → 0xFF, the standard 565→888 convention).
        px.copy_from_slice(&[
            (r5 << 3) | (r5 >> 2),
            (g6 << 2) | (g6 >> 4),
            (b5 << 3) | (b5 >> 2),
            0xFF,
        ]);
    }
    Ok(rgba)
 }
 /// Version-aware CPU-side texture cache. Entries are keyed by
 /// [`TextureKey`] and remember the version value they were decoded at
 /// (`version_when_uploaded`). `ensure_cached` compares that watermark
 /// with the version supplied by the caller and re-decodes the texture
 /// when the supplied version is newer.
 pub struct TextureCache {
    entries: HashMap<TextureKey, CachedTexture>,
    /// Monotonic counter of decodes performed — HUD surface.
    pub decodes_total: u64,
    /// Count of stale-miss re-decodes.
    pub restale_total: u64,
 }
 impl Default for TextureCache {
    fn default() -> Self {
        Self::new()
    }
 }
 impl TextureCache {
    /// Create an empty cache with both counters at zero.
    pub fn new() -> Self {
        Self {
            entries: HashMap::new(),
            decodes_total: 0,
            restale_total: 0,
        }
    }
    /// Number of cached textures.
    pub fn len(&self) -> usize {
        self.entries.len()
    }
    /// True when nothing is cached.
    pub fn is_empty(&self) -> bool {
        self.entries.is_empty()
    }
    /// Look up an entry without decoding or checking freshness.
    pub fn get(&self, key: &TextureKey) -> Option<&CachedTexture> {
        self.entries.get(key)
    }
    /// Return a cached (or freshly-decoded) texture. The caller supplies
    /// the current guest-memory page version covering the texture span.
    ///
    /// An existing entry is reused when its watermark is at least
    /// `current_version`; otherwise the texture is decoded again and the
    /// entry replaced. Formats without a decoder yield
    /// [`DecodeError::UnsupportedFormat`]; decoder errors are passed
    /// through and leave any existing entry in place.
    pub fn ensure_cached(
        &mut self,
        key: TextureKey,
        current_version: u64,
        mem: &dyn xenia_memory::MemoryAccess,
    ) -> Result<&CachedTexture, DecodeError> {
        // Copy the watermark out first so the map is not borrowed while
        // the counters are updated.
        let cached_version = self.entries.get(&key).map(|e| e.version_when_uploaded);
        match cached_version {
            // Fast path: fresh entry exists.
            Some(version) if version >= current_version => {
                return Ok(&self.entries[&key]);
            }
            Some(_) => self.restale_total += 1,
            None => {}
        }
        let decoded = match key.format {
            TextureFormat::K8888 => decode_k8888_tiled(&key, mem),
            TextureFormat::K565 => decode_k565_tiled(&key, mem),
            TextureFormat::Dxt1 => decode_dxt1_tiled(&key, mem),
            TextureFormat::Dxt2_3 => decode_dxt23_tiled(&key, mem),
            TextureFormat::Dxt4_5 => decode_dxt45_tiled(&key, mem),
            _ => Err(DecodeError::UnsupportedFormat),
        };
        let bytes = decoded?;
        self.decodes_total += 1;
        let fresh = CachedTexture {
            key,
            version_when_uploaded: current_version,
            bytes,
        };
        // Store the entry and hand back a reference from the same slot.
        let stored = match self.entries.entry(key) {
            std::collections::hash_map::Entry::Occupied(mut slot) => {
                slot.insert(fresh);
                slot.into_mut()
            }
            std::collections::hash_map::Entry::Vacant(slot) => slot.insert(fresh),
        };
        Ok(stored)
    }
    /// Total number of decoded bytes held across all entries.
    pub fn byte_budget(&self) -> usize {
        self.entries.values().map(|entry| entry.bytes.len()).sum()
    }
 }
 #[cfg(test)]
 mod tests {
    use super::*;
    use std::cell::Cell;
    /// Flat in-memory stand-in for guest memory. Reads outside the buffer
    /// return 0 and writes outside it are ignored; multi-byte accessors
    /// are big-endian. `Cell` lets the `&self` write methods mutate it.
    struct FakeMem(Box<[Cell<u8>]>);
    impl FakeMem {
        fn from_vec(v: Vec<u8>) -> Self {
            FakeMem(v.into_iter().map(Cell::new).collect())
        }
    }
    impl xenia_memory::MemoryAccess for FakeMem {
        fn read_u8(&self, a: u32) -> u8 {
            self.0.get(a as usize).map(|c| c.get()).unwrap_or(0)
        }
        fn read_u16(&self, a: u32) -> u16 {
            u16::from_be_bytes([self.read_u8(a), self.read_u8(a + 1)])
        }
        fn read_u32(&self, a: u32) -> u32 {
            u32::from_be_bytes([
                self.read_u8(a),
                self.read_u8(a + 1),
                self.read_u8(a + 2),
                self.read_u8(a + 3),
            ])
        }
        fn read_u64(&self, a: u32) -> u64 {
            u64::from_be_bytes([
                self.read_u8(a),
                self.read_u8(a + 1),
                self.read_u8(a + 2),
                self.read_u8(a + 3),
                self.read_u8(a + 4),
                self.read_u8(a + 5),
                self.read_u8(a + 6),
                self.read_u8(a + 7),
            ])
        }
        fn write_u8(&self, a: u32, v: u8) {
            if let Some(slot) = self.0.get(a as usize) {
                slot.set(v);
            }
        }
        fn write_u16(&self, a: u32, v: u16) {
            let b = v.to_be_bytes();
            self.write_u8(a, b[0]);
            self.write_u8(a + 1, b[1]);
        }
        fn write_u32(&self, a: u32, v: u32) {
            let b = v.to_be_bytes();
            for i in 0..4 {
                self.write_u8(a + i as u32, b[i]);
            }
        }
        fn write_u64(&self, a: u32, v: u64) {
            let b = v.to_be_bytes();
            for i in 0..8 {
                self.write_u8(a + i as u32, b[i]);
            }
        }
        fn translate(&self, _: u32) -> Option<*const u8> {
            None
        }
        fn translate_mut(&self, _: u32) -> Option<*mut u8> {
            None
        }
    }
    #[test]
    fn format_block_info_matches_canary_expectations() {
        assert_eq!(
            TextureFormat::K8888.block_info(),
            BlockInfo::new(1, 1, 4)
        );
        assert_eq!(TextureFormat::Dxt1.block_info(), BlockInfo::new(4, 4, 8));
        assert_eq!(
            TextureFormat::Dxt4_5.block_info(),
            BlockInfo::new(4, 4, 16)
        );
    }
    #[test]
    fn endian_swap_variants() {
        assert_eq!(Endian::None.swap32(0x11223344), 0x11223344);
        assert_eq!(Endian::Swap8In16.swap32(0x11223344), 0x22114433);
        assert_eq!(Endian::Swap8In32.swap32(0x11223344), 0x44332211);
        assert_eq!(Endian::Swap16In32.swap32(0x11223344), 0x33441122);
    }
    #[test]
    fn decode_fetch_constant_rejects_empty() {
        let z = [0u32; 6];
        assert!(decode_fetch_constant(z).is_none());
    }
    #[test]
    fn decode_fetch_constant_parses_2d_k8888() {
        // Build a synthetic k_8_8_8_8 2D texture fetch constant:
        //   dword0: pitch_5=40 (1280/32), tiled=1, type=2
        //   dword1: format=6 (K8888), endian=2 (Swap8In32), base=0xAB000>>12
        //   dword2: width-1=1279, height-1=719
        //   dword5: dimension=1 (2D)
        let d0 = 0x8000_0000 | (40u32 << 22) | 2;
        let d1 = (0xAB000u32 >> 12 << 12) | (2u32 << 6) | 6u32;
        let d2 = 1279u32 | ((719u32) << 13);
        let d5 = 1u32 << 9;
        let k = decode_fetch_constant([d0, d1, d2, 0, 0, d5]).expect("parsed");
        assert_eq!(k.format, TextureFormat::K8888);
        assert_eq!(k.endian, Endian::Swap8In32);
        assert_eq!(k.width, 1280);
        assert_eq!(k.height, 720);
        assert_eq!(k.dimension, Dimension::D2);
        assert!(k.tiled);
        assert_eq!(k.pitch_texels, 1280);
    }
    #[test]
    fn decode_k8888_roundtrip_linear() {
        // Build a 4×4 non-tiled image with pitch=32 (one macro-tile row).
        // Each pixel at (x, y) stores ARGB = (0xFF, x, y, y*4+x) as a
        // big-endian dword. After Swap8In32 + B↔R swizzle, out[off..] must
        // be (x, y, y*4+x, 0xFF) in RGBA order.
        let w = 4u32;
        let h = 4u32;
        let pitch = 32u32;
        let mut bytes = vec![0u8; (pitch * h * 4) as usize];
        for y in 0..h {
            for x in 0..w {
                let off = ((y * pitch + x) * 4) as usize;
                let argb = (0xFFu32 << 24)
                    | ((x as u32) << 16)
                    | ((y as u32) << 8)
                    | ((y * 4 + x) as u32);
                bytes[off..off + 4].copy_from_slice(&argb.to_be_bytes());
            }
        }
        let mem = FakeMem::from_vec(bytes);
        let key = TextureKey {
            base_address: 0,
            width: 4,
            height: 4,
            depth_or_slices: 1,
            format: TextureFormat::K8888,
            endian: Endian::Swap8In32,
            dimension: Dimension::D2,
            tiled: false,
            pitch_texels: pitch as u16,
        };
        let out = decode_k8888_tiled(&key, &mem).expect("decode");
        assert_eq!(out.len(), 16 * 4);
        assert_eq!(&out[0..4], &[0, 0, 0, 0xFF]);
        let off = ((3 * 4 + 3) * 4) as usize;
        assert_eq!(&out[off..off + 4], &[3, 3, 15, 0xFF]);
    }
    // ── First-Pixels M5 format tests ──────────────────────────────
    /// BC2 (DXT2_3) roundtrip: 16-byte blocks, 4×4 image = 1 block.
    /// The first 16 source bytes hold 0, 1, …, 15; assert the decoder
    /// returns the same bytes (passthrough, since no endian swap is set).
    #[test]
    fn decode_dxt23_small_roundtrip() {
        // 4×4 texture = 1 BC2 block (16 bytes). With pitch_texels=32
        // (macro-tile-aligned) the block pitch is 8 (=32/4), and we
        // allocate 8*1*16 = 128 bytes of source.
        let mut bytes = vec![0u8; 128];
        for (i, b) in bytes.iter_mut().enumerate().take(16) {
            *b = i as u8;
        }
        let mem = FakeMem::from_vec(bytes);
        let key = TextureKey {
            base_address: 0,
            width: 4,
            height: 4,
            depth_or_slices: 1,
            format: TextureFormat::Dxt2_3,
            endian: Endian::None, // no swap — we can eyeball passthrough
            dimension: Dimension::D2,
            tiled: false,
            pitch_texels: 32,
        };
        let out = decode_dxt23_tiled(&key, &mem).expect("decode");
        assert_eq!(out.len(), 16); // 1 block × 16 bytes
        for i in 0..16 {
            assert_eq!(out[i], i as u8);
        }
    }
    /// BC3 (DXT4_5) uses the same 16-byte block infra as BC2; a
    /// parallel test prevents a regression that sneaks up via the
    /// generic `decode_dxt_tiled`.
    #[test]
    fn decode_dxt45_uses_16byte_blocks() {
        let mem = FakeMem::from_vec(vec![0xAAu8; 256]);
        let key = TextureKey {
            base_address: 0,
            width: 8,
            height: 4, // 2×1 blocks
            depth_or_slices: 1,
            format: TextureFormat::Dxt4_5,
            endian: Endian::None,
            dimension: Dimension::D2,
            tiled: false,
            pitch_texels: 32,
        };
        let out = decode_dxt45_tiled(&key, &mem).expect("decode");
        assert_eq!(out.len(), 2 * 16);
    }
    /// k_5_6_5: a single white texel (all bits set, 0xFFFF) should
    /// expand to RGBA8 white (0xFF, 0xFF, 0xFF, 0xFF). A single pure-red
    /// texel (R=31, G=0, B=0 → word 0xF800) should expand to R=255 G=0
    /// B=0 via the high-bit-replicate convention.
    #[test]
    fn decode_k565_texel_expansion() {
        // Memory layout for a 2×1 non-tiled k_5_6_5 image (pitch=32 texels
        // → 32 × 1 × 2 = 64 bytes). We store texel[0] = 0xFFFF (white),
        // texel[1] = 0xF800 (pure red).
        let mut bytes = vec![0u8; 64];
        // 0xFFFF
        bytes[0] = 0xFF;
        bytes[1] = 0xFF;
        // 0xF800: with Endian::None no swap is applied and the decoder
        // reads each texel as a little-endian u16, so memory must carry
        // the low byte (0x00) first and the high byte (0xF8) second.
        bytes[2] = 0x00;
        bytes[3] = 0xF8;
        let mem = FakeMem::from_vec(bytes);
        let key = TextureKey {
            base_address: 0,
            width: 2,
            height: 1,
            depth_or_slices: 1,
            format: TextureFormat::K565,
            endian: Endian::None,
            dimension: Dimension::D2,
            tiled: false,
            pitch_texels: 32,
        };
        let out = decode_k565_tiled(&key, &mem).expect("decode");
        assert_eq!(out.len(), 2 * 4);
        // Texel 0: white.
        assert_eq!(&out[0..4], &[0xFF, 0xFF, 0xFF, 0xFF]);
        // Texel 1: pure red via 5-bit-expand (0b11111 → 0xFF).
        assert_eq!(&out[4..8], &[0xFF, 0x00, 0x00, 0xFF]);
    }
    #[test]
    fn is_host_supported_covers_m5_formats() {
        assert!(TextureFormat::K8888.is_host_supported());
        assert!(TextureFormat::K565.is_host_supported());
        assert!(TextureFormat::Dxt1.is_host_supported());
        assert!(TextureFormat::Dxt2_3.is_host_supported());
        assert!(TextureFormat::Dxt4_5.is_host_supported());
        // Unsupported formats should still report false.
        assert!(!TextureFormat::K16.is_host_supported());
        assert!(!TextureFormat::K32Float.is_host_supported());
    }
    #[test]
    fn texture_cache_caches_and_reuses() {
        let mut cache = TextureCache::new();
        let mem = FakeMem::from_vec(vec![0u8; 8 * 1024]);
        let key = TextureKey {
            base_address: 0,
            width: 4,
            height: 4,
            depth_or_slices: 1,
            format: TextureFormat::K8888,
            endian: Endian::None,
            dimension: Dimension::D2,
            tiled: false,
            pitch_texels: 32,
        };
        cache.ensure_cached(key, 0, &mem).unwrap();
        assert_eq!(cache.decodes_total, 1);
        // Same version: should hit cache.
        cache.ensure_cached(key, 0, &mem).unwrap();
        assert_eq!(cache.decodes_total, 1);
        // Higher version: stale → re-decode.
        cache.ensure_cached(key, 1, &mem).unwrap();
        assert_eq!(cache.decodes_total, 2);
        assert_eq!(cache.restale_total, 1);
    }
    /// End-to-end P5 test: a 6-dword fetch constant → decoded `TextureKey`
    /// → `ensure_cached` on fresh/version-bumped memory → stale re-decode.
    /// Mirrors what `vd_swap` does per frame.
    #[test]
    fn e2e_fetch_const_to_cache_with_versioning() {
        // 4×4 k_8_8_8_8 2D tiled texture, pitch=32 aligned. The base field
        // has 4 KiB granularity, so `0x100 >> 12 << 12` is 0 and the
        // texture sits at guest address 0.
        let d0 = 0x8000_0000u32 | (1u32 << 22) | 2; // pitch_5=1, tiled, type=2
        let d1 = (0x100u32 >> 12 << 12) | (0u32 << 6) | 6; // K8888, endian=none
        let d2 = 3u32 | (3u32 << 13); // width-1=3, height-1=3
        let d5 = 1u32 << 9; // 2D
        let key = decode_fetch_constant([d0, d1, d2, 0, 0, d5]).expect("decoded");
        assert_eq!(key.format, TextureFormat::K8888);
        assert_eq!(key.width, 4);
        // No `mut`: the bytes are changed through the `Cell`s below.
        let mem = FakeMem::from_vec(vec![0xAAu8; 4 * 1024]);
        let mut cache = TextureCache::new();
        // v0 decode.
        let first = cache
            .ensure_cached(key, 0, &mem)
            .expect("initial decode")
            .clone();
        // Same version → cache hit.
        cache.ensure_cached(key, 0, &mem).expect("hit");
        assert_eq!(cache.decodes_total, 1);
        // Simulate the guest writing to the texture's pages: version bumps.
        for b in &mem.0[..16] {
            b.set(0xFF);
        }
        cache.ensure_cached(key, 1, &mem).expect("re-decode");
        assert_eq!(cache.decodes_total, 2);
        assert_eq!(cache.restale_total, 1);
        // Bytes differ from v0 (proof the re-decode happened).
        let second = cache.get(&key).unwrap();
        assert_ne!(first.bytes, second.bytes);
    }
    #[test]
    fn texture_cache_rejects_unsupported_format() {
        let mut cache = TextureCache::new();
        let mem = FakeMem::from_vec(vec![0u8; 1024]);
        let key = TextureKey {
            base_address: 0,
            width: 4,
            height: 4,
            depth_or_slices: 1,
            format: TextureFormat::K16,
            endian: Endian::None,
            dimension: Dimension::D2,
            tiled: false,
            pitch_texels: 32,
        };
        assert!(matches!(
            cache.ensure_cached(key, 0, &mem),
            Err(DecodeError::UnsupportedFormat)
        ));
    }
 }
--- a/crates/xenia-gpu/src/tiled_address.rs
+++ b/crates/xenia-gpu/src/tiled_address.rs
@@ -0,0 +1,178 @@
 //! Xenos tiled-texture address formula (2D, Tiled2D layout).
 //!
 //! Port of `xenia-canary/src/xenia/gpu/texture_address.h:190-208` Tiled2D /
 //! `TiledCombine` helpers. The Xbox 360 GPU stores textures in a 32×32-block
 //! macro-tile pattern with bank+pipe interleave for its internal DRAM
 //! banks; this formula inverts that so we can read pixels out in linear
 //! order, given the tiled source buffer.
 //!
 //! We use this in two places during P4:
 //!  - `vd_swap` frontbuffer detile (1280×720 k_8_8_8_8 usually).
 //!  - Any place we need to read tiled guest memory into a host-linear
 //!    buffer for CPU-side conversion before upload.
 /// Tile size constants from canary.
 ///
 /// log2 of the macro-tile width: `1 << 5` = 32 px.
 pub const MACRO_TILE_WIDTH_LOG2: u32 = 5; // 32 px
 /// log2 of the macro-tile height for 2D surfaces: `1 << 5` = 32 px.
 pub const MACRO_TILE_HEIGHT_2D_LOG2: u32 = 5; // 32 px
 /// Canary's `TiledCombine`: spreads the bits of the tile-relative byte
 /// offset around the bank / pipe / row-LSB interleave fields.
 ///
 /// Resulting bit layout, low to high:
 ///  - bits 0..=3   ← `outer_inner_bytes` bits 0..=3
 ///  - bit 4        ← `y_lsb`
 ///  - bit 5        ← `outer_inner_bytes` bit 4
 ///  - bits 6..     ← `pipe`
 ///  - bits 8..=10  ← `outer_inner_bytes` bits 5..=7
 ///  - bits 11..    ← `bank`
 ///  - bits 12..    ← `outer_inner_bytes` bits 8 and up
 ///
 /// `bank`, `pipe` and `y_lsb` are OR-ed in without masking;
 /// [`tiled_2d_offset`] passes values already limited to 1, 2 and 1 bits.
 #[inline]
 fn tiled_combine(outer_inner_bytes: u32, bank: u32, pipe: u32, y_lsb: u32) -> u32 {
    let low_nibble = outer_inner_bytes & 0b1111;
    let bit_4 = (outer_inner_bytes >> 4) & 0b1;
    let bits_5_to_7 = (outer_inner_bytes >> 5) & 0b111;
    let upper = outer_inner_bytes >> 8;
    low_nibble
        | (y_lsb << 4)
        | (bit_4 << 5)
        | (pipe << 6)
        | (bits_5_to_7 << 8)
        | (bank << 11)
        | (upper << 12)
 }
 /// Byte offset of block `(x, y)` inside a 2D tiled surface.
 ///
 /// * `pitch_aligned` — surface pitch in blocks; only its macro-tile count
 ///   (`pitch_aligned >> MACRO_TILE_WIDTH_LOG2`) is used, so callers pass
 ///   a value already rounded with [`align_pitch_to_macro_tile`].
 /// * `bytes_per_block_log2` — log2 of the block size in bytes (2 for a
 ///   4-byte RGBA8888 texel).
 ///
 /// The result is a `u32` where canary uses a signed `int`; all of the
 /// arithmetic here is unsigned.
 ///
 /// This mirrors canary's `Tiled2D` (`texture_address.h:190-208`) term
 /// for term. The mapping is a bit interleave, not a linear function of
 /// `x` / `y`, so keep every term when editing.
 pub fn tiled_2d_offset(x: u32, y: u32, pitch_aligned: u32, bytes_per_block_log2: u32) -> u32 {
    // Index of the macro tile containing (x, y), pre-shifted by 6 bits to
    // leave room for the 6-bit inner index below.
    let tiles_per_row = pitch_aligned >> MACRO_TILE_WIDTH_LOG2;
    let tile_row = y >> MACRO_TILE_HEIGHT_2D_LOG2;
    let tile_col = x >> MACRO_TILE_WIDTH_LOG2;
    let outer = ((tile_row * tiles_per_row) + tile_col) << 6;
    // Inner index: y bits 1..=3 above x bits 0..=2. Bit 0 of y is left
    // out here because it is routed separately as `y_lsb`.
    let inner = (((y >> 1) & 0b111) << 3) | (x & 0b111);
    let byte_index = (outer | inner) << bytes_per_block_log2;
    tiled_combine(
        byte_index,
        // bank: y bit 4
        (y >> 4) & 0b1,
        // pipe: x bits 3..=4, with the upper bit flipped by y bit 3
        ((x >> 3) & 0b11) ^ (((y >> 3) & 0b1) << 1),
        // y_lsb
        y & 1,
    )
 }
 /// Pad `pitch_pixels` up to the next multiple of the macro-tile width
 /// (`1 << MACRO_TILE_WIDTH_LOG2` = 32 px). Values that are already a
 /// multiple, including 0, come back unchanged.
 #[inline]
 pub fn align_pitch_to_macro_tile(pitch_pixels: u32) -> u32 {
    let tile_width = 1u32 << MACRO_TILE_WIDTH_LOG2;
    let low_bits = tile_width - 1;
    let bumped = pitch_pixels + low_bits;
    bumped & !low_bits
 }
 /// Detile a 2D tiled surface into a linear destination buffer.
 ///
 /// For every `(x, y)` in `width × height` the block at
 /// [`tiled_2d_offset`] in `src` is copied to the row-major position
 /// `(y * width + x) * bpp` in `dst`.
 ///
 /// * `pitch_pixels` — source pitch; rounded up to the macro-tile width
 ///   with [`align_pitch_to_macro_tile`] before use.
 /// * `bpp` — **bytes** per block (4 for RGBA8888, 2 for a 16-bit format).
 ///   Must be a non-zero power of two, because the address formula takes
 ///   log2 of the block size.
 /// * `dst` — must hold at least `width * height * bpp` bytes.
 ///
 /// # Errors
 ///
 /// Returns `Err(())` (defensive — callers can downgrade silently) when:
 ///  - `bpp` is zero or not a power of two;
 ///  - `dst` is shorter than `width * height * bpp` bytes (checked up
 ///    front, so `dst` is left untouched);
 ///  - a computed source offset falls outside `src` (blocks copied before
 ///    that point remain in `dst`).
 pub fn detile_2d(
    src: &[u8],
    dst: &mut [u8],
    width: u32,
    height: u32,
    pitch_pixels: u32,
    bpp: u32,
 ) -> Result<(), ()> {
    // `trailing_zeros` is only log2 for powers of two; anything else would
    // silently address the wrong blocks (and 0 would yield a shift of 32).
    if !bpp.is_power_of_two() {
        return Err(());
    }
    let bpp_log2 = bpp.trailing_zeros();
    let bpp_u = bpp as usize;
    let pitch_aligned = align_pitch_to_macro_tile(pitch_pixels);
    // Size arithmetic in `usize` with overflow checks rather than in `u32`.
    let dst_pitch_bytes = (width as usize).checked_mul(bpp_u).ok_or(())?;
    let dst_needed = dst_pitch_bytes.checked_mul(height as usize).ok_or(())?;
    if dst.len() < dst_needed {
        return Err(());
    }
    if dst_pitch_bytes == 0 {
        // Zero-width surface: nothing to copy (and `chunks_exact_mut(0)`
        // would panic).
        return Ok(());
    }
    for (y, row) in (0..height).zip(dst.chunks_exact_mut(dst_pitch_bytes)) {
        for (x, texel) in (0..width).zip(row.chunks_exact_mut(bpp_u)) {
            let src_off = tiled_2d_offset(x, y, pitch_aligned, bpp_log2) as usize;
            let src_end = src_off.checked_add(bpp_u).ok_or(())?;
            let block = src.get(src_off..src_end).ok_or(())?;
            texel.copy_from_slice(block);
        }
    }
    Ok(())
 }
 #[cfg(test)]
 mod tests {
    use super::*;
    /// Block (0, 0) sits at byte offset 0 (checked here with pitch 64 and
    /// 4-byte blocks).
    #[test]
    fn origin_is_zero() {
        let offset = tiled_2d_offset(0, 0, 64, 2);
        assert_eq!(offset, 0);
    }
    /// Fill a tiled buffer through `tiled_2d_offset`, detile it, and check
    /// that every texel shows up at its row-major position.
    #[test]
    fn roundtrip_small_pattern() {
        let (w, h, bpp) = (32u32, 16u32, 4u32);
        let pitch = align_pitch_to_macro_tile(w);
        // Recognisable per-texel pattern: (x, y, x^y, 0xFF).
        let texel = |x: u32, y: u32| [x as u8, y as u8, (x ^ y) as u8, 0xFF];
        let coords = || (0..h).flat_map(move |y| (0..w).map(move |x| (x, y)));
        // The tiled buffer must reach the end of the furthest block.
        let tiled_len = coords()
            .map(|(x, y)| tiled_2d_offset(x, y, pitch, 2) as usize + 4)
            .max()
            .unwrap();
        let mut tiled = vec![0u8; tiled_len];
        for (x, y) in coords() {
            let off = tiled_2d_offset(x, y, pitch, 2) as usize;
            tiled[off..off + 4].copy_from_slice(&texel(x, y));
        }
        let mut linear = vec![0u8; (w * h * bpp) as usize];
        detile_2d(&tiled, &mut linear, w, h, pitch, bpp).expect("detile ok");
        for (x, y) in coords() {
            let lin = ((y * w + x) * bpp) as usize;
            assert_eq!(&linear[lin..lin + 4], &texel(x, y)[..]);
        }
    }
    /// No two blocks of a 32×16 region may map to the same tiled offset.
    #[test]
    fn neighbouring_pixels_have_distinct_offsets() {
        let offsets: Vec<u32> = (0..16)
            .flat_map(|y| (0..32).map(move |x| tiled_2d_offset(x, y, 32, 2)))
            .collect();
        let unique: std::collections::HashSet<u32> = offsets.iter().copied().collect();
        assert_eq!(unique.len(), offsets.len());
    }
    /// Pitches round up to the next multiple of 32: 1280 stays 1280, 1281
    /// becomes 1312, 31 becomes 32.
    #[test]
    fn align_pitch_rounds_up_to_32() {
        for (input, expected) in [(1280u32, 1280u32), (1281, 1312), (31, 32)] {
            assert_eq!(align_pitch_to_macro_tile(input), expected);
        }
    }
 }
--- a/crates/xenia-gpu/src/translator.rs
+++ b/crates/xenia-gpu/src/translator.rs
@@ -0,0 +1,557 @@
 //! Xenos → WGSL direct translator (P7).
 //!
 //! Replaces the runtime uber-shader interpreter (P3b/P3c) for shaders whose
 //! feature set we cover. Emits a *standalone* WGSL module per shader
 //! instead of walking a ucode buffer at draw time — pipeline compilation
 //! happens once, then every subsequent dispatch is a direct `draw()`.
 //!
 //! The translator is deliberately narrow: when it encounters an opcode /
 //! fetch format / CF shape it doesn't know, it returns
 //! [`Translation::Reject`] and the caller falls back to the interpreter.
 //! This keeps the op-coverage work incremental — future commits can add
 //! one opcode at a time without invalidating the scaffolding.
 //!
 //! Current coverage (v1):
 //!  * Linear CF: `Exec`/`ExecEnd`, `Alloc`, `Exit`. No loops / branches /
 //!    calls / predicate-gated clauses.
 //!  * ALU vector: `ADD`, `MUL`, `MAX`, `MIN`, `MAD`, `DP4`, `DP3`,
 //!    `DP2_ADD`, `SEQ`, `SGT`, `SGE`, `SNE`, `FRC`, `FLOOR`.
 //!  * ALU scalar: `ADDS`, `ADDS_PREV`, `MULS`, `MULS_PREV`, `MAXS`, `MINS`,
 //!    `RCP`, `RETAIN_PREV`.
 //!  * Vertex fetch: `R32G32B32A32_FLOAT` only.
 //!  * Texture fetch: 2D via the single `@group(1)` slot (same one P5/M6
 //!    binds).
 //!  * Exports: VS writes position + interpolator 0 (color); PS writes
 //!    color0.
 //!
 //! When a shader exceeds this subset, [`translate`] returns
 //! [`Translation::Reject`] and `gpu.shader.translate_reject{reason}` is
 //! bumped by the caller.
 use crate::ucode::alu::{decode_alu, sop, vop, AluInstruction};
 use crate::ucode::control_flow::{AllocKind, ControlFlowInstruction};
 use crate::ucode::fetch::{decode_fetch, FetchInstruction};
 use crate::ucode::ParsedShader;
 /// Shader stage we're emitting for.
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub enum Stage {
    /// Vertex stage; emitted as the `@vertex` entry point `vs_main`.
    Vertex,
    /// Pixel stage; emitted as the `@fragment` entry point `fs_main`.
    Pixel,
 }
 /// Success or refusal from the translator. On refusal, the caller falls
 /// back to the runtime uber-shader interpreter.
 #[derive(Debug)]
 pub enum Translation {
    /// The emitted WGSL body for *this stage only*. Both VS + PS get
    /// wrapped into one module via [`combine_stages`].
    Ok(String),
    /// Translator saw an op/pattern it doesn't handle; fallback.
    /// The payload is one of the reason strings from the [`reject`] module.
    Reject(&'static str),
 }
 /// Assemble the complete WGSL module for a (VS, PS) pair, ready for
 /// `wgpu::Device::create_shader_module`.
 ///
 /// The output is `MODULE_HEADER`, then `vs_body`, then `ps_body`,
 /// concatenated with nothing in between, so bindings, struct
 /// declarations and helpers appear once and are shared by both stages.
 pub fn combine_stages(vs_body: &str, ps_body: &str) -> String {
    [MODULE_HEADER, vs_body, ps_body].concat()
 }
 /// Translate a single shader stage.
 ///
 /// Returns [`Translation::Ok`] with the WGSL entry function for `stage`,
 /// or [`Translation::Reject`] carrying a short reason string (see
 /// [`reject`]) as soon as an unsupported feature is met. The caller
 /// plumbs that reason into the `gpu.shader.translate_reject{reason}`
 /// metric.
 pub fn translate(parsed: &ParsedShader, stage: Stage) -> Translation {
    let mut ctx = EmitCtx::new(stage);
    match ctx.emit_stage_body(parsed) {
        Ok(()) => Translation::Ok(ctx.finish()),
        Err(reason) => Translation::Reject(reason),
    }
 }
 /// Reject reasons; kept as static &'str for zero-alloc metrics.
 pub mod reject {
    /// Vector ALU opcode outside the supported subset.
    pub const VEC_OP_UNSUPPORTED: &str = "vec_op_unsupported";
    /// Scalar ALU opcode outside the supported subset.
    pub const SCL_OP_UNSUPPORTED: &str = "scl_op_unsupported";
    /// `LoopStart` / `LoopEnd` control-flow clause.
    pub const CF_LOOP: &str = "cf_loop";
    /// `CondJmp` clause, or an `Exec` clause that is predicated.
    pub const CF_COND: &str = "cf_cond";
    /// `CondCall` / `Return` control-flow clause.
    pub const CF_CALL: &str = "cf_call";
    /// Control-flow clause the decoder reported as `Unknown`.
    pub const CF_UNKNOWN: &str = "cf_unknown";
    /// Fetch instruction the decoder reported as `Unknown`.
    pub const VFETCH_FMT: &str = "vfetch_fmt";
    /// Texture fetch whose `dimension` field is not the 2D value.
    pub const TFETCH_NON2D: &str = "tfetch_non2d";
    /// An `Exec` clause addresses an instruction triple that lies outside
    /// the shader's instruction words.
    pub const INSTR_OOB: &str = "instr_oob";
 }
 /// Shader-module preamble (bindings, helpers, struct defs). The bindings
 /// mirror the xenos pipeline's `@group(0)` + `@group(1)` layout from P5/M6
 /// so we can use **the same bind-group slots** — only the pipeline object
 /// differs between interpreter mode and translator mode.
 ///
 /// Also declares the `VsOut` / `FsOut` structs returned by the emitted
 /// `vs_main` / `fs_main` entry points, and `xe_rcp`, which yields `0.0`
 /// for a zero input instead of dividing. [`combine_stages`] places this
 /// text ahead of the two stage bodies.
 const MODULE_HEADER: &str = r#"
 struct XenosDrawConstants {
    draw_index: u32,
    vertex_count: u32,
    prim_kind: u32,
    _pad: u32,
 };
 struct XenosConstants {
    alu:         array<vec4<f32>, 512>,
    fetch:       array<u32, 256>,
    bool_consts: array<u32, 8>,
    loop_consts: array<u32, 32>,
 };
@group(0) @binding(0) var<uniform>       draw_ctx      : XenosDrawConstants;
@group(0) @binding(1) var<storage, read> xenos_consts  : XenosConstants;
@group(0) @binding(2) var<storage, read> vs_ucode      : array<u32>;
@group(0) @binding(3) var<storage, read> ps_ucode      : array<u32>;
@group(0) @binding(4) var<storage, read> vertex_buffer : array<u32>;
@group(1) @binding(0) var xenos_tex  : texture_2d<f32>;
@group(1) @binding(1) var xenos_samp : sampler;
 struct VsOut {
    @builtin(position) position: vec4<f32>,
    @location(0) color: vec4<f32>,
 };
 struct FsOut {
    @location(0) color0: vec4<f32>,
 };
 // Helper: reciprocal guarded against divide-by-zero.
 fn xe_rcp(x: f32) -> f32 {
    return select(0.0, 1.0 / x, x != 0.0);
 }
 "#;
 /// Accumulates the WGSL text for one shader stage while it is emitted.
 struct EmitCtx {
    /// Stage being emitted; selects the entry point, seeds and export
    /// targets.
    stage: Stage,
    /// WGSL text emitted so far.
    out: String,
    /// Current nesting depth; each level prefixes a pushed line with four
    /// spaces.
    indent: usize,
 }
 impl EmitCtx {
    /// Start an empty emitter for `stage` at indent level 0, with room
    /// reserved for a typical shader body.
    fn new(stage: Stage) -> Self {
        let out = String::with_capacity(2048);
        Self {
            stage,
            out,
            indent: 0,
        }
    }
    fn finish(self) -> String {
        self.out
    }
    /// Append `s` as one output line: four spaces per indent level, the
    /// text itself, then a newline.
    fn push(&mut self, s: &str) {
        self.out.extend(std::iter::repeat("    ").take(self.indent));
        self.out.push_str(s);
        self.out.push('\n');
    }
    /// Emit the whole entry function for `self.stage`: signature, local
    /// state, the translated control-flow clauses, and the return block.
    ///
    /// Clauses are walked in order. `Alloc` records which export bank the
    /// following exports target; an `Exec` with `is_end` set or an `Exit`
    /// stops the walk. Any other clause kind, or a predicated `Exec`,
    /// aborts with the matching [`reject`] reason (the partially written
    /// output is then meaningless).
    fn emit_stage_body(&mut self, parsed: &ParsedShader) -> Result<(), &'static str> {
        // Entry-point attribute + signature, written at the current indent.
        let (attribute, signature) = match self.stage {
            Stage::Vertex => (
                "@vertex",
                "fn vs_main(@builtin(vertex_index) vidx: u32) -> VsOut {",
            ),
            Stage::Pixel => ("@fragment", "fn fs_main(in: VsOut) -> FsOut {"),
        };
        self.push(attribute);
        self.push(signature);
        self.indent = 1;
        // Per-invocation state: the 128-entry register file (zeroed) and
        // the running previous-scalar value. Plain function-local `var`s
        // suffice because nothing is shared across function calls.
        let locals: &[&str] = &[
            "var r: array<vec4<f32>, 128>;",
            "for (var i = 0u; i < 128u; i = i + 1u) { r[i] = vec4<f32>(0.0); }",
            "var ps: f32 = 0.0;",
        ];
        for line in locals {
            self.push(line);
        }
        // Stage-specific seeds and export slots. The vertex stage seeds
        // r0.x with the vertex index and declares `opos` / `ocolor` (same
        // layout as the interpreter, so both paths render alike); the
        // pixel stage seeds r0 and the colour export with the interpolated
        // colour so trivial shaders still produce something.
        let seeds: &[&str] = match self.stage {
            Stage::Vertex => &[
                "r[0] = vec4<f32>(f32(vidx), 0.0, 0.0, 1.0);",
                "var opos: vec4<f32> = vec4<f32>(0.0, 0.0, 0.0, 1.0);",
                "var ocolor: vec4<f32> = vec4<f32>(1.0, 1.0, 1.0, 1.0);",
            ],
            Stage::Pixel => &["r[0] = in.color;", "var ocolor0: vec4<f32> = in.color;"],
        };
        for line in seeds {
            self.push(line);
        }
        let mut active_alloc = AllocKind::Other;
        for clause in &parsed.cf {
            match clause {
                ControlFlowInstruction::Alloc { kind, .. } => {
                    active_alloc = *kind;
                }
                ControlFlowInstruction::Exit => break,
                ControlFlowInstruction::Exec {
                    address,
                    count,
                    sequence,
                    is_end,
                    predicated,
                    ..
                } => {
                    if *predicated {
                        return Err(reject::CF_COND);
                    }
                    self.emit_exec(parsed, *address, *count, *sequence, active_alloc)?;
                    if *is_end {
                        break;
                    }
                }
                ControlFlowInstruction::CondJmp { .. } => return Err(reject::CF_COND),
                ControlFlowInstruction::LoopStart { .. }
                | ControlFlowInstruction::LoopEnd { .. } => return Err(reject::CF_LOOP),
                ControlFlowInstruction::CondCall { .. } | ControlFlowInstruction::Return => {
                    return Err(reject::CF_CALL);
                }
                ControlFlowInstruction::Unknown { .. } => return Err(reject::CF_UNKNOWN),
            }
        }
        // Copy the export slots into the stage's return struct.
        let epilogue: &[&str] = match self.stage {
            Stage::Vertex => &[
                "var out: VsOut;",
                "out.position = opos;",
                "out.color = ocolor;",
                "return out;",
            ],
            Stage::Pixel => &["var out: FsOut;", "out.color0 = ocolor0;", "return out;"],
        };
        for line in epilogue {
            self.push(line);
        }
        self.indent = 0;
        self.push("}");
        Ok(())
    }
    /// Emit the `count` instructions of one `Exec` clause.
    ///
    /// Instruction `i` of the clause is the 3-dword triple starting at
    /// word `(address + i) * 3` of `parsed.instructions`. Each triple is
    /// decoded as a fetch or an ALU instruction according to its flag in
    /// `sequence`, then handed to the matching emitter. `current_alloc` is
    /// the export bank selected by the most recent `Alloc` clause.
    ///
    /// # Errors
    ///
    /// * [`reject::INSTR_OOB`] — the triple is not fully inside
    ///   `parsed.instructions`.
    /// * [`reject::TFETCH_NON2D`] — texture fetch with `dimension != 1`.
    /// * [`reject::VFETCH_FMT`] — the fetch decoder returned `Unknown`.
    /// * Whatever `emit_vfetch` / `emit_alu` report.
    fn emit_exec(
        &mut self,
        parsed: &ParsedShader,
        address: u32,
        count: u32,
        sequence: u32,
        current_alloc: AllocKind,
    ) -> Result<(), &'static str> {
        for i in 0..count {
            let triple_idx = address as usize + i as usize;
            let base = triple_idx * 3;
            // Need three whole words starting at `base`.
            let words = match parsed.instructions.get(base..) {
                Some([w0, w1, w2, ..]) => [*w0, *w1, *w2],
                _ => return Err(reject::INSTR_OOB),
            };
            // `sequence` holds two flag bits per instruction, so a 32-bit
            // word only describes the first 16. Later instructions read as
            // flag = 0 (ALU) rather than overflowing the shift, which would
            // panic in debug builds and wrap in release builds.
            //
            // NOTE(review): this tests bit `2*i + 1`. xenia-canary's
            // translator treats bit `2*i` as the ALU/fetch selector and bit
            // `2*i + 1` as the serialize flag — confirm that the CF decoder
            // producing `sequence` matches the bit chosen here.
            let is_fetch = i < 16 && ((sequence >> (i * 2 + 1)) & 1) != 0;
            if is_fetch {
                match decode_fetch(words) {
                    FetchInstruction::Vertex(vf) => self.emit_vfetch(&vf)?,
                    FetchInstruction::Texture(tf) => {
                        if tf.dimension != 1 {
                            return Err(reject::TFETCH_NON2D);
                        }
                        self.emit_tfetch(&tf);
                    }
                    FetchInstruction::Unknown { .. } => return Err(reject::VFETCH_FMT),
                }
            } else {
                let alu = decode_alu(words);
                self.emit_alu(&alu, current_alloc)?;
            }
        }
        Ok(())
    }
    /// Emit one co-issued ALU instruction: the vector op first (only when
    /// its write mask is non-zero), then the scalar op.
    ///
    /// All three sources are read as plain registers `r[n]` (index masked
    /// to 7 bits). The scalar op always updates `ps`; its result is also
    /// splatted into `r[scalar_dest]` when the scalar write mask is
    /// non-zero.
    ///
    /// # Errors
    ///
    /// [`reject::VEC_OP_UNSUPPORTED`] / [`reject::SCL_OP_UNSUPPORTED`] when
    /// the respective opcode has no WGSL mapping. The scalar opcode is
    /// checked even if the scalar write mask is zero.
    fn emit_alu(
        &mut self,
        alu: &AluInstruction,
        current_alloc: AllocKind,
    ) -> Result<(), &'static str> {
        let reg = |index: u8| format!("r[{}u]", index & 0x7F);
        let (a, b, c) = (reg(alu.src_a), reg(alu.src_b), reg(alu.src_c));
        // Vector pipe.
        if alu.vector_write_mask != 0 {
            let expr = match vector_expr(alu.vector_opcode, &a, &b, &c) {
                Some(expr) => expr,
                None => return Err(reject::VEC_OP_UNSUPPORTED),
            };
            let dst_reg = alu.vector_dest & 0x7F;
            if alu.vector_dest_is_export {
                self.emit_export(dst_reg, current_alloc, &expr, alu.vector_write_mask);
            } else {
                let target = format!("r[{dst_reg}u]");
                self.emit_masked_write(&target, &expr, alu.vector_write_mask);
            }
        }
        // Scalar pipe. The first operand is either the running `ps` value
        // or src_a.x (mirroring the interpreter's `scalar_src_is_ps`
        // selector); the second operand is src_b.x; the `_PREV` opcodes
        // combine the first operand with `ps`.
        let first_operand = if alu.scalar_src_is_ps {
            String::from("ps")
        } else {
            format!("{a}.x")
        };
        let second_operand = format!("{b}.x");
        let expr = match scalar_expr(alu.scalar_opcode, &first_operand, &second_operand, "ps") {
            Some(expr) => expr,
            None => return Err(reject::SCL_OP_UNSUPPORTED),
        };
        self.push(&format!("ps = {expr};"));
        if alu.scalar_write_mask != 0 {
            let dst_reg = alu.scalar_dest & 0x7F;
            let target = format!("r[{dst_reg}u]");
            self.emit_masked_write(&target, "vec4<f32>(ps, ps, ps, ps)", alu.scalar_write_mask);
        }
        Ok(())
    }
    /// Emit `lhs = rhs` restricted to the lanes selected by `mask`
    /// (bit 0 = x … bit 3 = w).
    ///
    /// A full mask (`0xF`) becomes a single assignment. Otherwise a scoped
    /// block captures the old and new values and rebuilds the vector lane
    /// by lane, taking unselected lanes from the old value.
    fn emit_masked_write(&mut self, lhs: &str, rhs: &str, mask: u8) {
        if mask == 0xF {
            self.push(&format!("{lhs} = {rhs};"));
            return;
        }
        let lanes: Vec<String> = ['x', 'y', 'z', 'w']
            .iter()
            .enumerate()
            .map(|(bit, lane)| {
                let source = if (mask >> bit) & 1 == 1 {
                    "_new"
                } else {
                    "_prev"
                };
                format!("{source}.{lane}")
            })
            .collect();
        self.push("{");
        self.indent += 1;
        self.push(&format!("let _prev = {lhs};"));
        self.push(&format!("let _new = {rhs};"));
        self.push(&format!("{lhs} = vec4<f32>({});", lanes.join(", ")));
        self.indent -= 1;
        self.push("}");
    }
    /// Emit a masked write of `expr` to the export slot chosen by the
    /// current stage and `alloc`.
    ///
    /// Xenos normally indexes exports as (alloc base + offset); the CF
    /// stream here does not carry per-export slot offsets cleanly, so the
    /// most recent `Alloc` kind alone picks the target:
    ///  - vertex stage, `Position` alloc → `opos`
    ///  - vertex stage, any other alloc  → `ocolor`
    ///  - pixel stage, any alloc         → `ocolor0`
    ///
    /// `dst_reg` is currently ignored; per-slot export indexing is reserved
    /// for a richer v2.
    fn emit_export(&mut self, dst_reg: u8, alloc: AllocKind, expr: &str, mask: u8) {
        let _ = dst_reg;
        let target = match (self.stage, alloc) {
            (Stage::Vertex, AllocKind::Position) => "opos",
            (Stage::Vertex, _) => "ocolor",
            (Stage::Pixel, _) => "ocolor0",
        };
        self.emit_masked_write(target, expr, mask);
    }
    /// Emit a vertex fetch as one scoped WGSL block that loads four
    /// consecutive dwords from `vertex_buffer` into `r[dest]`, bit-cast to
    /// `f32`.
    ///
    /// The emitted code reads a dword from `xenos_consts.fetch`, clears
    /// its low two bits and shifts right by 2 to get a base index, adds
    /// `u32(r[src].x) * 4`, and performs the load only when all four
    /// dwords lie inside the buffer; otherwise `r[dest]` keeps its
    /// previous value.
    ///
    /// Always returns `Ok(())` in this version.
    fn emit_vfetch(&mut self, vf: &crate::ucode::fetch::VertexFetch) -> Result<(), &'static str> {
        // v1: treat all vertex fetches as R32G32B32A32_FLOAT, stride = 4
        // dwords. Matches the interpreter's MVP semantics; unlocks more
        // formats alongside the CPU texture cache's format expansion.
        // NOTE(review): this takes bits 5..=9 of word 0 as the fetch
        // constant index — confirm against the vertex-fetch field map in
        // `ucode.h` that these bits are the constant index and not part of
        // the source-register field.
        let fetch_const = (vf.raw[0] >> 5) & 0x1F;
        let src_reg = vf.src_register & 0x7F;
        let dst_reg = vf.dest_register & 0x7F;
        // The single positional `{}` below receives `fetch_const * 2`, the
        // dword index into `xenos_consts.fetch`.
        self.push(&format!(
            "{{ let fc0 = xenos_consts.fetch[{}u]; \
             let base = (fc0 & 0xFFFFFFFCu) >> 2u; \
             let vidx = u32(r[{src_reg}u].x); \
             let addr = base + vidx * 4u; \
             let n = arrayLength(&vertex_buffer); \
             if (addr + 3u < n) {{ \
                 r[{dst_reg}u] = vec4<f32>( \
                     bitcast<f32>(vertex_buffer[addr + 0u]), \
                     bitcast<f32>(vertex_buffer[addr + 1u]), \
                     bitcast<f32>(vertex_buffer[addr + 2u]), \
                     bitcast<f32>(vertex_buffer[addr + 3u])); \
             }} }}",
            fetch_const * 2,
        ));
        Ok(())
    }
    /// Emit a 2D texture fetch: sample the single bound texture
    /// (`xenos_tex` / `xenos_samp`) at mip level 0 with UV = `r[src].xy`
    /// and store the texel in `r[dest]`.
    ///
    /// v1 ignores the instruction's fetch-constant slot; whichever texture
    /// is bound at `@group(1)` is the one sampled.
    fn emit_tfetch(&mut self, tf: &crate::ucode::fetch::TextureFetch) {
        let (dst_reg, src_reg) = (tf.dest_register & 0x7F, tf.src_register & 0x7F);
        let statement = format!(
            "r[{dst_reg}u] = textureSampleLevel(xenos_tex, xenos_samp, r[{src_reg}u].xy, 0.0);"
        );
        self.push(&statement);
    }
 }
 /// Build the WGSL `vec4<f32>` expression for vector opcode `op` applied
 /// to the operand expressions `a`, `b`, `c`.
 ///
 /// Dot products are splatted across all four lanes; the `SEQ` / `SGT` /
 /// `SGE` / `SNE` comparisons yield 1.0 or 0.0 per lane. Returns `None`
 /// for any opcode outside the supported subset.
 fn vector_expr(op: u8, a: &str, b: &str, c: &str) -> Option<String> {
    // Per-lane comparison producing 1.0 where `a <cmp> b` holds, else 0.0.
    let per_lane = |cmp: &str| {
        let lanes: Vec<String> = ["x", "y", "z", "w"]
            .iter()
            .map(|lane| format!("select(0.0,1.0,{a}.{lane}{cmp}{b}.{lane})"))
            .collect();
        format!("vec4<f32>({})", lanes.join(", "))
    };
    let text = match op {
        vop::ADD => format!("({a} + {b})"),
        vop::MUL => format!("({a} * {b})"),
        vop::MAD => format!("({a} * {b} + {c})"),
        vop::MAX => format!("max({a}, {b})"),
        vop::MIN => format!("min({a}, {b})"),
        vop::FRC => format!("fract({a})"),
        vop::FLOOR => format!("floor({a})"),
        vop::DOT4 => format!("vec4<f32>(dot({a}, {b}))"),
        vop::DOT3 => format!("vec4<f32>(dot({a}.xyz, {b}.xyz))"),
        vop::DOT2_ADD => format!("vec4<f32>({a}.x * {b}.x + {a}.y * {b}.y + {c}.x)"),
        vop::SEQ => per_lane("=="),
        vop::SGT => per_lane(">"),
        vop::SGE => per_lane(">="),
        vop::SNE => per_lane("!="),
        _ => return None,
    };
    Some(text)
 }
 /// Build the WGSL `f32` expression for scalar opcode `op`.
 ///
 /// `a` and `b` are the two scalar operand expressions and `prev` is the
 /// expression for the previous scalar result; the `_PREV` opcodes
 /// combine `a` with `prev` instead of `b`. Returns `None` for any opcode
 /// outside the supported subset.
 fn scalar_expr(op: u8, a: &str, b: &str, prev: &str) -> Option<String> {
    match op {
        sop::ADDS => Some(format!("({a} + {b})")),
        sop::ADDS_PREV => Some(format!("({a} + {prev})")),
        sop::MULS => Some(format!("({a} * {b})")),
        sop::MULS_PREV => Some(format!("({a} * {prev})")),
        sop::MAXS => Some(format!("max({a}, {b})")),
        sop::MINS => Some(format!("min({a}, {b})")),
        sop::RCP => Some(format!("xe_rcp({a})")),
        sop::RETAIN_PREV => Some(prev.to_owned()),
        _ => None,
    }
 }
 #[cfg(test)]
 mod tests {
    use super::*;
    use crate::ucode::alu::{sop, vop};
    use crate::ucode::control_flow::ControlFlowInstruction;
    /// Word 2 of an ALU triple as these tests pack it: the given vector
    /// opcode, scalar op `RETAIN_PREV`, a full vector write mask, and
    /// vector_dest = 0.
    fn alu_word2(vector_opcode: u32) -> u32 {
        vector_opcode | ((sop::RETAIN_PREV as u32) << 6) | (0xF << 12)
    }
    /// Unpredicated terminal `Exec` clause covering the single instruction
    /// at address 0, flagged as ALU (`sequence` = 0).
    fn single_alu_exec() -> ControlFlowInstruction {
        ControlFlowInstruction::Exec {
            address: 0,
            count: 1,
            sequence: 0,
            is_end: true,
            predicated: false,
            predicate_condition: false,
        }
    }
    /// Translate `shader` for `stage`, panicking (message prefixed with
    /// `label`) if the translator rejects it.
    fn expect_ok(shader: &ParsedShader, stage: Stage, label: &str) -> String {
        match translate(shader, stage) {
            Translation::Ok(body) => body,
            Translation::Reject(r) => panic!("{label}rejected: {r}"),
        }
    }
    /// One-instruction shader: `r0 = r0 + r0` on the vector pipe, no scalar
    /// write. An `Alloc(Position)` precedes the `Exec`, so an export (if
    /// the ALU op were one) would target oPos.
    fn synthetic_trivial_shader() -> ParsedShader {
        ParsedShader {
            cf: vec![
                ControlFlowInstruction::Alloc {
                    size: 1,
                    kind: AllocKind::Position,
                },
                single_alu_exec(),
            ],
            instructions: vec![0, 0, alu_word2(vop::ADD as u32)],
        }
    }
    #[test]
    fn trivial_shader_translates() {
        let body = expect_ok(&synthetic_trivial_shader(), Stage::Vertex, "");
        for needle in ["fn vs_main", "r[0u] = (r[0u] + r[0u]);", "return out;"] {
            assert!(body.contains(needle));
        }
    }
    #[test]
    fn combined_module_parses_as_wgsl() {
        let shader = synthetic_trivial_shader();
        let vs = expect_ok(&shader, Stage::Vertex, "VS ");
        let ps = expect_ok(&shader, Stage::Pixel, "PS ");
        let module = combine_stages(&vs, &ps);
        // naga is pinned as a dev-dep in this crate; a parse failure here
        // means the translator emitted invalid WGSL.
        if let Err(e) = naga::front::wgsl::parse_str(&module) {
            panic!(
                "emitted WGSL failed to parse:\n{}\n--- module ---\n{}",
                e, module
            );
        }
    }
    #[test]
    fn loop_clause_rejected() {
        let shader = ParsedShader {
            cf: vec![ControlFlowInstruction::LoopStart {
                address: 0,
                loop_id: 0,
            }],
            instructions: vec![],
        };
        let outcome = translate(&shader, Stage::Vertex);
        assert!(matches!(outcome, Translation::Reject(reject::CF_LOOP)));
    }
    #[test]
    fn unsupported_op_rejected() {
        // 29 = VOP_MAX_A, not in the v1 subset.
        let shader = ParsedShader {
            cf: vec![single_alu_exec()],
            instructions: vec![0, 0, alu_word2(29u32)],
        };
        let outcome = translate(&shader, Stage::Vertex);
        assert!(matches!(
            outcome,
            Translation::Reject(reject::VEC_OP_UNSUPPORTED)
        ));
    }
 }
--- a/crates/xenia-gpu/src/ucode/alu.rs
+++ b/crates/xenia-gpu/src/ucode/alu.rs
@@ -0,0 +1,206 @@
 //! Xenos ALU (vector + scalar) instruction decoder.
 //!
 //! An ALU instruction is 96 bits = 3 dwords. The three dwords encode:
 //!   - word0: operand modifier flags + destination info
 //!   - word1: source register / swizzle fields
 //!   - word2: opcode + write mask + export target
 //!
 //! See `ucode.h:900-1400` for the full field map. This decoder captures the
 //! minimal shape the uber-shader needs; flags we don't interpret yet are
 //! retained as raw bits in `raw` for downstream inspection.
 /// Decoded ALU instruction.
 ///
 /// Filled in by [`decode_alu`]. Each field is a plain bit-field extraction
 /// from word0 or word2 of the instruction; the three dwords are also kept
 /// verbatim in `raw`. Bit positions quoted below are the ones `decode_alu`
 /// reads.
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub struct AluInstruction {
    /// Vector ALU opcode (6 bits, word2 bits 0..=5).
    pub vector_opcode: u8,
    /// Scalar ALU opcode (6 bits, word2 bits 6..=11).
    pub scalar_opcode: u8,
    /// Destination register index for vector result (7 bits, word2 bits
    /// 16..=22).
    pub vector_dest: u8,
    /// Destination register index for scalar result (7 bits, word2 bits
    /// 24..=30).
    pub scalar_dest: u8,
    /// 4-bit write mask for the vector result (x/y/z/w), word2 bits 12..=15.
    pub vector_write_mask: u8,
    /// 4-bit write mask for the scalar result, word2 bits 8..=11.
    ///
    /// NOTE(review): as decoded, these bits overlap the scalar opcode field
    /// (word2 bits 6..=11) — confirm the intended position.
    pub scalar_write_mask: u8,
    /// Set when the instruction should write to the export bank (position,
    /// interpolators, color, etc.) instead of the general register file.
    /// Read from word2 bit 23.
    pub vector_dest_is_export: bool,
    /// Selects `ps` (previous scalar result) as the scalar operand when set.
    /// Read from word0 bit 26.
    pub scalar_src_is_ps: bool,
    /// Source register indices (at most 3 for vector ops). Each is one byte
    /// of word0: `src_a` = bits 0..=7, `src_b` = bits 8..=15, `src_c` = bits
    /// 16..=23.
    pub src_a: u8,
    pub src_b: u8,
    pub src_c: u8,
    /// Set when the instruction is predicated; skipped if the predicate
    /// doesn't match `predicate_condition`. Read from word0 bit 27.
    pub predicated: bool,
    /// Predicate value required for a predicated instruction to run. Read
    /// from word0 bit 28.
    pub predicate_condition: bool,
    /// Raw dwords — preserved verbatim so the translator / interpreter can
    /// reach into fields we haven't parsed explicitly yet.
    pub raw: [u32; 3],
 }
 /// Decode a 3-dword ALU triple into an [`AluInstruction`].
 ///
 /// Opcodes, write masks, destination indices and the export flag come from
 /// word2; source indices, the `ps` select and the predicate bits come from
 /// word0. Word1 is not interpreted here and stays reachable through `raw`.
 ///
 /// NOTE(review): the scalar write mask (word2 bits 8..=11) overlaps the
 /// scalar opcode (word2 bits 6..=11), and the module header describes word0
 /// as carrying destination info while this decoder reads destinations from
 /// word2 — confirm the field positions against `ucode.h`.
 pub fn decode_alu(words: [u32; 3]) -> AluInstruction {
    // `width`-bit field starting at bit `lsb`; no field is wider than 8 bits.
    let field = |word: u32, lsb: u32, width: u32| ((word >> lsb) & ((1u32 << width) - 1)) as u8;
    // Single-bit flag at position `bit`.
    let flag = |word: u32, bit: u32| (word >> bit) & 1 != 0;
    let [w0, _, w2] = words;
    AluInstruction {
        vector_opcode: field(w2, 0, 6),
        scalar_opcode: field(w2, 6, 6),
        vector_dest: field(w2, 16, 7),
        scalar_dest: field(w2, 24, 7),
        vector_write_mask: field(w2, 12, 4),
        scalar_write_mask: field(w2, 8, 4),
        vector_dest_is_export: flag(w2, 23),
        scalar_src_is_ps: flag(w0, 26),
        src_a: field(w0, 0, 8),
        src_b: field(w0, 8, 8),
        src_c: field(w0, 16, 8),
        predicated: flag(w0, 27),
        predicate_condition: flag(w0, 28),
        raw: words,
    }
 }
 /// Vector ALU opcodes we reference by name. Values match canary's
 /// `AluVectorOpcode` enum in `ucode.h:1354`.
 ///
 /// The named values are contiguous: 0 (`ADD`) through 29 (`MAX_A`). All of
 /// them fit in the 6-bit `vector_opcode` field produced by `decode_alu`.
 pub mod vop {
    pub const ADD: u8 = 0;
    pub const MUL: u8 = 1;
    pub const MAX: u8 = 2;
    pub const MIN: u8 = 3;
    pub const SEQ: u8 = 4;
    pub const SGT: u8 = 5;
    pub const SGE: u8 = 6;
    pub const SNE: u8 = 7;
    pub const FRC: u8 = 8;
    pub const TRUNC: u8 = 9;
    pub const FLOOR: u8 = 10;
    pub const MAD: u8 = 11;
    pub const CND_EQ: u8 = 12;
    pub const CND_GE: u8 = 13;
    pub const CND_GT: u8 = 14;
    pub const DOT4: u8 = 15;
    pub const DOT3: u8 = 16;
    pub const DOT2_ADD: u8 = 17;
    pub const CUBE: u8 = 18;
    pub const MAX4: u8 = 19;
    pub const SETP_EQ_PUSH: u8 = 20;
    pub const SETP_NE_PUSH: u8 = 21;
    pub const SETP_GT_PUSH: u8 = 22;
    pub const SETP_GE_PUSH: u8 = 23;
    pub const KILL_EQ: u8 = 24;
    pub const KILL_GT: u8 = 25;
    pub const KILL_GE: u8 = 26;
    pub const KILL_NE: u8 = 27;
    pub const DST: u8 = 28;
    pub const MAX_A: u8 = 29;
 }
 /// Scalar ALU opcodes. Values match canary's `AluScalarOpcode` enum in
 /// `ucode.h:1001`.
 ///
 /// The named values run 0..=40 and 42..=50; value 41 has no named constant
 /// in this table. All of them fit in the 6-bit `scalar_opcode` field
 /// produced by `decode_alu`.
 pub mod sop {
    pub const ADDS: u8 = 0;
    pub const ADDS_PREV: u8 = 1;
    pub const MULS: u8 = 2;
    pub const MULS_PREV: u8 = 3;
    pub const MULS_PREV2: u8 = 4;
    pub const MAXS: u8 = 5;
    pub const MINS: u8 = 6;
    pub const SEQS: u8 = 7;
    pub const SGTS: u8 = 8;
    pub const SGES: u8 = 9;
    pub const SNES: u8 = 10;
    pub const FRCS: u8 = 11;
    pub const TRUNCS: u8 = 12;
    pub const FLOORS: u8 = 13;
    pub const EXP: u8 = 14;
    pub const LOGC: u8 = 15;
    pub const LOG: u8 = 16;
    pub const RCPC: u8 = 17;
    pub const RCPF: u8 = 18;
    pub const RCP: u8 = 19;
    pub const RSQC: u8 = 20;
    pub const RSQF: u8 = 21;
    pub const RSQ: u8 = 22;
    pub const MAXAS: u8 = 23;
    pub const MAXASF: u8 = 24;
    pub const SUBS: u8 = 25;
    pub const SUBS_PREV: u8 = 26;
    pub const SETP_EQ: u8 = 27;
    pub const SETP_NE: u8 = 28;
    pub const SETP_GT: u8 = 29;
    pub const SETP_GE: u8 = 30;
    pub const SETP_INV: u8 = 31;
    pub const SETP_POP: u8 = 32;
    pub const SETP_CLR: u8 = 33;
    pub const SETP_RSTR: u8 = 34;
    pub const KILLS_EQ: u8 = 35;
    pub const KILLS_GT: u8 = 36;
    pub const KILLS_GE: u8 = 37;
    pub const KILLS_NE: u8 = 38;
    pub const KILLS_ONE: u8 = 39;
    pub const SQRT: u8 = 40;
    // 41 is intentionally left without a name in this table.
    pub const MULSC0: u8 = 42;
    pub const MULSC1: u8 = 43;
    pub const ADDSC0: u8 = 44;
    pub const ADDSC1: u8 = 45;
    pub const SUBSC0: u8 = 46;
    pub const SUBSC1: u8 = 47;
    pub const SIN: u8 = 48;
    pub const COS: u8 = 49;
    pub const RETAIN_PREV: u8 = 50;
 }
 #[cfg(test)]
 mod tests {
    use super::*;
    /// Regression: our table previously drifted from canary's values (e.g.
    /// `MAXS=6` when canary says 5, shifting everything through SQRT). Pin
    /// the most-often-used scalar + vector opcodes here.
    #[test]
    fn opcodes_match_canary_values() {
        // Scalar.
        assert_eq!(sop::MAXS, 5);
        assert_eq!(sop::MINS, 6);
        assert_eq!(sop::SEQS, 7);
        assert_eq!(sop::EXP, 14);
        assert_eq!(sop::LOG, 16);
        assert_eq!(sop::RCP, 19);
        assert_eq!(sop::RSQ, 22);
        assert_eq!(sop::SUBS, 25);
        assert_eq!(sop::SETP_EQ, 27);
        assert_eq!(sop::KILLS_EQ, 35);
        assert_eq!(sop::SQRT, 40);
        assert_eq!(sop::SIN, 48);
        assert_eq!(sop::RETAIN_PREV, 50);
        // Vector.
        assert_eq!(vop::SNE, 7);
        assert_eq!(vop::CND_EQ, 12);
        assert_eq!(vop::MAX4, 19);
        assert_eq!(vop::KILL_EQ, 24);
        assert_eq!(vop::DST, 28);
    }
    /// Round-trips a hand-assembled word2 through `decode_alu` and checks the
    /// opcode, destination and vector write-mask fields.
    #[test]
    fn decode_extracts_opcodes_and_dests() {
        // Build a minimal ALU word:
        //   vector_opcode = ADD (0), scalar_opcode = RCP (19),
        //   vector_dest = 3, scalar_dest = 7, vector_write_mask = 0xF
        let w2 = (vop::ADD as u32)
            | ((sop::RCP as u32) << 6)
            | (0xF << 12) // vector_write_mask
            | (3u32 << 16) // vector_dest
            | (7u32 << 24); // scalar_dest
        let alu = decode_alu([0, 0, w2]);
        assert_eq!(alu.vector_opcode, vop::ADD);
        assert_eq!(alu.scalar_opcode, sop::RCP);
        assert_eq!(alu.vector_dest, 3);
        assert_eq!(alu.scalar_dest, 7);
        assert_eq!(alu.vector_write_mask, 0xF);
    }
 }
--- a/crates/xenia-gpu/src/ucode/control_flow.rs
+++ b/crates/xenia-gpu/src/ucode/control_flow.rs
@@ -0,0 +1,173 @@
 //! Xenos control-flow clause decoder.
 //!
 //! A shader's CF block is a sequence of 48-bit clauses packed two-per-
 //! three-dword row. Each clause encodes an opcode and type-specific fields
 //! (exec addr/count, loop start/end, branch target, etc.).
 //!
 //! Spec at `xenia-canary/src/xenia/gpu/ucode.h:87-256`. We cover the subset
 //! the uber-shader needs: `Exec*`, `Loop*`, `Alloc`, `Jmp`, `Call/Ret`,
 //! `Exit`. Unknown opcodes are classified as `Unknown { opcode }` so the
 //! translator can log + degrade.
 /// Parsed representation of one CF clause.
 ///
 /// Produced by `decode_cf_pair`, which decodes two of these from every
 /// three-dword row of the control-flow block.
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub enum ControlFlowInstruction {
    /// `kExec` / `kExecEnd` — execute a range of ALU/fetch instructions.
    Exec {
        /// Index of the first instruction of this clause, expressed in
        /// **triple units** (each inst = 3 dwords), not in raw dwords.
        address: u32,
        /// Number of triples to execute.
        count: u32,
        /// The ALU-vs-fetch sequence bitmap (2 bits per instruction),
        /// decoded as a 12-bit field.
        sequence: u32,
        /// True when this clause ends the shader.
        is_end: bool,
        /// True if predicated; skip when predicate != predicate_condition.
        predicated: bool,
        /// Predicate value required for a predicated clause to run.
        predicate_condition: bool,
    },
    /// `kLoopStart` — begin an `aL` loop referencing a loop constant.
    LoopStart { address: u32, loop_id: u32 },
    /// `kLoopEnd` — close the loop; `address` points at the matching start.
    LoopEnd { address: u32, loop_id: u32 },
    /// `kCondJmp` — conditional jump to another CF index.
    CondJmp {
        target: u32,
        predicated: bool,
        predicate_condition: bool,
    },
    /// `kCondCall` — call into another CF subroutine.
    CondCall { target: u32 },
    /// `kReturn` — return from subroutine.
    Return,
    /// `kAlloc` — pre-allocate export registers (position, interpolators, colors).
    Alloc { size: u32, kind: AllocKind },
    /// Exit the shader (terminal).
    Exit,
    /// Unknown / unhandled opcode; carries the raw 4-bit opcode value.
    Unknown { opcode: u8 },
 }
 /// Export target types for `kAlloc` clauses.
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub enum AllocKind {
    Position,
    Interpolators,
    Colors,
    Memexport,
    Other,
 }
 impl AllocKind {
    /// Map the low three bits of `b` to an [`AllocKind`].
    ///
    /// Codes 0..=3 select `Position`, `Interpolators`, `Colors` and
    /// `Memexport` in that order; codes 4..=7 all collapse to `Other`. Bits
    /// above the low three are ignored.
    fn from_bits(b: u32) -> Self {
        // Indexed by the 3-bit code; anything past the table is `Other`.
        const NAMED: [AllocKind; 4] = [
            AllocKind::Position,
            AllocKind::Interpolators,
            AllocKind::Colors,
            AllocKind::Memexport,
        ];
        let code = (b & 0x7) as usize;
        NAMED.get(code).copied().unwrap_or(AllocKind::Other)
    }
 }
 /// Decode one row (three consecutive CF dwords) into two CF clauses.
 ///
 /// Word layout per canary (`ucode.h:218-256`):
 ///   - word0 + lo16(word1) → CF_A's 48-bit payload (word0 is the low 32 bits)
 ///   - hi16(word1) + word2 → CF_B's 48-bit payload (hi16(word1) is the low
 ///     16 bits)
 ///
 /// The opcode lives in the top 4 bits of the 48-bit payload (= bits 44..47).
 /// Returns `(CF_A, CF_B)`.
 pub fn decode_cf_pair(
    word0: u32,
    word1: u32,
    word2: u32,
 ) -> (ControlFlowInstruction, ControlFlowInstruction) {
    // Split the shared middle word, then assemble each clause little-endian
    // inside a u64.
    let word1_lo = u64::from(word1 & 0xFFFF);
    let word1_hi = u64::from(word1 >> 16);
    let clause_a = u64::from(word0) | (word1_lo << 32);
    let clause_b = word1_hi | (u64::from(word2) << 16);
    (decode_single(clause_a), decode_single(clause_b))
 }
 /// Decode one 48-bit CF payload (held in the low bits of `payload`).
 ///
 /// The opcode is taken from bits 44..=47; any opcode without an arm below
 /// becomes `Unknown { opcode }`.
 ///
 /// NOTE(review): the opcode numbering used here (0 = exec, 1 = exit,
 /// 2 = exec-end, 6/7 = loop start/end, 8 = call, 9 = return, 10 = jump,
 /// 12 = alloc) and the alloc-kind position (bits 4..=6) should be checked
 /// against the `ControlFlowOpcode` / alloc layouts in `ucode.h` — TODO
 /// confirm.
 fn decode_single(payload: u64) -> ControlFlowInstruction {
    // Masked field starting at bit `lsb`; every mask used fits in 32 bits.
    let bits = |lsb: u32, mask: u64| ((payload >> lsb) & mask) as u32;

    let opcode = bits(44, 0xF) as u8;
    // Predicate bit + condition at bits 28 / 29 for exec/jmp. Rough
    // extraction — good enough for the interpreter, which logs unknowns.
    let predicated = bits(28, 0x1) != 0;
    let predicate_condition = bits(29, 0x1) != 0;
    // Loop / call / jump clauses share a 10-bit CF index in the low bits.
    let cf_index = bits(0, 0x3FF);
    let loop_id = bits(16, 0x1F);

    match opcode {
        // Exec (0) and ExecEnd (2) differ only in the `is_end` flag.
        0 | 2 => ControlFlowInstruction::Exec {
            address: bits(0, 0xFFF),
            count: bits(12, 0x7),
            sequence: bits(16, 0xFFF),
            is_end: opcode == 2,
            predicated,
            predicate_condition,
        },
        1 => ControlFlowInstruction::Exit,
        6 => ControlFlowInstruction::LoopStart {
            address: cf_index,
            loop_id,
        },
        7 => ControlFlowInstruction::LoopEnd {
            address: cf_index,
            loop_id,
        },
        8 => ControlFlowInstruction::CondCall { target: cf_index },
        9 => ControlFlowInstruction::Return,
        10 => ControlFlowInstruction::CondJmp {
            target: cf_index,
            predicated,
            predicate_condition,
        },
        12 => ControlFlowInstruction::Alloc {
            size: bits(0, 0x7),
            kind: AllocKind::from_bits(bits(4, 0x7)),
        },
        other => ControlFlowInstruction::Unknown { opcode: other },
    }
 }
 #[cfg(test)]
 mod tests {
    use super::*;
    /// Opcode 1 in bits 44..47 of clause A's payload decodes as `Exit`.
    #[test]
    fn opcode_exit_decodes() {
        let payload: u64 = 1u64 << 44;
        // Clause A = word0 (low 32 bits) + lo16(word1) (bits 32..47).
        let word0 = (payload & 0xFFFF_FFFF) as u32;
        let word1 = ((payload >> 32) & 0xFFFF) as u32;
        let (clause_a, _) = decode_cf_pair(word0, word1, 0);
        assert_eq!(clause_a, ControlFlowInstruction::Exit);
    }
    /// Opcode 2 (ExecEnd) keeps its address / count and sets `is_end`.
    #[test]
    fn opcode_exec_end_carries_address_count() {
        // opcode 2 (ExecEnd), address=4, count=2, sequence=0.
        let payload: u64 = (2u64 << 44) | (2u64 << 12) | 4;
        let word0 = (payload & 0xFFFF_FFFF) as u32;
        let word1 = ((payload >> 32) & 0xFFFF) as u32;
        match decode_cf_pair(word0, word1, 0).0 {
            ControlFlowInstruction::Exec {
                address,
                count,
                is_end,
                ..
            } => {
                assert_eq!((address, count), (4, 2));
                assert!(is_end);
            }
            other => panic!("expected Exec, got {other:?}"),
        }
    }
 }
--- a/crates/xenia-gpu/src/ucode/fetch.rs
+++ b/crates/xenia-gpu/src/ucode/fetch.rs
@@ -0,0 +1,117 @@
 //! Xenos fetch (vertex + texture) instruction decoder.
 //!
 //! Like ALU instructions, fetches are 96 bits (3 dwords). The opcode lives
 //! in the low 5 bits of word0. We split them into `VertexFetch` and
 //! `TextureFetch` structurally because their operand layouts differ.
 //!
 //! Reference: `xenia-canary/src/xenia/gpu/ucode.h:690-877`.
 /// Decoded fetch instruction.
 ///
 /// `decode_fetch` picks the variant from the opcode in the low 5 bits of
 /// word0.
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub enum FetchInstruction {
    /// Opcode `op::VERTEX_FETCH`.
    Vertex(VertexFetch),
    /// Opcode `op::TEXTURE_FETCH`.
    Texture(TextureFetch),
    /// Unknown / minor variants we don't model yet. Carries the opcode and
    /// the untouched instruction dwords.
    Unknown { opcode: u8, raw: [u32; 3] },
 }
 /// Operand fields of a vertex fetch, as extracted by `decode_fetch`.
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub struct VertexFetch {
    /// Vertex fetch constant index.
    ///
    /// NOTE(review): `decode_fetch` reads this from a 5-bit field (word0
    /// bits 5..=9), so values here are 0..=31; the 0..=95 range this field
    /// was documented with cannot be produced — confirm the intended
    /// encoding.
    pub fetch_const: u8,
    /// Source register index (vertex index in r#). Word0 bits 17..=23.
    pub src_register: u8,
    /// Destination register for the fetched value. Word0 bits 10..=16.
    pub dest_register: u8,
    /// 4-bit write mask. Word1 bits 23..=26.
    pub dest_write_mask: u8,
    /// The three instruction dwords, unmodified.
    pub raw: [u32; 3],
 }
 /// Operand fields of a texture fetch, as extracted by `decode_fetch`.
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub struct TextureFetch {
    /// Texture fetch constant index (0..=31). Word0 bits 5..=9.
    pub fetch_const: u8,
    /// Source register index. Word0 bits 17..=23.
    pub src_register: u8,
    /// Destination register index. Word0 bits 10..=16.
    pub dest_register: u8,
    /// 4-bit write mask. Word1 bits 23..=26.
    pub dest_write_mask: u8,
    /// Dimension: 0=1D, 1=2D, 2=3D/stacked, 3=cube. Word1 bits 29..=30.
    pub dimension: u8,
    /// The three instruction dwords, unmodified.
    pub raw: [u32; 3],
 }
 /// Opcodes (low 5 bits of word0). From `ucode.h`.
 ///
 /// The reference `FetchOpcode` values are **decimal** (16, 17, 18, 19, 24,
 /// 25, 26). They are written in decimal here on purpose: spelling them as
 /// `0x16`, `0x17`, … silently yields 22, 23, … and misclassifies every
 /// texture get/set instruction.
 pub mod op {
    pub const VERTEX_FETCH: u8 = 0;
    pub const TEXTURE_FETCH: u8 = 1;
    pub const GET_TEXTURE_BORDER_COLOR_FRAC: u8 = 16;
    pub const GET_TEXTURE_COMPUTED_LOD: u8 = 17;
    pub const GET_TEXTURE_GRADIENTS: u8 = 18;
    pub const GET_TEXTURE_WEIGHTS: u8 = 19;
    pub const SET_TEXTURE_LOD: u8 = 24;
    pub const SET_TEXTURE_GRADIENTS_HORZ: u8 = 25;
    pub const SET_TEXTURE_GRADIENTS_VERT: u8 = 26;
 }
 /// Decode a 3-dword fetch instruction.
 ///
 /// The opcode (low 5 bits of word0) selects the variant: vertex fetch,
 /// texture fetch, or `Unknown` for everything else. Vertex and texture
 /// fetches share the same positions for the fetch constant, source /
 /// destination registers and write mask; texture fetches additionally
 /// carry a 2-bit dimension from word1 bits 29..=30.
 pub fn decode_fetch(words: [u32; 3]) -> FetchInstruction {
    let [w0, w1, _] = words;
    let opcode = (w0 & 0x1F) as u8;

    // Operand fields common to both modelled fetch kinds.
    let fetch_const = ((w0 >> 5) & 0x1F) as u8;
    let dest_register = ((w0 >> 10) & 0x7F) as u8;
    let src_register = ((w0 >> 17) & 0x7F) as u8;
    let dest_write_mask = ((w1 >> 23) & 0xF) as u8;

    match opcode {
        op::VERTEX_FETCH => FetchInstruction::Vertex(VertexFetch {
            fetch_const,
            src_register,
            dest_register,
            dest_write_mask,
            raw: words,
        }),
        op::TEXTURE_FETCH => {
            let dimension = ((w1 >> 29) & 0x3) as u8;
            FetchInstruction::Texture(TextureFetch {
                fetch_const,
                src_register,
                dest_register,
                dest_write_mask,
                dimension,
                raw: words,
            })
        }
        _ => FetchInstruction::Unknown { opcode, raw: words },
    }
 }
 #[cfg(test)]
 mod tests {
    use super::*;
    /// Vertex fetch: constant, source and destination fields round-trip.
    #[test]
    fn decode_vertex_fetch() {
        // opcode=0 (vertex), fetch_const=5, src=2, dest=7.
        let w0 = (5u32 << 5) | (7 << 10) | (2 << 17);
        match decode_fetch([w0, 0, 0]) {
            FetchInstruction::Vertex(vf) => {
                assert_eq!(vf.fetch_const, 5);
                assert_eq!(vf.src_register, 2);
                assert_eq!(vf.dest_register, 7);
            }
            other => panic!("expected Vertex, got {other:?}"),
        }
    }
    /// Texture fetch: constant from word0, dimension from word1 bits 29..=30.
    #[test]
    fn decode_texture_fetch() {
        let w0 = 1u32 | (3 << 5) | (4 << 10) | (1 << 17);
        let w1 = 2u32 << 29;
        match decode_fetch([w0, w1, 0]) {
            FetchInstruction::Texture(tf) => {
                assert_eq!(tf.fetch_const, 3);
                assert_eq!(tf.dimension, 2);
            }
            other => panic!("expected Texture, got {other:?}"),
        }
    }
    /// Opcodes other than vertex / texture fetch come back as `Unknown`
    /// with the opcode preserved.
    #[test]
    fn unknown_opcode_is_classified() {
        let opcode = op::GET_TEXTURE_BORDER_COLOR_FRAC;
        let decoded = decode_fetch([u32::from(opcode), 0, 0]);
        assert!(matches!(
            decoded,
            FetchInstruction::Unknown {
                opcode: op::GET_TEXTURE_BORDER_COLOR_FRAC,
                ..
            }
        ));
    }
 }
--- a/crates/xenia-gpu/src/ucode/mod.rs
+++ b/crates/xenia-gpu/src/ucode/mod.rs
@@ -0,0 +1,249 @@
 //! Xenos (ATI R500-family) shader microcode decoder.
 //!
 //! Ground truth: `xenia-canary/src/xenia/gpu/ucode.h`. We parse only what a
 //! shader *interpreter* (P3 uber-shader) needs: control-flow clauses, ALU
 //! instructions (vector + scalar pipes), and fetch instructions (vertex +
 //! texture). The uber-shader consumes this IR directly; when a WGSL-emitting
 //! translator comes online in P7, it reuses the same parser.
 //!
 //! ## Binary layout
 //!
 //! A compiled shader has two sections back-to-back:
 //!
 //! 1. **Control-flow block** — `cf_count` 48-bit clauses, stored in pairs.
 //!    Canary packs each pair of clauses (96 bits) into three 32-bit words:
 //!    ```text
 //!    word0  word1  word2
 //!    [-CF_A (48)-][-CF_B (48)-]
 //!    ```
 //!    Word 0 is the low 32 of CF_A; word 1's low 16 bits finish CF_A and
 //!    its high 16 bits start CF_B; word 2 holds CF_B's remaining 32 bits.
 //!
 //! 2. **Instruction block** — variable-size array of 96-bit ALU / fetch
 //!    instructions. Each control-flow clause of kind `Exec*` references a
 //!    contiguous range of these by `(address, count)`, both counted in
 //!    whole instructions (multiply by 3 for a dword offset / length).
 //!
 //! We read big-endian dwords straight out of guest memory (the `raw`
 //! `&[u32]` slice is already host-endian-corrected by the PM4 executor that
 //! cached the shader blob). See `ucode.h:218-256` for the exec clause bit
 //! layout and `:700-877` for the fetch/ALU mix.
 pub mod alu;
 pub mod control_flow;
 pub mod fetch;
 use self::alu::AluInstruction;
 use self::control_flow::{AllocKind, ControlFlowInstruction, decode_cf_pair};
 use self::fetch::FetchInstruction;
 /// CF-clause kind codes encoded into the WGSL-facing packed shader. Kept
 /// in sync with `shaders/xenos_interp.wgsl`'s `CF_KIND_*` constants.
 ///
 /// These occupy the low byte of a clause's `kind` dword. For `EXEC` /
 /// `EXEC_END` clauses `encode_cf` additionally ORs the predicate flags into
 /// bits 8..=9 (bit 8 = predicated, bit 9 = predicate condition), so
 /// consumers should mask with `0xFF` before comparing.
 pub mod cf_kind {
    pub const EXEC: u32 = 0;
    pub const EXEC_END: u32 = 1;
    pub const ALLOC: u32 = 2;
    pub const EXIT: u32 = 3;
    pub const LOOP_START: u32 = 4;
    pub const LOOP_END: u32 = 5;
    pub const COND_JMP: u32 = 6;
    pub const COND_CALL: u32 = 7;
    pub const RETURN: u32 = 8;
    // Values 9..=14 are unassigned in this table.
    pub const UNKNOWN: u32 = 15;
 }
 /// Alloc-kind codes for an `Alloc` clause in the packed CF table.
 ///
 /// `encode_cf` emits the code as the clause's *primary* dword and the
 /// allocation size as its aux dword. One code exists per `AllocKind`
 /// variant.
 pub mod cf_alloc_kind {
    pub const POSITION: u32 = 0;
    pub const INTERPOLATORS: u32 = 1;
    pub const COLORS: u32 = 2;
    pub const MEMEXPORT: u32 = 3;
    pub const OTHER: u32 = 4;
 }
 /// Pack a [`ParsedShader`] into the dense dword layout the WGSL runtime
 /// interpreter expects:
 ///
 /// ```text
 /// [0]                     cf_count
 /// [1 .. 1 + cf_count*3]   CF table: (kind, primary, aux) triples per clause
 /// [1 + cf_count*3 ..]     raw 3-dword instruction stream (ALU/fetch)
 /// ```
 ///
 /// The CF table lets WGSL walk clauses without reconstructing bit-packed
 /// layouts on the GPU. Semantics per `kind`:
 ///
 /// | kind          | primary                        | aux                    |
 /// |---------------|--------------------------------|------------------------|
 /// | EXEC/EXEC_END | address (in triples)           | (sequence<<8) \| count |
 /// | ALLOC         | alloc_kind (see cf_alloc_kind) | size                   |
 /// | EXIT          | 0                              | 0                      |
 /// | LOOP_START    | address                        | loop_id                |
 /// | LOOP_END      | address                        | loop_id                |
 /// | COND_JMP      | target                         | predicate flags        |
 /// | COND_CALL     | target                         | 0                      |
 /// | RETURN        | 0                              | 0                      |
 /// | UNKNOWN       | opcode                         | 0                      |
 ///
 /// For EXEC / EXEC_END the `kind` dword also carries the predicate flags in
 /// bits 8..=9; the kind code itself is the low byte.
 pub fn pack_for_wgsl(parsed: &ParsedShader) -> Vec<u32> {
    let cf_count = parsed.cf.len() as u32;
    let total_dwords = 1 + (cf_count as usize) * 3 + parsed.instructions.len();
    let mut packed = Vec::with_capacity(total_dwords);
    packed.push(cf_count);
    // One (kind, primary, aux) triple per clause, in clause order.
    packed.extend(parsed.cf.iter().flat_map(|&clause| {
        let (kind, primary, aux) = encode_cf(clause);
        [kind, primary, aux]
    }));
    // The instruction stream is appended untouched.
    packed.extend_from_slice(&parsed.instructions);
    packed
 }
 fn encode_cf(c: ControlFlowInstruction) -> (u32, u32, u32) {
    use ControlFlowInstruction::*;
    match c {
        Exec {
            address,
            count,
            sequence,
            is_end,
            predicated,
            predicate_condition,
        } => {
            let pred_bits = (predicated as u32) | ((predicate_condition as u32) << 1);
            let kind = if is_end { cf_kind::EXEC_END } else { cf_kind::EXEC }
                | (pred_bits << 8);
            (kind, address, (sequence << 8) | count)
        }
        Alloc { size, kind } => {
            let akind = match kind {
                AllocKind::Position => cf_alloc_kind::POSITION,
                AllocKind::Interpolators => cf_alloc_kind::INTERPOLATORS,
                AllocKind::Colors => cf_alloc_kind::COLORS,
                AllocKind::Memexport => cf_alloc_kind::MEMEXPORT,
                AllocKind::Other => cf_alloc_kind::OTHER,
            };
            (cf_kind::ALLOC, akind, size)
        }
        Exit => (cf_kind::EXIT, 0, 0),
        LoopStart { address, loop_id } => (cf_kind::LOOP_START, address, loop_id),
        LoopEnd { address, loop_id } => (cf_kind::LOOP_END, address, loop_id),
        CondJmp {
            target,
            predicated,
            predicate_condition,
        } => {
            let pred_bits = (predicated as u32) | ((predicate_condition as u32) << 1);
            (cf_kind::COND_JMP, target, pred_bits)
        }
        CondCall { target } => (cf_kind::COND_CALL, target, 0),
        Return => (cf_kind::RETURN, 0, 0),
        Unknown { opcode } => (cf_kind::UNKNOWN, opcode as u32, 0),
    }
 }
 /// One instruction word set from the instruction-block section. Xenos packs
 /// ALU and fetch instructions identically (96 bits each); the owning exec
 /// clause's "sequence" bitmap decides which is which.
 ///
 /// Wraps the outputs of the `alu` and `fetch` decoders in a single type.
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub enum DecodedInstruction {
    /// ALU pipe (vector ALU + optional co-issued scalar ALU).
    Alu(AluInstruction),
    /// Vertex or texture fetch.
    Fetch(FetchInstruction),
 }
 /// Parsed shader: the control-flow clause list + the raw 32-bit instruction
 /// words. The uber-shader / translator is expected to index into
 /// `instructions` based on `(clause.address * 3, clause.count * 3)`.
 ///
 /// `Default` yields an empty shader (no clauses, no instruction dwords).
 #[derive(Debug, Clone, Default)]
 pub struct ParsedShader {
    /// Decoded CF clauses in stream order. `parse_shader` pushes two per
    /// three-dword row, so the length is always even when produced by it.
    pub cf: Vec<ControlFlowInstruction>,
    /// Raw instruction dwords. Each 3-dword triple is one ALU or fetch
    /// instruction; the owning `Exec` clause's `sequence` bitmap picks the
    /// kind.
    pub instructions: Vec<u32>,
 }
 /// Decode a shader blob. `raw_dwords` is a host-endian slice of the entire
 /// microcode buffer (control flow + instructions). Heuristic: CF dword count
 /// is encoded in the first word's low 12 bits of the last exec clause —
 /// canary iterates until it hits a clause of kind `Exit`. We do the same.
 pub fn parse_shader(raw_dwords: &[u32]) -> ParsedShader {
    let mut cf = Vec::new();
    // CF clauses are 48-bit (word1 lo 16 + word0 = 48 or so per canary's
    // layout). Walk pairs of 3 dwords per pair of clauses.
    let mut i = 0usize;
    while i + 2 < raw_dwords.len() {
        let a = decode_cf_pair(raw_dwords[i], raw_dwords[i + 1], raw_dwords[i + 2]);
        let (first, second) = a;
        let seen_exit = matches!(
            first,
            ControlFlowInstruction::Exit | ControlFlowInstruction::Unknown { .. }
        ) || matches!(
            second,
            ControlFlowInstruction::Exit | ControlFlowInstruction::Unknown { .. }
        );
        cf.push(first);
        cf.push(second);
        i += 3;
        if seen_exit {
            break;
        }
    }
    // Everything after `i` dwords is the instruction block.
    let instructions = raw_dwords[i..].to_vec();
    ParsedShader { cf, instructions }
 }
 #[cfg(test)]
 mod tests {
    use super::*;
    /// An empty blob yields an empty shader rather than panicking.
    #[test]
    fn empty_blob_parses_empty() {
        let p = parse_shader(&[]);
        assert!(p.cf.is_empty());
        assert!(p.instructions.is_empty());
    }
    /// The packed form is `[cf_count, (kind, primary, aux)*, instructions..]`.
    #[test]
    fn pack_for_wgsl_layout_is_correct() {
        // Build a tiny ParsedShader by hand and verify the packed form.
        let parsed = ParsedShader {
            cf: vec![
                ControlFlowInstruction::Exec {
                    address: 0x10,
                    count: 3,
                    sequence: 0b1010,
                    is_end: false,
                    predicated: false,
                    predicate_condition: false,
                },
                ControlFlowInstruction::Exit,
            ],
            instructions: vec![0x1111, 0x2222, 0x3333],
        };
        let packed = pack_for_wgsl(&parsed);
        assert_eq!(packed[0], 2, "cf_count");
        // First clause: EXEC, address=0x10, aux = (sequence<<8)|count = 0x0A03
        assert_eq!(packed[1] & 0xFF, cf_kind::EXEC);
        assert_eq!(packed[2], 0x10);
        assert_eq!(packed[3], (0b1010 << 8) | 3);
        // Second clause: EXIT
        assert_eq!(packed[4] & 0xFF, cf_kind::EXIT);
        // Instruction block starts at 1 + 2*3 = 7
        assert_eq!(packed[7..], [0x1111, 0x2222, 0x3333]);
    }
    /// Parsing stops after the first CF row that contains an Exit clause.
    #[test]
    fn trivial_exit_clause_stops_parsing() {
        // Clause A = EXIT: opcode 1 sits in bits 44..47 of A's 48-bit
        // payload, i.e. bits 12..15 of word1's LOW half. (The high half of
        // word1 belongs to clause B's low bits, so putting the opcode there
        // would not produce an Exit.) Clause B is all zeroes → plain Exec.
        let w0 = 0u32;
        let w1 = 1u32 << 12;
        let w2 = 0u32;
        // Three trailing dwords: without early termination they would be
        // consumed as a second CF row instead of instruction data.
        let trailing: [u32; 3] = [0xDEAD_BEEF, 0xCAFE_F00D, 0x0BAD_F00D];
        let p = parse_shader(&[w0, w1, w2, trailing[0], trailing[1], trailing[2]]);
        assert_eq!(p.cf.len(), 2);
        assert_eq!(p.cf[0], ControlFlowInstruction::Exit);
        // Exit detected → remaining dwords are instruction data.
        assert_eq!(p.instructions, trailing.to_vec());
    }
 }
--- a/crates/xenia-gpu/src/xenos_constants.rs
+++ b/crates/xenia-gpu/src/xenos_constants.rs
@@ -0,0 +1,124 @@
 //! The "Xenos constants" block the WGSL interpreter consumes per draw.
 //!
 //! Mirrors the Xenos register-file regions that carry the per-draw constant
 //! values shaders reference at runtime:
 //!
 //! | Region | Base | Count | Size |
 //! |--------|------|-------|------|
 //! | ALU   | 0x4000 | 512 × vec4<f32> | 8 KB |
 //! | Fetch | 0x4800 | 256 × u32       | 1 KB |
 //! | Bool  | 0x4900 |  8 × u32        | 32 B |
 //! | Loop  | 0x4908 | 32 × u32        | 128 B |
 //!
 //! Total: 9376 bytes (~9.2 KiB), well under the 64 KiB
 //! `max_uniform_buffer_binding_size` of wgpu's default limits (and under
 //! the 16 KiB downlevel default). The `XenosConstantsBlock` is declared
 //! `#[repr(C)]` and implements bytemuck `Pod`, so it can be passed to
 //! `bytemuck::bytes_of()` and uploaded directly into a wgpu uniform
 //! buffer. The matching WGSL `struct XenosConstants` lives in
 //! `shaders/xenos_interp.wgsl`.
 use bytemuck::{Pod, Zeroable};
 use crate::register_file::RegisterFile;
 // ALU region: each constant is four consecutive registers (one vec4).
 /// Register index of the first ALU constant component.
 pub const CONST_BASE_ALU: u32 = 0x4000;
 /// Number of ALU (vec4) constants captured per draw.
 pub const ALU_CONSTANT_COUNT: usize = 512;
 // Fetch region: one register per constant.
 /// Register index of the first fetch constant.
 pub const CONST_BASE_FETCH: u32 = 0x4800;
 /// Number of fetch constant registers captured per draw.
 pub const FETCH_CONSTANT_COUNT: usize = 256;
 // Bool region: one register per constant word.
 /// Register index of the first bool constant word.
 pub const CONST_BASE_BOOL: u32 = 0x4900;
 /// Number of bool constant registers captured per draw.
 pub const BOOL_CONSTANT_COUNT: usize = 8;
 // Loop region: one register per constant.
 /// Register index of the first loop constant.
 pub const CONST_BASE_LOOP: u32 = 0x4908;
 /// Number of loop constant registers captured per draw.
 pub const LOOP_CONSTANT_COUNT: usize = 32;
 /// Per-draw constants block uploaded once to the uniform buffer at
 /// `@group(0) @binding(1)`.
 ///
 /// The struct is `#[repr(C)]`, so the fields are laid out in declaration
 /// order: ALU, fetch, bool, loop. It implements bytemuck `Pod`, which lets
 /// callers view it as raw bytes for the upload.
 #[repr(C)]
 #[derive(Clone, Copy)]
 pub struct XenosConstantsBlock {
    /// ALU constants: `ALU_CONSTANT_COUNT` entries of four `f32` each.
    pub alu: [[f32; 4]; ALU_CONSTANT_COUNT],
    /// Fetch constants, one `u32` per register.
    pub fetch: [u32; FETCH_CONSTANT_COUNT],
    /// Bool constants, one `u32` per register.
    pub bool_consts: [u32; BOOL_CONSTANT_COUNT],
    /// Loop constants, one `u32` per register.
    pub loop_consts: [u32; LOOP_CONSTANT_COUNT],
 }
 // Compile-time guard for the `unsafe impl Pod` below: `Pod` forbids padding
 // bytes, so the struct must be exactly as large as the sum of its fields.
 // Failing the build here is better than discovering it only when the unit
 // test runs.
 const _: () = assert!(
    std::mem::size_of::<XenosConstantsBlock>()
        == std::mem::size_of::<[[f32; 4]; ALU_CONSTANT_COUNT]>()
            + std::mem::size_of::<[u32; FETCH_CONSTANT_COUNT]>()
            + std::mem::size_of::<[u32; BOOL_CONSTANT_COUNT]>()
            + std::mem::size_of::<[u32; LOOP_CONSTANT_COUNT]>(),
    "XenosConstantsBlock must not contain padding bytes"
 );
 // SAFETY: the struct is `#[repr(C)]` and every field is an array of `f32`
 // or `u32`, which are themselves `Pod`. All fields share 4-byte alignment
 // and have sizes that are multiples of 4, so no padding is inserted (also
 // enforced by the const assertion above). Every bit pattern, including
 // all-zero, is a valid value for each field.
 unsafe impl Zeroable for XenosConstantsBlock {}
 unsafe impl Pod for XenosConstantsBlock {}
 impl Default for XenosConstantsBlock {
    /// Builds a block in which every ALU component is `0.0` and every
    /// fetch, bool and loop word is `0`.
    fn default() -> Self {
        let alu = [[0.0; 4]; ALU_CONSTANT_COUNT];
        let fetch = [0; FETCH_CONSTANT_COUNT];
        let bool_consts = [0; BOOL_CONSTANT_COUNT];
        let loop_consts = [0; LOOP_CONSTANT_COUNT];
        Self {
            alu,
            fetch,
            bool_consts,
            loop_consts,
        }
    }
 }
 impl XenosConstantsBlock {
    /// Byte size of the block (`size_of::<Self>()`), exposed for tests and
    /// for sizing the wgpu buffer.
    pub const SIZE: usize = std::mem::size_of::<Self>();
    /// Copies the per-draw constants out of a Xenos `RegisterFile` into the
    /// dense layout of this block.
    ///
    /// Starting from a default (all-zero) block, the regions are read in
    /// order: ALU, fetch, bool, loop. ALU slot `i` takes the four
    /// consecutive registers beginning at `CONST_BASE_ALU + 4 * i`, each
    /// reinterpreted from raw bits as `f32`. The other regions copy one
    /// register per element from their respective base.
    pub fn snapshot(rf: &RegisterFile) -> Self {
        let mut block = Self::default();
        // One vec4 per ALU slot, so the register index advances by four.
        for (first_reg, vec4) in (CONST_BASE_ALU..).step_by(4).zip(block.alu.iter_mut()) {
            *vec4 = [first_reg, first_reg + 1, first_reg + 2, first_reg + 3]
                .map(|reg| f32::from_bits(rf.read(reg)));
        }
        // The remaining regions map one register to one array element.
        for (reg, word) in (CONST_BASE_FETCH..).zip(block.fetch.iter_mut()) {
            *word = rf.read(reg);
        }
        for (reg, word) in (CONST_BASE_BOOL..).zip(block.bool_consts.iter_mut()) {
            *word = rf.read(reg);
        }
        for (reg, word) in (CONST_BASE_LOOP..).zip(block.loop_consts.iter_mut()) {
            *word = rf.read(reg);
        }
        block
    }
 }
 #[cfg(test)]
 mod tests {
    use super::*;
    /// Layout guard. The expected size is the sum of the four regions:
    /// 512 × 16 B (ALU) + 256 × 4 B (fetch) + 8 × 4 B (bool) + 32 × 4 B
    /// (loop) = 8192 + 1024 + 32 + 128 = 9376 bytes. A different value means
    /// a constant count changed or the compiler inserted padding; either
    /// one breaks agreement with the WGSL struct in `xenos_interp.wgsl`.
    #[test]
    fn xenos_constants_block_size_is_stable() {
        let expected_bytes = 8192 + 1024 + 32 + 128;
        assert_eq!(XenosConstantsBlock::SIZE, expected_bytes);
    }
    /// Values written into each register region must come back in the
    /// matching field of the snapshot.
    #[test]
    fn snapshot_roundtrip_from_register_file() {
        // Recognisable pattern for alu[0], written as raw f32 bits into the
        // first four ALU registers.
        let alu0 = [1.0f32, 2.0, 3.0, 4.0];
        let mut regs = RegisterFile::new();
        for (reg, component) in (CONST_BASE_ALU..).zip(alu0.iter()) {
            regs.write(reg, component.to_bits());
        }
        // One marker value in each of the remaining regions.
        regs.write(CONST_BASE_FETCH + 5, 0xDEAD_BEEF);
        regs.write(CONST_BASE_BOOL, 0x1234);
        regs.write(CONST_BASE_LOOP + 3, 0x5678);
        let block = XenosConstantsBlock::snapshot(&regs);
        assert_eq!(block.alu[0], alu0);
        assert_eq!(block.fetch[5], 0xDEAD_BEEF);
        assert_eq!(block.bool_consts[0], 0x1234);
        assert_eq!(block.loop_consts[3], 0x5678);
    }
 }