xenia-gpu: end-to-end Xenos pipeline (PM4, ucode, EDRAM, resolve)

First real GPU implementation. Ring/PM4 frontend (ring_view, ring_drain, pm4) drains the command processor; gpu_system owns the threaded backend (DrainFence RPC + parker/fence helpers from M1) and the MMIO-mapped register block (mmio_region). Xenos shader frontend: ucode/{alu,control_flow,fetch,mod}.rs decode the Xbox 360 microcode, translator.rs lowers it onto the WGSL xenos_interp interpreter shader (shaders/xenos_interp.wgsl). shader_metrics.rs counts decode/translate work. Render state: draw_state, primitive, render_target_cache, texture_cache, tiled_address (Xenos's swizzled tiled-memory layout), xenos_constants (register field constants), edram (the 10 MiB EDRAM model with MSAA), and resolve.rs (TILE_FLUSH copy-out — clear-resolve plus bitwise-equivalent 32 bpp + 64 bpp paths landed). handle.rs owns the typed GPU-resource handles the kernel hands out. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-01 16:29:38 +02:00
parent 5f0d6487ea
commit 79eb52c378
24 changed files with 10984 additions and 18 deletions
--- a/crates/xenia-gpu/src/texture_cache.rs
+++ b/crates/xenia-gpu/src/texture_cache.rs
@@ -0,0 +1,970 @@
+//! Texture cache — P5.
+//!
+//! Two-layer design mirroring canary's `TextureCache`:
+//!
+//!  * **CPU layer** (this module): owns decoded, linear, host-endian texel
+//!    byte buffers keyed by [`TextureKey`]. `ensure_cached` compares the
+//!    caller-supplied guest-memory page version against the cached entry's
+//!    watermark and re-decodes on miss or staleness.
+//!  * **GPU layer** (xenia-ui `texture_cache_host`): owns the
+//!    `wgpu::Texture` + `TextureView` for each cached key; pulls decoded
+//!    bytes from this CPU layer on upload.
+//!
+//! Canary references: `texture_cache.h/.cc`, `texture_info.cc`, and
+//! `texture_info_formats.inl` for the format table.
+
+use std::collections::HashMap;
+
+use crate::tiled_address;
+
+/// Xenos texture formats — `xenos::TextureFormat` at `xenos.h:489-579`.
+/// Values are the raw enum numbers the guest writes into
+/// `xe_gpu_texture_fetch_t.format`.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+#[repr(u8)]
+pub enum TextureFormat {
+    K1Reverse = 0,
+    K1 = 1,
+    K8 = 2,
+    K1555 = 3,
+    K565 = 4,
+    K6_5_5 = 5,
+    K8888 = 6,
+    K1010102 = 7,
+    K8_8 = 10,
+    K4_4_4_4 = 15,
+    K10_11_11 = 16,
+    K11_11_10 = 17,
+    Dxt1 = 18,
+    Dxt2_3 = 19,
+    Dxt4_5 = 20,
+    K24_8 = 22,
+    K24_8Float = 23,
+    K16 = 24,
+    K16_16 = 25,
+    K16_16_16_16 = 26,
+    K16Float = 30,
+    K16_16Float = 31,
+    K16_16_16_16Float = 32,
+    K32 = 33,
+    K32_32 = 34,
+    K32_32_32_32 = 35,
+    K32Float = 36,
+    K32_32Float = 37,
+    K32_32_32_32Float = 38,
+    Unknown(u8), // format number with no named variant; `from_raw` stores the 6-bit-masked value
+}
+
+impl TextureFormat {
+    /// Map a raw fetch-constant format number onto a variant. Only the low
+    /// six bits of `v` are looked at; numbers without a named variant come
+    /// back as `Unknown` carrying the masked value.
+    pub fn from_raw(v: u8) -> Self {
+        match v & 0x3F {
+            0 => Self::K1Reverse,
+            1 => Self::K1,
+            2 => Self::K8,
+            3 => Self::K1555,
+            4 => Self::K565,
+            5 => Self::K6_5_5,
+            6 => Self::K8888,
+            7 => Self::K1010102,
+            10 => Self::K8_8,
+            15 => Self::K4_4_4_4,
+            16 => Self::K10_11_11,
+            17 => Self::K11_11_10,
+            18 => Self::Dxt1,
+            19 => Self::Dxt2_3,
+            20 => Self::Dxt4_5,
+            22 => Self::K24_8,
+            23 => Self::K24_8Float,
+            24 => Self::K16,
+            25 => Self::K16_16,
+            26 => Self::K16_16_16_16,
+            30 => Self::K16Float,
+            31 => Self::K16_16Float,
+            32 => Self::K16_16_16_16Float,
+            33 => Self::K32,
+            34 => Self::K32_32,
+            35 => Self::K32_32_32_32,
+            36 => Self::K32Float,
+            37 => Self::K32_32Float,
+            38 => Self::K32_32_32_32Float,
+            unnamed => Self::Unknown(unnamed),
+        }
+    }
+
+    /// Block geometry for this format: block width/height in texels plus
+    /// bytes per block. Uncompressed formats use 1×1 blocks; the DXT
+    /// formats use one 4×4 compressed block.
+    pub fn block_info(self) -> BlockInfo {
+        // (block side in texels, bytes per block). Blocks are always square.
+        let (side, bytes) = match self {
+            // The 1-bit formats are rounded up to one byte.
+            Self::K1Reverse | Self::K1 | Self::K8 => (1, 1),
+            Self::K1555
+            | Self::K565
+            | Self::K6_5_5
+            | Self::K4_4_4_4
+            | Self::K16
+            | Self::K16Float
+            | Self::K8_8 => (1, 2),
+            Self::K8888
+            | Self::K1010102
+            | Self::K10_11_11
+            | Self::K11_11_10
+            | Self::K24_8
+            | Self::K24_8Float
+            | Self::K16_16
+            | Self::K16_16Float
+            | Self::K32
+            | Self::K32Float => (1, 4),
+            Self::K16_16_16_16
+            | Self::K16_16_16_16Float
+            | Self::K32_32
+            | Self::K32_32Float => (1, 8),
+            Self::K32_32_32_32 | Self::K32_32_32_32Float => (1, 16),
+            Self::Dxt1 => (4, 8),
+            Self::Dxt2_3 | Self::Dxt4_5 => (4, 16),
+            // Safe-ish fallback for formats we do not know.
+            Self::Unknown(_) => (1, 4),
+        };
+        BlockInfo::new(side, side, bytes)
+    }
+
+    /// True iff the host cache wiring treats this format as supported:
+    /// `k_8_8_8_8`, `k_5_6_5` (CPU-expanded to `Rgba8Unorm` on decode),
+    /// `k_DXT1`, `k_DXT2_3` (BC2) and `k_DXT4_5` (BC3).
+    pub fn is_host_supported(self) -> bool {
+        matches!(
+            self,
+            Self::K8888 | Self::K565 | Self::Dxt1 | Self::Dxt2_3 | Self::Dxt4_5
+        )
+    }
+}
+
+/// Block geometry of a texture format: block size in texels and the number
+/// of bytes one block occupies.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub struct BlockInfo {
+    pub block_w: u8,
+    pub block_h: u8,
+    pub bytes_per_block: u8,
+}
+
+impl BlockInfo {
+    /// Bundle the three values as given; no validation is performed.
+    pub const fn new(block_w: u8, block_h: u8, bytes_per_block: u8) -> Self {
+        Self {
+            block_w,
+            block_h,
+            bytes_per_block,
+        }
+    }
+
+    /// Base-2 logarithm of `bytes_per_block` for the sizes 1, 2, 4, 8 and
+    /// 16; every other size reports 0.
+    pub fn log2_bpb(self) -> u32 {
+        let bpb = self.bytes_per_block;
+        if matches!(bpb, 1 | 2 | 4 | 8 | 16) {
+            // For a power of two the log is the index of its single set bit.
+            bpb.trailing_zeros()
+        } else {
+            0
+        }
+    }
+}
+
+/// Xenos `Endian` enum from `xenos.h:198-204`. 2-bit field in fetch dword 1.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub enum Endian {
+    None = 0,
+    Swap8In16 = 1,
+    Swap8In32 = 2,
+    Swap16In32 = 3,
+}
+
+impl Endian {
+    /// Decode the 2-bit endian field; bits above the low two are ignored.
+    pub fn from_raw(v: u8) -> Self {
+        // Indexed by the raw field value.
+        const BY_FIELD: [Endian; 4] = [
+            Endian::None,
+            Endian::Swap8In16,
+            Endian::Swap8In32,
+            Endian::Swap16In32,
+        ];
+        BY_FIELD[usize::from(v & 0x3)]
+    }
+
+    /// Apply this endian's byte swap to one 32-bit unit. Matches canary's
+    /// `shaders/endian.xesli:25-55` semantics; the WGSL translator pulls
+    /// the same mask-shift pattern.
+    pub fn swap32(self, v: u32) -> u32 {
+        match self {
+            Self::None => v,
+            Self::Swap8In16 => {
+                // Swap the two bytes inside each 16-bit half.
+                let moved_up = (v & 0x00FF_00FF) << 8;
+                let moved_down = (v >> 8) & 0x00FF_00FF;
+                moved_up | moved_down
+            }
+            // Full byte reversal.
+            Self::Swap8In32 => u32::from_be_bytes(v.to_le_bytes()),
+            // Exchange the two 16-bit halves.
+            Self::Swap16In32 => v.rotate_left(16),
+        }
+    }
+}
+
+/// Texture dimensionality (`xenos::DataDimension`).
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub enum Dimension {
+    D1 = 0,
+    D2 = 1,
+    D3Stacked = 2,
+    Cube = 3,
+}
+
+impl Dimension {
+    /// Decode the 2-bit dimension field; bits above the low two are ignored.
+    pub fn from_raw(v: u8) -> Self {
+        // Indexed by the raw field value.
+        const BY_FIELD: [Dimension; 4] = [
+            Dimension::D1,
+            Dimension::D2,
+            Dimension::D3Stacked,
+            Dimension::Cube,
+        ];
+        BY_FIELD[usize::from(v & 0x3)]
+    }
+}
+
+/// Identity of a cached texture. Matches canary's `TextureCache::TextureKey`
+/// at the semantic level — we exclude mip/border state for P5 since neither
+/// is populated yet.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub struct TextureKey {
+    /// Guest physical base (byte address — already shifted left by 12 from
+    /// the fetch-constant `base_address` field).
+    pub base_address: u32,
+    pub width: u16, // in texels; the fetch constant stores width - 1
+    pub height: u16, // in texels; `decode_fetch_constant` sets 1 for 1D textures
+    pub depth_or_slices: u16, // third size field of the fetch constant, plus one
+    pub format: TextureFormat,
+    pub endian: Endian,
+    pub dimension: Dimension,
+    pub tiled: bool, // bit 31 of fetch dword 0
+    /// Row pitch in texels: the fetch constant's pitch/32 field times 32,
+    /// raised to at least `width` by `decode_fetch_constant`. We keep the raw
+    /// texel count to avoid callers remembering to shift.
+    pub pitch_texels: u16,
+}
+
+/// Decode a 6-dword texture fetch constant (layout at `xenos.h:1229-1329`).
+///
+/// Fields read:
+///  * dword 0 — `type` (bits 0-1), pitch/32 (bits 22-30), tiled (bit 31);
+///  * dword 1 — format (bits 0-5), endian (bits 6-7), base address >> 12
+///    (bits 12-31);
+///  * dword 2 — dimension-dependent size fields, each stored minus one;
+///  * dword 5 — dimension (bits 9-10).
+///
+/// Returns `None` if `type` is not the texture-constant marker (2) — which
+/// also covers an unset, all-zero constant — or if a size field plus one
+/// does not fit the `u16` fields of [`TextureKey`] (only possible for the
+/// 24-bit 1D width).
+pub fn decode_fetch_constant(dwords: [u32; 6]) -> Option<TextureKey> {
+    let [d0, d1, d2, _, _, d5] = dwords;
+
+    // An all-zero constant has type 0, so this single check rejects both
+    // "unset" and "not a texture constant".
+    if d0 & 0x3 != 2 {
+        return None;
+    }
+
+    let pitch_5 = (d0 >> 22) & 0x1FF; // pitch/32 in texels
+    let tiled = (d0 >> 31) != 0;
+    let format = TextureFormat::from_raw((d1 & 0x3F) as u8);
+    let endian = Endian::from_raw(((d1 >> 6) & 0x3) as u8);
+    let base_address = d1 & !0xFFF; // base >> 12 field, back in byte units
+    let dim = Dimension::from_raw(((d5 >> 9) & 0x3) as u8);
+
+    // Size fields hold `value - 1`. Add the one in u32 and narrow with a
+    // check, so an oversized 1D width is rejected instead of being
+    // truncated (or overflowing the u16 increment).
+    let size = |shift: u32, mask: u32| u16::try_from(((d2 >> shift) & mask) + 1).ok();
+
+    let (width, height, depth) = match dim {
+        Dimension::D1 => (size(0, 0x00FF_FFFF)?, 1u16, 1u16),
+        Dimension::D2 => (size(0, 0x1FFF)?, size(13, 0x1FFF)?, size(26, 0x3F)?),
+        Dimension::D3Stacked | Dimension::Cube => {
+            (size(0, 0x7FF)?, size(11, 0x7FF)?, size(22, 0x3FF)?)
+        }
+    };
+
+    // 9-bit field × 32 is at most 16352, so the narrowing cast is lossless.
+    let pitch_texels = ((pitch_5 * 32) as u16).max(width);
+
+    Some(TextureKey {
+        base_address,
+        width,
+        height,
+        depth_or_slices: depth,
+        format,
+        endian,
+        dimension: dim,
+        tiled,
+        pitch_texels,
+    })
+}
+
+/// Decoded, linear, host-endian texture bytes ready for wgpu upload.
+#[derive(Debug, Clone)]
+pub struct CachedTexture {
+    pub key: TextureKey, // the key this entry was decoded for
+    pub version_when_uploaded: u64, // page version passed to `ensure_cached` at decode time
+    /// Tightly packed. Layout depends on `key.format`:
+    /// - `K8888`, `K565` → `width*height*4` bytes in Rgba8Unorm order.
+    /// - `Dxt1` / `Dxt2_3` / `Dxt4_5` → `ceil(w/4)*ceil(h/4)` raw BC blocks of
+    ///   8 / 16 / 16 bytes each, after dword-endian swap + block-level detile.
+    pub bytes: Vec<u8>,
+}
+
+impl CachedTexture {
+    pub fn byte_size(&self) -> usize { // length of `bytes`
+        self.bytes.len()
+    }
+}
+
+/// Errors that can happen during decode. The `ensure_cached` caller maps
+/// these to `gpu.texture.reject{reason}` metrics so the HUD surfaces when
+/// a texture fell back.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum DecodeError {
+    UnsupportedFormat, // `ensure_cached` has no decoder for the key's format
+    OutOfBounds, // the guest read came back short, or detiling failed
+    ZeroSize, // the key's width or height is zero
+}
+
+/// Read up to `len` bytes from guest memory starting at `addr` into a
+/// freshly-allocated buffer.
+///
+/// The read stops at the top of the 32-bit guest address space rather than
+/// wrapping around to address 0, so the returned buffer is shorter than
+/// `len` exactly when `addr + len` runs past `u32::MAX`. Callers detect a
+/// truncated read by comparing the returned length with `len`.
+///
+/// Bytes are fetched one at a time through `MemoryAccess::read_u8`, i.e.
+/// one dynamic dispatch per byte.
+pub fn read_guest_bytes(
+    mem: &dyn xenia_memory::MemoryAccess,
+    addr: u32,
+    len: usize,
+) -> Vec<u8> {
+    // Number of addresses in `addr..=u32::MAX`; computed in u64 because the
+    // count is 2^32 when `addr` is 0.
+    let reachable = u64::from(u32::MAX - addr) + 1;
+    let count = u64::try_from(len).map_or(reachable, |wanted| wanted.min(reachable));
+    // `count <= len`, so narrowing back to usize cannot truncate.
+    let count = count as usize;
+    // `i < count <= 2^32 - addr`, so neither the cast nor the add overflows.
+    (0..count)
+        .map(|i| mem.read_u8(addr + i as u32))
+        .collect()
+}
+
+/// Byte-swap `buf` in place, one 32-bit dword at a time, according to
+/// `endian`. Any trailing bytes that do not fill a whole dword (when
+/// `buf.len()` is not a multiple of 4) are left untouched.
+pub fn apply_endian_32(buf: &mut [u8], endian: Endian) {
+    if endian == Endian::None {
+        return;
+    }
+    for dword in buf.chunks_exact_mut(4) {
+        let value = u32::from_le_bytes([dword[0], dword[1], dword[2], dword[3]]);
+        dword.copy_from_slice(&endian.swap32(value).to_le_bytes());
+    }
+}
+
+/// Decode a k_8_8_8_8 texture out of guest memory into tightly packed
+/// `Rgba8Unorm` bytes.
+///
+/// Steps, in order: read `pitch × height` texels from `key.base_address`,
+/// apply the declared endian swap per dword, detile (or, for linear
+/// textures, copy row by row honoring the pitch), then exchange bytes 0
+/// and 2 of every texel (B↔R) so the result is RGBA for wgpu.
+///
+/// Errors: `ZeroSize` for a zero width/height, `OutOfBounds` when the guest
+/// read comes back short or detiling fails.
+pub fn decode_k8888_tiled(
+    key: &TextureKey,
+    mem: &dyn xenia_memory::MemoryAccess,
+) -> Result<Vec<u8>, DecodeError> {
+    const BYTES_PER_TEXEL: u32 = 4;
+
+    if key.width == 0 || key.height == 0 {
+        return Err(DecodeError::ZeroSize);
+    }
+    let width = u32::from(key.width);
+    let height = u32::from(key.height);
+    let pitch = tiled_address::align_pitch_to_macro_tile(u32::from(key.pitch_texels));
+
+    let expected_len = (pitch * height * BYTES_PER_TEXEL) as usize;
+    let mut guest = read_guest_bytes(mem, key.base_address, expected_len);
+    if guest.len() < expected_len {
+        return Err(DecodeError::OutOfBounds);
+    }
+    apply_endian_32(&mut guest, key.endian);
+
+    let mut texels = vec![0u8; (width * height * BYTES_PER_TEXEL) as usize];
+    if key.tiled {
+        tiled_address::detile_2d(&guest, &mut texels, width, height, pitch, BYTES_PER_TEXEL)
+            .map_err(|_| DecodeError::OutOfBounds)?;
+    } else {
+        // Linear source: drop the per-row padding between `width` and `pitch`.
+        let row_len = width as usize * 4;
+        let src_stride = pitch as usize * 4;
+        for y in 0..height as usize {
+            let (src, dst) = (y * src_stride, y * row_len);
+            texels[dst..dst + row_len].copy_from_slice(&guest[src..src + row_len]);
+        }
+    }
+
+    // After the endian swap the texel bytes are in BGRA order on the host;
+    // exchanging B and R hands Rgba8Unorm to wgpu.
+    for texel in texels.chunks_exact_mut(4) {
+        texel.swap(0, 2);
+    }
+    Ok(texels)
+}
+
+/// Decode a DXT-compressed texture to raw block bytes (no format
+/// conversion — wgpu understands `Bc{1,2,3}RgbaUnorm` natively so the
+/// GPU does the actual decompression on upload).
+///
+/// All sizes are counted in 4×4 blocks: the texture is
+/// `ceil(width/4) × ceil(height/4)` blocks and the pitch is
+/// `ceil(pitch_texels/4)` blocks, macro-tile aligned. `bytes_per_block`
+/// is 8 for BC1 (DXT1), 16 for BC2 (DXT2_3) and BC3 (DXT4_5).
+///
+/// Errors: `ZeroSize` for a zero width/height, `OutOfBounds` when the guest
+/// read comes back short or detiling fails.
+pub fn decode_dxt_tiled(
+    key: &TextureKey,
+    mem: &dyn xenia_memory::MemoryAccess,
+    bytes_per_block: u32,
+) -> Result<Vec<u8>, DecodeError> {
+    const BLOCK_SIDE: u32 = 4;
+
+    if key.width == 0 || key.height == 0 {
+        return Err(DecodeError::ZeroSize);
+    }
+    let blocks_wide = u32::from(key.width).div_ceil(BLOCK_SIDE);
+    let blocks_high = u32::from(key.height).div_ceil(BLOCK_SIDE);
+    let pitch_in_blocks = u32::from(key.pitch_texels).div_ceil(BLOCK_SIDE);
+    let pitch_blocks = tiled_address::align_pitch_to_macro_tile(pitch_in_blocks);
+
+    let expected_len = (pitch_blocks * blocks_high * bytes_per_block) as usize;
+    let mut guest = read_guest_bytes(mem, key.base_address, expected_len);
+    if guest.len() < expected_len {
+        return Err(DecodeError::OutOfBounds);
+    }
+    // The block payload is made of u16/u32-wide fields, so the declared
+    // endian swap is applied per dword for every BC-family format.
+    apply_endian_32(&mut guest, key.endian);
+
+    let mut blocks = vec![0u8; (blocks_wide * blocks_high * bytes_per_block) as usize];
+    if key.tiled {
+        tiled_address::detile_2d(
+            &guest,
+            &mut blocks,
+            blocks_wide,
+            blocks_high,
+            pitch_blocks,
+            bytes_per_block,
+        )
+        .map_err(|_| DecodeError::OutOfBounds)?;
+    } else {
+        // Linear source: copy each block row, skipping the pitch padding.
+        let row_len = blocks_wide as usize * bytes_per_block as usize;
+        let src_stride = pitch_blocks as usize * bytes_per_block as usize;
+        for y in 0..blocks_high as usize {
+            let (src, dst) = (y * src_stride, y * row_len);
+            blocks[dst..dst + row_len].copy_from_slice(&guest[src..src + row_len]);
+        }
+    }
+    Ok(blocks)
+}
+
+/// BC1 / DXT1 — [`decode_dxt_tiled`] with 8-byte blocks.
+pub fn decode_dxt1_tiled(
+    key: &TextureKey,
+    mem: &dyn xenia_memory::MemoryAccess,
+) -> Result<Vec<u8>, DecodeError> {
+    const BC1_BLOCK_BYTES: u32 = 8;
+    decode_dxt_tiled(key, mem, BC1_BLOCK_BYTES)
+}
+
+/// BC2 / DXT2_3 — [`decode_dxt_tiled`] with 16-byte blocks.
+pub fn decode_dxt23_tiled(
+    key: &TextureKey,
+    mem: &dyn xenia_memory::MemoryAccess,
+) -> Result<Vec<u8>, DecodeError> {
+    const BC2_BLOCK_BYTES: u32 = 16;
+    decode_dxt_tiled(key, mem, BC2_BLOCK_BYTES)
+}
+
+/// BC3 / DXT4_5 — [`decode_dxt_tiled`] with 16-byte blocks.
+pub fn decode_dxt45_tiled(
+    key: &TextureKey,
+    mem: &dyn xenia_memory::MemoryAccess,
+) -> Result<Vec<u8>, DecodeError> {
+    const BC3_BLOCK_BYTES: u32 = 16;
+    decode_dxt_tiled(key, mem, BC3_BLOCK_BYTES)
+}
+
+/// **k_5_6_5** — 16-bit R:5 G:6 B:5 per texel (R in the high 5 bits of the
+/// 16-bit word). Each texel is unpacked into 4 bytes of `Rgba8Unorm` with
+/// A = 0xFF; the conversion happens on the CPU, at the cost of 2× the
+/// texture memory.
+///
+/// Steps, in order: read `pitch × height` 2-byte texels, apply the declared
+/// endian swap per dword, detile at texel granularity (or copy row by row
+/// for linear textures), then expand every 16-bit word to RGBA8.
+///
+/// Errors: `ZeroSize` for a zero width/height, `OutOfBounds` when the guest
+/// read comes back short or detiling fails.
+pub fn decode_k565_tiled(
+    key: &TextureKey,
+    mem: &dyn xenia_memory::MemoryAccess,
+) -> Result<Vec<u8>, DecodeError> {
+    if key.width == 0 || key.height == 0 {
+        return Err(DecodeError::ZeroSize);
+    }
+    let width = u32::from(key.width);
+    let height = u32::from(key.height);
+    // Block = 1 texel = 2 bytes, so the pitch is counted in texels.
+    let pitch = tiled_address::align_pitch_to_macro_tile(u32::from(key.pitch_texels));
+
+    let expected_len = (pitch * height * 2) as usize;
+    let mut guest = read_guest_bytes(mem, key.base_address, expected_len);
+    if guest.len() < expected_len {
+        return Err(DecodeError::OutOfBounds);
+    }
+    // 16-bit word order is endian-swap-sensitive.
+    apply_endian_32(&mut guest, key.endian);
+
+    // Step 1: bring the 2-byte texels into tightly packed linear order.
+    let mut packed = vec![0u8; (width * height * 2) as usize];
+    if key.tiled {
+        tiled_address::detile_2d(&guest, &mut packed, width, height, pitch, 2)
+            .map_err(|_| DecodeError::OutOfBounds)?;
+    } else {
+        let row_len = width as usize * 2;
+        let src_stride = pitch as usize * 2;
+        for y in 0..height as usize {
+            let (src, dst) = (y * src_stride, y * row_len);
+            packed[dst..dst + row_len].copy_from_slice(&guest[src..src + row_len]);
+        }
+    }
+
+    // Step 2: widen each texel. The words are read as little-endian u16s.
+    // Channels are expanded by replicating their high bits into the low
+    // bits, so a full 5- or 6-bit channel maps to 0xFF.
+    let widen5 = |c: u8| (c << 3) | (c >> 2);
+    let widen6 = |c: u8| (c << 2) | (c >> 4);
+
+    let mut rgba = vec![0u8; (width * height * 4) as usize];
+    let texel_count = width as usize * height as usize;
+    for i in 0..texel_count {
+        let src = i * 2;
+        let word = u16::from_le_bytes([packed[src], packed[src + 1]]);
+        // R in bits 11-15, G in bits 5-10, B in bits 0-4.
+        let red = widen5(((word >> 11) & 0x1F) as u8);
+        let green = widen6(((word >> 5) & 0x3F) as u8);
+        let blue = widen5((word & 0x1F) as u8);
+        let dst = i * 4;
+        rgba[dst..dst + 4].copy_from_slice(&[red, green, blue, 0xFF]);
+    }
+    Ok(rgba)
+}
+
+/// Version-aware CPU-side texture cache. Entries are keyed on
+/// [`TextureKey`] and carry a `version_when_uploaded` watermark. The caller
+/// of `ensure_cached` supplies the guest memory's current page version for
+/// the texture's byte span; if that version is newer than the watermark,
+/// the entry is re-decoded.
+pub struct TextureCache {
+    entries: HashMap<TextureKey, CachedTexture>,
+    /// Monotonic counter of successful decodes — HUD surface.
+    pub decodes_total: u64,
+    /// Count of lookups that found an entry older than the current version.
+    pub restale_total: u64,
+}
+
+impl Default for TextureCache {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl TextureCache {
+    /// Create an empty cache with both counters at zero.
+    pub fn new() -> Self {
+        Self {
+            entries: HashMap::new(),
+            decodes_total: 0,
+            restale_total: 0,
+        }
+    }
+
+    /// Number of cached textures.
+    pub fn len(&self) -> usize {
+        self.entries.len()
+    }
+
+    /// True when nothing is cached.
+    pub fn is_empty(&self) -> bool {
+        self.entries.is_empty()
+    }
+
+    /// Look up an entry without any freshness check or decode.
+    pub fn get(&self, key: &TextureKey) -> Option<&CachedTexture> {
+        self.entries.get(key)
+    }
+
+    /// Return a cached (or freshly-decoded) texture. The caller supplies
+    /// the current guest-memory page version covering the texture span.
+    ///
+    /// An entry whose watermark is at least `current_version` is returned
+    /// as-is. Otherwise the texture is decoded from `mem` and stored with
+    /// `current_version` as its new watermark. A decode error is passed
+    /// through and leaves any existing entry in place.
+    pub fn ensure_cached(
+        &mut self,
+        key: TextureKey,
+        current_version: u64,
+        mem: &dyn xenia_memory::MemoryAccess,
+    ) -> Result<&CachedTexture, DecodeError> {
+        use std::collections::hash_map::Entry;
+
+        // Fast path: a fresh entry exists. Only the (Copy) version is held
+        // across the check so the map can be borrowed again afterwards.
+        let cached_version = self.entries.get(&key).map(|hit| hit.version_when_uploaded);
+        match cached_version {
+            Some(version) if version >= current_version => return Ok(&self.entries[&key]),
+            Some(_) => self.restale_total += 1,
+            None => {}
+        }
+
+        let bytes = match key.format {
+            TextureFormat::K8888 => decode_k8888_tiled(&key, mem),
+            TextureFormat::K565 => decode_k565_tiled(&key, mem),
+            TextureFormat::Dxt1 => decode_dxt1_tiled(&key, mem),
+            TextureFormat::Dxt2_3 => decode_dxt23_tiled(&key, mem),
+            TextureFormat::Dxt4_5 => decode_dxt45_tiled(&key, mem),
+            _ => Err(DecodeError::UnsupportedFormat),
+        }?;
+        self.decodes_total += 1;
+
+        let fresh = CachedTexture {
+            key,
+            version_when_uploaded: current_version,
+            bytes,
+        };
+        let slot = match self.entries.entry(key) {
+            Entry::Occupied(mut stale) => {
+                stale.insert(fresh);
+                stale.into_mut()
+            }
+            Entry::Vacant(empty) => empty.insert(fresh),
+        };
+        Ok(&*slot)
+    }
+
+    /// Total number of decoded bytes currently held across all entries.
+    pub fn byte_budget(&self) -> usize {
+        self.entries.values().map(CachedTexture::byte_size).sum()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    use std::cell::Cell;
+
+    struct FakeMem(Box<[Cell<u8>]>);
+    impl FakeMem {
+        fn from_vec(v: Vec<u8>) -> Self {
+            FakeMem(v.into_iter().map(Cell::new).collect())
+        }
+    }
+    impl xenia_memory::MemoryAccess for FakeMem {
+        fn read_u8(&self, a: u32) -> u8 {
+            self.0.get(a as usize).map(|c| c.get()).unwrap_or(0)
+        }
+        fn read_u16(&self, a: u32) -> u16 {
+            u16::from_be_bytes([self.read_u8(a), self.read_u8(a + 1)])
+        }
+        fn read_u32(&self, a: u32) -> u32 {
+            u32::from_be_bytes([
+                self.read_u8(a),
+                self.read_u8(a + 1),
+                self.read_u8(a + 2),
+                self.read_u8(a + 3),
+            ])
+        }
+        fn read_u64(&self, a: u32) -> u64 {
+            u64::from_be_bytes([
+                self.read_u8(a),
+                self.read_u8(a + 1),
+                self.read_u8(a + 2),
+                self.read_u8(a + 3),
+                self.read_u8(a + 4),
+                self.read_u8(a + 5),
+                self.read_u8(a + 6),
+                self.read_u8(a + 7),
+            ])
+        }
+        fn write_u8(&self, a: u32, v: u8) {
+            if let Some(slot) = self.0.get(a as usize) {
+                slot.set(v);
+            }
+        }
+        fn write_u16(&self, a: u32, v: u16) {
+            let b = v.to_be_bytes();
+            self.write_u8(a, b[0]);
+            self.write_u8(a + 1, b[1]);
+        }
+        fn write_u32(&self, a: u32, v: u32) {
+            let b = v.to_be_bytes();
+            for i in 0..4 {
+                self.write_u8(a + i as u32, b[i]);
+            }
+        }
+        fn write_u64(&self, a: u32, v: u64) {
+            let b = v.to_be_bytes();
+            for i in 0..8 {
+                self.write_u8(a + i as u32, b[i]);
+            }
+        }
+        fn translate(&self, _: u32) -> Option<*const u8> {
+            None
+        }
+        fn translate_mut(&self, _: u32) -> Option<*mut u8> {
+            None
+        }
+    }
+
+    #[test]
+    fn format_block_info_matches_canary_expectations() {
+        assert_eq!(
+            TextureFormat::K8888.block_info(),
+            BlockInfo::new(1, 1, 4)
+        );
+        assert_eq!(TextureFormat::Dxt1.block_info(), BlockInfo::new(4, 4, 8));
+        assert_eq!(
+            TextureFormat::Dxt4_5.block_info(),
+            BlockInfo::new(4, 4, 16)
+        );
+    }
+
+    #[test]
+    fn endian_swap_variants() {
+        assert_eq!(Endian::None.swap32(0x11223344), 0x11223344);
+        assert_eq!(Endian::Swap8In16.swap32(0x11223344), 0x22114433);
+        assert_eq!(Endian::Swap8In32.swap32(0x11223344), 0x44332211);
+        assert_eq!(Endian::Swap16In32.swap32(0x11223344), 0x33441122);
+    }
+
+    #[test]
+    fn decode_fetch_constant_rejects_empty() {
+        let z = [0u32; 6];
+        assert!(decode_fetch_constant(z).is_none());
+    }
+
+    #[test]
+    fn decode_fetch_constant_parses_2d_k8888() {
+        // Build a synthetic k_8_8_8_8 2D texture fetch constant:
+        //   dword0: pitch_5=40 (1280/32), tiled=1, type=2
+        //   dword1: format=6 (K8888), endian=2 (Swap8In32), base=0xAB000>>12
+        //   dword2: width-1=1279, height-1=719
+        //   dword5: dimension=1 (2D)
+        let d0 = 0x8000_0000 | (40u32 << 22) | 2;
+        let d1 = (0xAB000u32 >> 12 << 12) | (2u32 << 6) | 6u32;
+        let d2 = 1279u32 | ((719u32) << 13);
+        let d5 = 1u32 << 9;
+        let k = decode_fetch_constant([d0, d1, d2, 0, 0, d5]).expect("parsed");
+        assert_eq!(k.format, TextureFormat::K8888);
+        assert_eq!(k.endian, Endian::Swap8In32);
+        assert_eq!(k.width, 1280);
+        assert_eq!(k.height, 720);
+        assert_eq!(k.dimension, Dimension::D2);
+        assert!(k.tiled);
+        assert_eq!(k.pitch_texels, 1280);
+    }
+
+    #[test]
+    fn decode_k8888_roundtrip_linear() {
+        // Build a 4×4 non-tiled image with pitch=32 (one macro-tile row).
+        // Each pixel at (x, y) stores ARGB = (0xFF, x, y, y*4+x) as a
+        // big-endian dword. After Swap8In32 + B↔R swizzle, out[off..] must
+        // be (x, y, y*4+x, 0xFF) in RGBA order.
+        let w = 4u32;
+        let h = 4u32;
+        let pitch = 32u32;
+        let mut bytes = vec![0u8; (pitch * h * 4) as usize];
+        for y in 0..h {
+            for x in 0..w {
+                let off = ((y * pitch + x) * 4) as usize;
+                let argb = (0xFFu32 << 24)
+                    | ((x as u32) << 16)
+                    | ((y as u32) << 8)
+                    | ((y * 4 + x) as u32);
+                bytes[off..off + 4].copy_from_slice(&argb.to_be_bytes());
+            }
+        }
+        let mem = FakeMem::from_vec(bytes);
+        let key = TextureKey {
+            base_address: 0,
+            width: 4,
+            height: 4,
+            depth_or_slices: 1,
+            format: TextureFormat::K8888,
+            endian: Endian::Swap8In32,
+            dimension: Dimension::D2,
+            tiled: false,
+            pitch_texels: pitch as u16,
+        };
+        let out = decode_k8888_tiled(&key, &mem).expect("decode");
+        assert_eq!(out.len(), 16 * 4);
+        assert_eq!(&out[0..4], &[0, 0, 0, 0xFF]);
+        let off = ((3 * 4 + 3) * 4) as usize;
+        assert_eq!(&out[off..off + 4], &[3, 3, 15, 0xFF]);
+    }
+
+    // ── First-Pixels M5 format tests ──────────────────────────────
+
+    /// BC2 (DXT2_3) roundtrip: 16-byte blocks, 4×4 image = 1 block.
+    /// The synthetic source block holds the bytes 0x00..=0x0F; assert the
+    /// decoder returns them unchanged (no endian swap, non-tiled copy).
+    #[test]
+    fn decode_dxt23_small_roundtrip() {
+        // 4×4 texture = 1 BC2 block (16 bytes). With pitch_texels=32
+        // (macro-tile-aligned) the block pitch is 8 (=32/4), and we
+        // allocate 8*1*16 = 128 bytes of source.
+        let mut bytes = vec![0u8; 128];
+        for (i, b) in bytes.iter_mut().enumerate().take(16) {
+            *b = i as u8;
+        }
+        let mem = FakeMem::from_vec(bytes);
+        let key = TextureKey {
+            base_address: 0,
+            width: 4,
+            height: 4,
+            depth_or_slices: 1,
+            format: TextureFormat::Dxt2_3,
+            endian: Endian::None, // no swap — we can eyeball passthrough
+            dimension: Dimension::D2,
+            tiled: false,
+            pitch_texels: 32,
+        };
+        let out = decode_dxt23_tiled(&key, &mem).expect("decode");
+        assert_eq!(out.len(), 16); // 1 block × 16 bytes
+        for i in 0..16 {
+            assert_eq!(out[i], i as u8);
+        }
+    }
+
+    /// BC3 (DXT4_5) uses the same 16-byte block infra as BC2; a
+    /// parallel test prevents a regression that sneaks up via the
+    /// generic `decode_dxt_tiled`.
+    #[test]
+    fn decode_dxt45_uses_16byte_blocks() {
+        let mem = FakeMem::from_vec(vec![0xAAu8; 256]);
+        let key = TextureKey {
+            base_address: 0,
+            width: 8,
+            height: 4, // 2×1 blocks
+            depth_or_slices: 1,
+            format: TextureFormat::Dxt4_5,
+            endian: Endian::None,
+            dimension: Dimension::D2,
+            tiled: false,
+            pitch_texels: 32,
+        };
+        let out = decode_dxt45_tiled(&key, &mem).expect("decode");
+        assert_eq!(out.len(), 2 * 16);
+    }
+
+    /// k_5_6_5: a single white texel (all bits set, 0xFFFF) should
+    /// expand to RGBA8 white (0xFF, 0xFF, 0xFF, 0xFF). A single pure-red
+    /// texel (R=31, G=0, B=0 → word 0xF800) should expand to R=255 G=0
+    /// B=0 via the high-bit-replicate convention.
+    #[test]
+    fn decode_k565_texel_expansion() {
+        // Memory layout for a 2×1 non-tiled k_5_6_5 image (pitch=32 texels
+        // → 32 × 1 × 2 = 64 bytes). We store texel[0] = 0xFFFF (white),
+        // texel[1] = 0xF800 (pure red).
+        let mut bytes = vec![0u8; 64];
+        // 0xFFFF
+        bytes[0] = 0xFF;
+        bytes[1] = 0xFF;
+        // 0xF800 (big-endian memory): high byte 0xF8, low 0x00.
+        // But after apply_endian_32(Endian::None) we use little-endian
+        // word decoding — so memory must carry the bytes in LE order.
+        bytes[2] = 0x00;
+        bytes[3] = 0xF8;
+        let mem = FakeMem::from_vec(bytes);
+        let key = TextureKey {
+            base_address: 0,
+            width: 2,
+            height: 1,
+            depth_or_slices: 1,
+            format: TextureFormat::K565,
+            endian: Endian::None,
+            dimension: Dimension::D2,
+            tiled: false,
+            pitch_texels: 32,
+        };
+        let out = decode_k565_tiled(&key, &mem).expect("decode");
+        assert_eq!(out.len(), 2 * 4);
+        // Texel 0: white.
+        assert_eq!(&out[0..4], &[0xFF, 0xFF, 0xFF, 0xFF]);
+        // Texel 1: pure red via 5-bit-expand (0b11111 → 0xFF).
+        assert_eq!(&out[4..8], &[0xFF, 0x00, 0x00, 0xFF]);
+    }
+
+    #[test]
+    fn is_host_supported_covers_m5_formats() {
+        assert!(TextureFormat::K8888.is_host_supported());
+        assert!(TextureFormat::K565.is_host_supported());
+        assert!(TextureFormat::Dxt1.is_host_supported());
+        assert!(TextureFormat::Dxt2_3.is_host_supported());
+        assert!(TextureFormat::Dxt4_5.is_host_supported());
+        // Unsupported formats should still report false.
+        assert!(!TextureFormat::K16.is_host_supported());
+        assert!(!TextureFormat::K32Float.is_host_supported());
+    }
+
+    #[test]
+    fn texture_cache_caches_and_reuses() {
+        let mut cache = TextureCache::new();
+        let mem = FakeMem::from_vec(vec![0u8; 8 * 1024]);
+        let key = TextureKey {
+            base_address: 0,
+            width: 4,
+            height: 4,
+            depth_or_slices: 1,
+            format: TextureFormat::K8888,
+            endian: Endian::None,
+            dimension: Dimension::D2,
+            tiled: false,
+            pitch_texels: 32,
+        };
+        cache.ensure_cached(key, 0, &mem).unwrap();
+        assert_eq!(cache.decodes_total, 1);
+        // Same version: should hit cache.
+        cache.ensure_cached(key, 0, &mem).unwrap();
+        assert_eq!(cache.decodes_total, 1);
+        // Higher version: stale → re-decode.
+        cache.ensure_cached(key, 1, &mem).unwrap();
+        assert_eq!(cache.decodes_total, 2);
+        assert_eq!(cache.restale_total, 1);
+    }
+
+    /// End-to-end P5 test: a 6-dword fetch constant → decoded `TextureKey`
+    /// → `ensure_cached` on fresh/version-bumped memory → stale re-decode.
+    /// Mirrors what `vd_swap` does per frame.
+    #[test]
+    fn e2e_fetch_const_to_cache_with_versioning() {
+        // 4×4 k_8_8_8_8 2D tiled texture, pitch=32 aligned. The base-address
+        // field only keeps bits 12..31, so the `0x100` below decodes to
+        // base 0 — which is where the fake memory's bytes live.
+        let d0 = 0x8000_0000u32 | (1u32 << 22) | 2; // pitch_5=1, tiled, type=2
+        let d1 = (0x100u32 >> 12 << 12) | (0u32 << 6) | 6; // K8888, endian=none
+        let d2 = 3u32 | (3u32 << 13); // width-1=3, height-1=3
+        let d5 = 1u32 << 9; // 2D
+        let key = decode_fetch_constant([d0, d1, d2, 0, 0, d5]).expect("decoded");
+        assert_eq!(key.format, TextureFormat::K8888);
+        assert_eq!(key.width, 4);
+        assert_eq!(key.base_address, 0);
+
+        // `FakeMem` stores `Cell<u8>`s, so the binding itself needs no `mut`.
+        let mem = FakeMem::from_vec(vec![0xAAu8; 4 * 1024]);
+        let mut cache = TextureCache::new();
+        // v0 decode.
+        let first = cache
+            .ensure_cached(key, 0, &mem)
+            .expect("initial decode")
+            .clone();
+        // Same version → cache hit.
+        cache.ensure_cached(key, 0, &mem).expect("hit");
+        assert_eq!(cache.decodes_total, 1);
+        // Simulate the guest writing to the texture's pages: version bumps.
+        for b in &mem.0[..16] {
+            b.set(0xFF);
+        }
+        cache.ensure_cached(key, 1, &mem).expect("re-decode");
+        assert_eq!(cache.decodes_total, 2);
+        assert_eq!(cache.restale_total, 1);
+        // Bytes differ from v0 (proof the re-decode happened).
+        let second = cache.get(&key).unwrap();
+        assert_ne!(first.bytes, second.bytes);
+    }
+
+    #[test]
+    fn texture_cache_rejects_unsupported_format() {
+        let mut cache = TextureCache::new();
+        let mem = FakeMem::from_vec(vec![0u8; 1024]);
+        let key = TextureKey {
+            base_address: 0,
+            width: 4,
+            height: 4,
+            depth_or_slices: 1,
+            format: TextureFormat::K16,
+            endian: Endian::None,
+            dimension: Dimension::D2,
+            tiled: false,
+            pitch_texels: 32,
+        };
+        assert!(matches!(
+            cache.ensure_cached(key, 0, &mem),
+            Err(DecodeError::UnsupportedFormat)
+        ));
+    }
+}