xenia-rs/crates/xenia-gpu/src/texture_cache.rs

//! Texture cache — P5.
//!
//! Two-layer design mirroring canary's `TextureCache`:
//!
//!  * **CPU layer** (this module): owns decoded, linear, host-endian texel
//!    byte buffers keyed by [`TextureKey`]. `ensure_cached` compares the
//!    caller-supplied guest-memory page version against the version stored
//!    with the entry to decide whether the cached bytes are still fresh,
//!    and re-decodes on miss or staleness.
//!  * **GPU layer** (xenia-ui `texture_cache_host`): owns the
//!    `wgpu::Texture` + `TextureView` for each cached key; pulls decoded
//!    bytes from this CPU layer on upload.
//!
//! Canary references: `texture_cache.h/.cc`, `texture_info.cc`, and
//! `texture_info_formats.inl` for the format table.

use std::collections::HashMap;

use crate::tiled_address;

/// Xenos texture formats — `xenos::TextureFormat` at `xenos.h:489-579`.
/// Values are the raw enum numbers the guest writes into
/// `xe_gpu_texture_fetch_t.format`.
///
/// Only the formats named here are recognised; any other raw number is
/// carried in [`TextureFormat::Unknown`] (see [`TextureFormat::from_raw`]).
/// Per-format block geometry lives in [`TextureFormat::block_info`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[repr(u8)]
pub enum TextureFormat {
    K1Reverse = 0,
    K1 = 1,
    K8 = 2,
    K1555 = 3,
    /// 16-bit 5:6:5; `decode_k565_tiled` expands it to 4-byte RGBA.
    K565 = 4,
    K6_5_5 = 5,
    /// 32-bit 8:8:8:8; decoded by `decode_k8888_tiled`.
    K8888 = 6,
    K1010102 = 7,
    K8_8 = 10,
    K4_4_4_4 = 15,
    K10_11_11 = 16,
    K11_11_10 = 17,
    /// 4×4 texel blocks, 8 bytes per block.
    Dxt1 = 18,
    /// 4×4 texel blocks, 16 bytes per block.
    Dxt2_3 = 19,
    /// 4×4 texel blocks, 16 bytes per block.
    Dxt4_5 = 20,
    K24_8 = 22,
    K24_8Float = 23,
    K16 = 24,
    K16_16 = 25,
    K16_16_16_16 = 26,
    K16Float = 30,
    K16_16Float = 31,
    K16_16_16_16Float = 32,
    K32 = 33,
    K32_32 = 34,
    K32_32_32_32 = 35,
    K32Float = 36,
    K32_32Float = 37,
    K32_32_32_32Float = 38,
    /// A raw format number that has no named variant above.
    Unknown(u8),
}

impl TextureFormat {
    pub fn from_raw(v: u8) -> Self {
        use TextureFormat::*;
        match v & 0x3F {
            0 => K1Reverse,
            1 => K1,
            2 => K8,
            3 => K1555,
            4 => K565,
            5 => K6_5_5,
            6 => K8888,
            7 => K1010102,
            10 => K8_8,
            15 => K4_4_4_4,
            16 => K10_11_11,
            17 => K11_11_10,
            18 => Dxt1,
            19 => Dxt2_3,
            20 => Dxt4_5,
            22 => K24_8,
            23 => K24_8Float,
            24 => K16,
            25 => K16_16,
            26 => K16_16_16_16,
            30 => K16Float,
            31 => K16_16Float,
            32 => K16_16_16_16Float,
            33 => K32,
            34 => K32_32,
            35 => K32_32_32_32,
            36 => K32Float,
            37 => K32_32Float,
            38 => K32_32_32_32Float,
            other => Unknown(other),
        }
    }

    /// Block width/height in texels + bytes-per-block. For uncompressed
    /// formats block_w = block_h = 1. For DXT formats block_w = block_h =
    /// 4 (one 4×4 compressed block).
    pub fn block_info(self) -> BlockInfo {
        use TextureFormat::*;
        match self {
            K1Reverse | K1 => BlockInfo::new(1, 1, 1), // round up to 1 byte
            K8 => BlockInfo::new(1, 1, 1),
            K1555 | K565 | K6_5_5 | K4_4_4_4 | K16 | K16Float | K8_8 => BlockInfo::new(1, 1, 2),
            K8888 | K1010102 | K10_11_11 | K11_11_10 | K24_8 | K24_8Float | K16_16
            | K16_16Float | K32 | K32Float => BlockInfo::new(1, 1, 4),
            K16_16_16_16 | K16_16_16_16Float | K32_32 | K32_32Float => BlockInfo::new(1, 1, 8),
            K32_32_32_32 | K32_32_32_32Float => BlockInfo::new(1, 1, 16),
            Dxt1 => BlockInfo::new(4, 4, 8),
            Dxt2_3 | Dxt4_5 => BlockInfo::new(4, 4, 16),
            Unknown(_) => BlockInfo::new(1, 1, 4), // safe-ish fallback
        }
    }

    /// True iff this format lands on a wgpu texture format we can
    /// natively bind — no CPU-side conversion per frame required. M5
    /// adds `k_5_6_5` (CPU-expanded to `Rgba8Unorm` on decode; still
    /// counts as supported for the host-cache wiring), `k_DXT2_3`
    /// (BC2), and `k_DXT4_5` (BC3).
    pub fn is_host_supported(self) -> bool {
        matches!(
            self,
            TextureFormat::K8888
                | TextureFormat::K565
                | TextureFormat::Dxt1
                | TextureFormat::Dxt2_3
                | TextureFormat::Dxt4_5
        )
    }
}

/// Geometry of one storage block of a texture format: its footprint in
/// texels and its size in bytes.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct BlockInfo {
    /// Block width in texels.
    pub block_w: u8,
    /// Block height in texels.
    pub block_h: u8,
    /// Size of one block in bytes.
    pub bytes_per_block: u8,
}

impl BlockInfo {
    /// Bundle the three values into a `BlockInfo`; usable in `const` context.
    pub const fn new(block_w: u8, block_h: u8, bytes_per_block: u8) -> Self {
        BlockInfo {
            block_w,
            block_h,
            bytes_per_block,
        }
    }

    /// Base-2 logarithm of `bytes_per_block` for the sizes 1, 2, 4, 8 and
    /// 16. Every other size (including 0 and larger powers of two) yields 0.
    pub fn log2_bpb(self) -> u32 {
        let bpb = self.bytes_per_block;
        if bpb.is_power_of_two() && bpb <= 16 {
            bpb.trailing_zeros()
        } else {
            0
        }
    }
}

/// Xenos `Endian` enum from `xenos.h:198-204`. 2-bit field in fetch dword 1.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Endian {
    /// Leave the data as-is.
    None = 0,
    /// Swap the two bytes inside each 16-bit half.
    Swap8In16 = 1,
    /// Reverse all four bytes of the 32-bit unit.
    Swap8In32 = 2,
    /// Exchange the two 16-bit halves.
    Swap16In32 = 3,
}

impl Endian {
    /// Decode the 2-bit endian field; bits above the low two are ignored.
    pub fn from_raw(v: u8) -> Self {
        // Table indexed by the masked field value (always 0..=3).
        const MODES: [Endian; 4] = [
            Endian::None,
            Endian::Swap8In16,
            Endian::Swap8In32,
            Endian::Swap16In32,
        ];
        MODES[usize::from(v & 0x3)]
    }

    /// Apply this mode's swap to a single 32-bit unit. Matches canary's
    /// `shaders/endian.xesli:25-55` semantics.
    pub fn swap32(self, v: u32) -> u32 {
        match self {
            Self::None => v,
            Self::Swap8In16 => {
                let [b3, b2, b1, b0] = v.to_be_bytes();
                u32::from_be_bytes([b2, b3, b0, b1])
            }
            // Reading the little-endian bytes back as big-endian reverses them.
            Self::Swap8In32 => u32::from_be_bytes(v.to_le_bytes()),
            Self::Swap16In32 => (v << 16) | (v >> 16),
        }
    }
}

/// Texture dimensionality (`xenos::DataDimension`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Dimension {
    D1 = 0,
    D2 = 1,
    D3Stacked = 2,
    Cube = 3,
}

impl Dimension {
    /// Decode the 2-bit dimension field; bits above the low two are ignored.
    pub fn from_raw(v: u8) -> Self {
        match v & 0b11 {
            0 => Self::D1,
            1 => Self::D2,
            2 => Self::D3Stacked,
            // The mask leaves only 3 as the remaining value.
            _ => Self::Cube,
        }
    }
}

/// Identity of a cached texture. Matches canary's `TextureCache::TextureKey`
/// at the semantic level — we exclude mip/border state for P5 since neither
/// is populated yet.
///
/// Used as the `HashMap` key of `TextureCache`, so two fetch constants that
/// decode to equal fields share one cache entry.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct TextureKey {
    /// Guest physical base (byte address — already shifted left by 12 from
    /// the fetch-constant `base_address` field).
    pub base_address: u32,
    /// Width in texels (the fetch constant stores width − 1).
    pub width: u16,
    /// Height in texels (the fetch constant stores height − 1); 1 for 1D.
    pub height: u16,
    /// Third size component from the fetch constant (stored value + 1);
    /// 1 for 1D. The decoders in this module do not read it.
    pub depth_or_slices: u16,
    /// Texel format; selects the decoder in `TextureCache::ensure_cached`.
    pub format: TextureFormat,
    /// Byte-swap mode applied to every 32-bit unit of the guest data.
    pub endian: Endian,
    /// Dimensionality taken from fetch dword 5.
    pub dimension: Dimension,
    /// Whether the guest data is stored tiled (fetch dword 0, bit 31).
    pub tiled: bool,
    /// Row pitch in texels. `decode_fetch_constant` produces the constant's
    /// pitch field × 32, raised to `width` when that is larger, so the
    /// value is not guaranteed to be a multiple of 32; the decoders align
    /// it again before use. Canary stores pitch/32 in the fetch constant;
    /// we keep the raw texel count to avoid callers remembering to shift.
    pub pitch_texels: u16,
}

/// Decode a 6-dword texture fetch constant (layout at `xenos.h:1229-1329`).
///
/// Returns `None` when the low two bits of dword 0 (`type`) are not `2`,
/// the texture-constant marker. An unset, all-zero constant reads as type 0
/// (the vertex-fetch marker) and is rejected by the same check.
///
/// Fields consumed:
/// * dword 0 — `type` (bits 0-1), pitch / 32 (bits 22-30), tiled (bit 31).
/// * dword 1 — format (bits 0-5), endian (bits 6-7), base address >> 12
///   (bits 12-31).
/// * dword 2 — size, packed per dimension; each component is stored
///   minus one.
/// * dword 5 — dimension (bits 9-10).
///
/// The 1D width field is 24 bits wide while [`TextureKey`] stores sizes as
/// `u16`: a width above `u16::MAX` is clamped to `u16::MAX` instead of
/// being silently truncated (or overflowing on the `+ 1`).
pub fn decode_fetch_constant(dwords: [u32; 6]) -> Option<TextureKey> {
    let [d0, d1, d2, _, _, d5] = dwords;

    // type: 0 = vertex, 2 = texture. Anything but 2 is not ours to decode.
    if d0 & 0x3 != 2 {
        return None;
    }

    let pitch_5 = (d0 >> 22) & 0x1FF; // pitch/32 in texels
    let tiled = ((d0 >> 31) & 1) != 0;
    let format = TextureFormat::from_raw((d1 & 0x3F) as u8);
    let endian = Endian::from_raw(((d1 >> 6) & 0x3) as u8);
    let base_address = (d1 >> 12) << 12; // base >> 12, re-shifted.
    let dim = Dimension::from_raw(((d5 >> 9) & 0x3) as u8);

    // Extract one "size minus one" field from dword 2 and convert it to a
    // texel count. The addition happens in u32 (the widest field is 24
    // bits, so it cannot overflow) and the result saturates at u16::MAX.
    let size_field = |shift: u32, mask: u32| -> u16 {
        let texels = ((d2 >> shift) & mask) + 1;
        u16::try_from(texels).unwrap_or(u16::MAX)
    };

    // Size decode depends on dimension.
    let (width, height, depth) = match dim {
        Dimension::D1 => (size_field(0, 0x00FF_FFFF), 1u16, 1u16),
        Dimension::D2 => (
            size_field(0, 0x1FFF),
            size_field(13, 0x1FFF),
            size_field(26, 0x3F),
        ),
        Dimension::D3Stacked | Dimension::Cube => (
            size_field(0, 0x7FF),
            size_field(11, 0x7FF),
            size_field(22, 0x3FF),
        ),
    };

    // 9-bit field × 32 is at most 16352, so the conversion never saturates;
    // the pitch is then raised to the width if the constant under-reports it.
    let pitch_texels = u16::try_from(pitch_5 * 32)
        .unwrap_or(u16::MAX)
        .max(width);

    Some(TextureKey {
        base_address,
        width,
        height,
        depth_or_slices: depth,
        format,
        endian,
        dimension: dim,
        tiled,
        pitch_texels,
    })
}

/// Decoded, linear, host-endian texture bytes ready for wgpu upload.
#[derive(Debug, Clone)]
pub struct CachedTexture {
    /// The key this entry was decoded for.
    pub key: TextureKey,
    /// The `current_version` value passed to `ensure_cached` when these
    /// bytes were decoded; an entry is reused while this is `>=` the
    /// version supplied by the caller.
    pub version_when_uploaded: u64,
    /// Tightly packed. Layout depends on `key.format`:
    /// - `K8888` → `width*height*4` bytes in Rgba8Unorm order.
    /// - `K565`  → `width*height*4` bytes, each texel expanded to RGBA
    ///   with alpha `0xFF`.
    /// - `Dxt1`  → `ceil(w/4)*ceil(h/4)*8` bytes of raw BC1 blocks, after
    ///   block-level detile + dword-endian swap.
    /// - `Dxt2_3` / `Dxt4_5` → `ceil(w/4)*ceil(h/4)*16` bytes of raw
    ///   blocks, processed the same way as `Dxt1`.
    pub bytes: Vec<u8>,
}

impl CachedTexture {
    /// Number of decoded bytes held by this entry (`bytes.len()`).
    pub fn byte_size(&self) -> usize {
        self.bytes.len()
    }
}

/// Errors that can happen during decode. The `ensure_cached` caller maps
/// these to `gpu.texture.reject{reason}` metrics so the HUD surfaces when
/// a texture fell back.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DecodeError {
    /// `ensure_cached` has no decoder for the key's format.
    UnsupportedFormat,
    /// The guest read came back shorter than the texture's byte span, or
    /// the detile step reported an error.
    OutOfBounds,
    /// The key's width or height is zero.
    ZeroSize,
}

/// Read up to `len` bytes from guest memory starting at `addr` into a
/// freshly-allocated buffer.
///
/// Bytes are fetched one at a time through `MemoryAccess::read_u8`. The
/// read stops at the top of the 32-bit guest address space: if
/// `addr + len` would run past `0xFFFF_FFFF`, the returned buffer is
/// shorter than `len` and contains only the bytes below the wrap point.
/// Addresses are never wrapped around to 0. Callers detect a truncated
/// read by comparing the returned length with `len`.
pub fn read_guest_bytes(
    mem: &dyn xenia_memory::MemoryAccess,
    addr: u32,
    len: usize,
) -> Vec<u8> {
    let mut out = Vec::with_capacity(len);
    for i in 0..len {
        // Stop *before* touching memory once the address no longer fits in
        // 32 bits, so no wrapped byte ever lands in the output.
        let Some(a) = u32::try_from(i).ok().and_then(|off| addr.checked_add(off)) else {
            break;
        };
        out.push(mem.read_u8(a));
    }
    out
}

/// Rewrite `buf` in place, applying `endian` to each complete 4-byte unit.
///
/// `Endian::None` leaves the buffer untouched. When `buf.len()` is not a
/// multiple of 4, the trailing 1–3 bytes are not modified.
pub fn apply_endian_32(buf: &mut [u8], endian: Endian) {
    if endian == Endian::None {
        return;
    }
    for dword in buf.chunks_exact_mut(4) {
        let value = u32::from_le_bytes([dword[0], dword[1], dword[2], dword[3]]);
        dword.copy_from_slice(&endian.swap32(value).to_le_bytes());
    }
}

/// Decode a k_8_8_8_8 texture out of guest memory into `Rgba8Unorm` bytes.
///
/// Steps: read `aligned_pitch × height × 4` bytes from `key.base_address`,
/// apply the declared endian swap per 32-bit unit, detile (or, for linear
/// textures, copy row by row honoring the pitch), then swap bytes 0 and 2
/// of every texel so the result is in RGBA order.
///
/// # Errors
/// * [`DecodeError::ZeroSize`] — width or height is 0.
/// * [`DecodeError::OutOfBounds`] — the source span does not fit in the
///   32-bit guest address space, the guest read came back short, the
///   detile step failed, or (linear textures) a row is wider than the
///   aligned pitch.
pub fn decode_k8888_tiled(
    key: &TextureKey,
    mem: &dyn xenia_memory::MemoryAccess,
) -> Result<Vec<u8>, DecodeError> {
    const BYTES_PER_TEXEL: usize = 4;

    if key.width == 0 || key.height == 0 {
        return Err(DecodeError::ZeroSize);
    }
    let w = key.width as u32;
    let h = key.height as u32;
    let pitch_aligned = tiled_address::align_pitch_to_macro_tile(key.pitch_texels as u32);

    // Size the source span in 64 bits: `pitch * height * 4` can exceed
    // `u32` for hand-built keys. A span that would extend past the top of
    // the 32-bit guest address space can never be read in full, so reject
    // it up front instead of allocating for it.
    // NOTE(review): the span uses the unaligned height even for tiled
    // textures; confirm against `tiled_address::detile_2d` whether tiled
    // addressing can reach rows up to the next macro-tile boundary.
    let span = u64::from(pitch_aligned) * u64::from(h) * BYTES_PER_TEXEL as u64;
    if u64::from(key.base_address) + span > 1u64 << 32 {
        return Err(DecodeError::OutOfBounds);
    }
    let total_bytes = usize::try_from(span).map_err(|_| DecodeError::OutOfBounds)?;

    let mut raw = read_guest_bytes(mem, key.base_address, total_bytes);
    if raw.len() < total_bytes {
        return Err(DecodeError::OutOfBounds);
    }
    apply_endian_32(&mut raw, key.endian);

    let row_bytes = w as usize * BYTES_PER_TEXEL;
    let linear_len = row_bytes
        .checked_mul(h as usize)
        .ok_or(DecodeError::OutOfBounds)?;
    let mut linear = vec![0u8; linear_len];
    if key.tiled {
        if tiled_address::detile_2d(&raw, &mut linear, w, h, pitch_aligned, 4).is_err() {
            return Err(DecodeError::OutOfBounds);
        }
    } else {
        // A row wider than the pitch would read past the end of its source
        // row (and past `raw` on the last row); report it instead of
        // panicking on the slice.
        if w > pitch_aligned {
            return Err(DecodeError::OutOfBounds);
        }
        // Non-tiled copy row-by-row honoring pitch.
        let pitch_bytes = pitch_aligned as usize * BYTES_PER_TEXEL;
        for (dst_row, src_row) in linear
            .chunks_exact_mut(row_bytes)
            .zip(raw.chunks_exact(pitch_bytes))
        {
            dst_row.copy_from_slice(&src_row[..row_bytes]);
        }
    }
    // Xenos stores `k_8_8_8_8` in ARGB byte order (high byte = A). After
    // endian.Swap8In32 guests' typical per-dword byte order becomes BGRA
    // in little-endian host bytes. Swap B↔R so we hand Rgba8Unorm to wgpu.
    for px in linear.chunks_exact_mut(BYTES_PER_TEXEL) {
        px.swap(0, 2);
    }
    Ok(linear)
}

/// Decode a DXT-compressed texture to raw block bytes (no format
/// conversion — wgpu understands `Bc{1,2,3}RgbaUnorm` natively so the
/// GPU does the actual decompression on upload).
///
/// Xenos stores DXT blocks in 4×4 block-tiled order using the Tiled2D
/// formula, with stride counted in blocks. `bytes_per_block` is 8 for
/// BC1 (DXT1), 16 for BC2 (DXT2_3) and BC3 (DXT4_5).
///
/// The output holds `ceil(width/4) × ceil(height/4)` blocks, tightly
/// packed in row-major block order.
///
/// # Errors
/// * [`DecodeError::ZeroSize`] — width or height is 0.
/// * [`DecodeError::OutOfBounds`] — the source span does not fit in the
///   32-bit guest address space, the guest read came back short, the
///   detile step failed, or (linear textures) a block row is wider than
///   the aligned block pitch.
pub fn decode_dxt_tiled(
    key: &TextureKey,
    mem: &dyn xenia_memory::MemoryAccess,
    bytes_per_block: u32,
) -> Result<Vec<u8>, DecodeError> {
    if key.width == 0 || key.height == 0 {
        return Err(DecodeError::ZeroSize);
    }
    let block_w = 4u32;
    let block_h = 4u32;
    let w_blocks = (key.width as u32).div_ceil(block_w);
    let h_blocks = (key.height as u32).div_ceil(block_h);
    let pitch_blocks = tiled_address::align_pitch_to_macro_tile(
        (key.pitch_texels as u32).div_ceil(block_w),
    );

    // Size the source span with checked 64-bit math: `bytes_per_block` is
    // caller-supplied, so the three-way product is not bounded by `u32`.
    // A span reaching past the top of the 32-bit guest address space can
    // never be read in full, so reject it before allocating.
    let span = u64::from(pitch_blocks)
        .checked_mul(u64::from(h_blocks))
        .and_then(|n| n.checked_mul(u64::from(bytes_per_block)))
        .ok_or(DecodeError::OutOfBounds)?;
    if u64::from(key.base_address) + span > 1u64 << 32 {
        return Err(DecodeError::OutOfBounds);
    }
    let total_bytes = usize::try_from(span).map_err(|_| DecodeError::OutOfBounds)?;

    let mut raw = read_guest_bytes(mem, key.base_address, total_bytes);
    if raw.len() < total_bytes {
        return Err(DecodeError::OutOfBounds);
    }
    // DXT blocks are stored as 4×u16 + 4×u8-indices (BC1) or similar
    // u16/u32-width fields for BC2/BC3; the Xbox 360's big-endian word
    // order requires an endian swap at the u16/u32 level regardless of
    // which BC-family format.
    apply_endian_32(&mut raw, key.endian);

    let block_bytes = bytes_per_block as usize;
    let row_bytes = (w_blocks as usize)
        .checked_mul(block_bytes)
        .ok_or(DecodeError::OutOfBounds)?;
    let out_len = row_bytes
        .checked_mul(h_blocks as usize)
        .ok_or(DecodeError::OutOfBounds)?;
    let mut out = vec![0u8; out_len];
    if key.tiled {
        if tiled_address::detile_2d(
            &raw,
            &mut out,
            w_blocks,
            h_blocks,
            pitch_blocks,
            bytes_per_block,
        )
        .is_err()
        {
            return Err(DecodeError::OutOfBounds);
        }
    } else {
        // A block row wider than the pitch would read past the end of its
        // source row (and past `raw` on the last row); report it instead
        // of panicking on the slice.
        if w_blocks > pitch_blocks {
            return Err(DecodeError::OutOfBounds);
        }
        // Linear layout: copy the used part of every pitch-sized row.
        let pitch_bytes = pitch_blocks as usize * block_bytes;
        for y in 0..h_blocks as usize {
            let src = y * pitch_bytes;
            let dst = y * row_bytes;
            out[dst..dst + row_bytes].copy_from_slice(&raw[src..src + row_bytes]);
        }
    }
    Ok(out)
}

/// BC1 / DXT1 — 8-byte blocks.
///
/// Thin wrapper: forwards `key` and `mem` to [`decode_dxt_tiled`] with
/// `bytes_per_block = 8` and returns its result unchanged.
pub fn decode_dxt1_tiled(
    key: &TextureKey,
    mem: &dyn xenia_memory::MemoryAccess,
) -> Result<Vec<u8>, DecodeError> {
    decode_dxt_tiled(key, mem, 8)
}

/// BC2 / DXT2_3 — 16-byte blocks.
///
/// Thin wrapper: forwards `key` and `mem` to [`decode_dxt_tiled`] with
/// `bytes_per_block = 16` and returns its result unchanged.
pub fn decode_dxt23_tiled(
    key: &TextureKey,
    mem: &dyn xenia_memory::MemoryAccess,
) -> Result<Vec<u8>, DecodeError> {
    decode_dxt_tiled(key, mem, 16)
}

/// BC3 / DXT4_5 — 16-byte blocks.
///
/// Thin wrapper: forwards `key` and `mem` to [`decode_dxt_tiled`] with
/// `bytes_per_block = 16` and returns its result unchanged. The body is
/// the same call `decode_dxt23_tiled` makes; the two formats differ only
/// in how the host interprets the block contents.
pub fn decode_dxt45_tiled(
    key: &TextureKey,
    mem: &dyn xenia_memory::MemoryAccess,
) -> Result<Vec<u8>, DecodeError> {
    decode_dxt_tiled(key, mem, 16)
}

/// **k_5_6_5** — 16-bit R:5 G:6 B:5 per texel (Xbox stores R in the high
/// 5 bits of the 16-bit word). We unpack each texel into 4 bytes of
/// `Rgba8Unorm` (A = 0xFF). wgpu doesn't ship `R5G6B5Unorm` as a
/// sampled texture format on every backend, so CPU-side conversion is
/// the safe path even if it's 2× the texture memory.
///
/// Tiling: Tiled2D at the **texel** level (block = 1 texel = 2 bytes),
/// then we expand each 2-byte u16 into the 4-byte Rgba8 in the linear
/// output buffer.
///
/// # Errors
/// * [`DecodeError::ZeroSize`] — width or height is 0.
/// * [`DecodeError::OutOfBounds`] — the source span does not fit in the
///   32-bit guest address space, the guest read came back short, the
///   detile step failed, or (linear textures) a row is wider than the
///   aligned pitch.
pub fn decode_k565_tiled(
    key: &TextureKey,
    mem: &dyn xenia_memory::MemoryAccess,
) -> Result<Vec<u8>, DecodeError> {
    const BYTES_PER_TEXEL: usize = 2;

    if key.width == 0 || key.height == 0 {
        return Err(DecodeError::ZeroSize);
    }
    let w = key.width as u32;
    let h = key.height as u32;
    // Pitch/block counts — block = 1 texel here, 2 bytes.
    let pitch_aligned = tiled_address::align_pitch_to_macro_tile(key.pitch_texels as u32);

    // Size the source span in 64 bits so hand-built keys cannot overflow
    // the product. A span reaching past the top of the 32-bit guest
    // address space can never be read in full, so reject it up front.
    let span = u64::from(pitch_aligned) * u64::from(h) * BYTES_PER_TEXEL as u64;
    if u64::from(key.base_address) + span > 1u64 << 32 {
        return Err(DecodeError::OutOfBounds);
    }
    let total_bytes = usize::try_from(span).map_err(|_| DecodeError::OutOfBounds)?;

    let mut raw = read_guest_bytes(mem, key.base_address, total_bytes);
    if raw.len() < total_bytes {
        return Err(DecodeError::OutOfBounds);
    }
    // 16-bit word order is endian-swap-sensitive.
    apply_endian_32(&mut raw, key.endian);

    // Step 1: detile (bytes_per_block=2, tile in blocks=texels).
    let row_bytes = w as usize * BYTES_PER_TEXEL;
    let linear_len = row_bytes
        .checked_mul(h as usize)
        .ok_or(DecodeError::OutOfBounds)?;
    let rgba_len = linear_len.checked_mul(2).ok_or(DecodeError::OutOfBounds)?;
    let mut linear_u16 = vec![0u8; linear_len];
    if key.tiled {
        if tiled_address::detile_2d(&raw, &mut linear_u16, w, h, pitch_aligned, 2).is_err() {
            return Err(DecodeError::OutOfBounds);
        }
    } else {
        // A row wider than the pitch would read past the end of its source
        // row (and past `raw` on the last row); report it instead of
        // panicking on the slice.
        if w > pitch_aligned {
            return Err(DecodeError::OutOfBounds);
        }
        let pitch_bytes = pitch_aligned as usize * BYTES_PER_TEXEL;
        for (dst_row, src_row) in linear_u16
            .chunks_exact_mut(row_bytes)
            .zip(raw.chunks_exact(pitch_bytes))
        {
            dst_row.copy_from_slice(&src_row[..row_bytes]);
        }
    }

    // Step 2: expand each 16-bit RGB565 to Rgba8Unorm. Each texel is read
    // as a little-endian u16 from the (already endian-swapped) buffer.
    let mut rgba = Vec::with_capacity(rgba_len);
    for texel in linear_u16.chunks_exact(BYTES_PER_TEXEL) {
        let word = u16::from_le_bytes([texel[0], texel[1]]);
        // 5 bits R (bits 11-15), 6 bits G (5-10), 5 bits B (0-4).
        let r5 = ((word >> 11) & 0x1F) as u8;
        let g6 = ((word >> 5) & 0x3F) as u8;
        let b5 = (word & 0x1F) as u8;
        // Expand to full-range u8: replicate high bits into low
        // (so 0b11111 → 0xFF, matching the standard 565→888 convention).
        let r = (r5 << 3) | (r5 >> 2);
        let g = (g6 << 2) | (g6 >> 4);
        let b = (b5 << 3) | (b5 >> 2);
        rgba.extend_from_slice(&[r, g, b, 0xFF]);
    }
    Ok(rgba)
}

/// Version-aware CPU-side texture cache. Entries are keyed on the full
/// [`TextureKey`] value and carry a `version_when_uploaded` watermark.
/// `ensure_cached` does not query guest memory for versions itself: the
/// caller passes the current page version covering the texture's byte
/// span, and an entry whose watermark is older than that value is
/// re-decoded.
pub struct TextureCache {
    // Decoded textures, one per distinct key.
    entries: HashMap<TextureKey, CachedTexture>,
    /// Monotonic counter of decodes performed — HUD surface. Incremented
    /// only after a decode succeeds.
    pub decodes_total: u64,
    /// Count of stale-miss re-decodes. Incremented when a stale entry is
    /// found, before the re-decode is attempted.
    pub restale_total: u64,
}

impl Default for TextureCache {
    /// Same as [`TextureCache::new`]: an empty cache with zeroed counters.
    fn default() -> Self {
        Self::new()
    }
}

impl TextureCache {
    pub fn new() -> Self {
        Self {
            entries: HashMap::new(),
            decodes_total: 0,
            restale_total: 0,
        }
    }

    pub fn len(&self) -> usize {
        self.entries.len()
    }

    pub fn is_empty(&self) -> bool {
        self.entries.is_empty()
    }

    pub fn get(&self, key: &TextureKey) -> Option<&CachedTexture> {
        self.entries.get(key)
    }

    /// Return a cached (or freshly-decoded) texture. The caller supplies
    /// the current guest-memory page version covering the texture span;
    /// see [`max_page_version_for`].
    pub fn ensure_cached(
        &mut self,
        key: TextureKey,
        current_version: u64,
        mem: &dyn xenia_memory::MemoryAccess,
    ) -> Result<&CachedTexture, DecodeError> {
        // Fast path: fresh entry exists.
        if let Some(e) = self.entries.get(&key) {
            if e.version_when_uploaded >= current_version {
                return Ok(self.entries.get(&key).unwrap());
            }
            self.restale_total += 1;
        }
        let bytes = match key.format {
            TextureFormat::K8888 => decode_k8888_tiled(&key, mem)?,
            TextureFormat::K565 => decode_k565_tiled(&key, mem)?,
            TextureFormat::Dxt1 => decode_dxt1_tiled(&key, mem)?,
            TextureFormat::Dxt2_3 => decode_dxt23_tiled(&key, mem)?,
            TextureFormat::Dxt4_5 => decode_dxt45_tiled(&key, mem)?,
            _ => return Err(DecodeError::UnsupportedFormat),
        };
        self.decodes_total += 1;
        let entry = CachedTexture {
            key,
            version_when_uploaded: current_version,
            bytes,
        };
        self.entries.insert(key, entry);
        Ok(self.entries.get(&key).unwrap())
    }

    pub fn byte_budget(&self) -> usize {
        self.entries.values().map(|e| e.byte_size()).sum()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    use std::cell::Cell;

    /// In-memory stand-in for guest memory. Bytes live in `Cell`s so the
    /// `&self` write methods of `MemoryAccess` can mutate them. Reads
    /// outside the buffer return 0; writes outside it are dropped.
    /// Multi-byte accessors are big-endian.
    struct FakeMem(Box<[Cell<u8>]>);
    impl FakeMem {
        fn from_vec(v: Vec<u8>) -> Self {
            FakeMem(v.into_iter().map(Cell::new).collect())
        }
    }
    impl xenia_memory::MemoryAccess for FakeMem {
        fn read_u8(&self, a: u32) -> u8 {
            self.0.get(a as usize).map(|c| c.get()).unwrap_or(0)
        }
        fn read_u16(&self, a: u32) -> u16 {
            u16::from_be_bytes([self.read_u8(a), self.read_u8(a + 1)])
        }
        fn read_u32(&self, a: u32) -> u32 {
            u32::from_be_bytes([
                self.read_u8(a),
                self.read_u8(a + 1),
                self.read_u8(a + 2),
                self.read_u8(a + 3),
            ])
        }
        fn read_u64(&self, a: u32) -> u64 {
            u64::from_be_bytes([
                self.read_u8(a),
                self.read_u8(a + 1),
                self.read_u8(a + 2),
                self.read_u8(a + 3),
                self.read_u8(a + 4),
                self.read_u8(a + 5),
                self.read_u8(a + 6),
                self.read_u8(a + 7),
            ])
        }
        fn write_u8(&self, a: u32, v: u8) {
            if let Some(slot) = self.0.get(a as usize) {
                slot.set(v);
            }
        }
        fn write_u16(&self, a: u32, v: u16) {
            let b = v.to_be_bytes();
            self.write_u8(a, b[0]);
            self.write_u8(a + 1, b[1]);
        }
        fn write_u32(&self, a: u32, v: u32) {
            for (i, byte) in v.to_be_bytes().iter().enumerate() {
                self.write_u8(a + i as u32, *byte);
            }
        }
        fn write_u64(&self, a: u32, v: u64) {
            for (i, byte) in v.to_be_bytes().iter().enumerate() {
                self.write_u8(a + i as u32, *byte);
            }
        }
        fn translate(&self, _: u32) -> Option<*const u8> {
            None
        }
        fn translate_mut(&self, _: u32) -> Option<*mut u8> {
            None
        }
    }

    #[test]
    fn format_block_info_matches_canary_expectations() {
        assert_eq!(
            TextureFormat::K8888.block_info(),
            BlockInfo::new(1, 1, 4)
        );
        assert_eq!(TextureFormat::Dxt1.block_info(), BlockInfo::new(4, 4, 8));
        assert_eq!(
            TextureFormat::Dxt4_5.block_info(),
            BlockInfo::new(4, 4, 16)
        );
    }

    #[test]
    fn endian_swap_variants() {
        assert_eq!(Endian::None.swap32(0x11223344), 0x11223344);
        assert_eq!(Endian::Swap8In16.swap32(0x11223344), 0x22114433);
        assert_eq!(Endian::Swap8In32.swap32(0x11223344), 0x44332211);
        assert_eq!(Endian::Swap16In32.swap32(0x11223344), 0x33441122);
    }

    #[test]
    fn decode_fetch_constant_rejects_empty() {
        let z = [0u32; 6];
        assert!(decode_fetch_constant(z).is_none());
    }

    #[test]
    fn decode_fetch_constant_parses_2d_k8888() {
        // Build a synthetic k_8_8_8_8 2D texture fetch constant:
        //   dword0: pitch_5=40 (1280/32), tiled=1, type=2
        //   dword1: format=6 (K8888), endian=2 (Swap8In32), base=0xAB000>>12
        //   dword2: width-1=1279, height-1=719
        //   dword5: dimension=1 (2D)
        let d0 = 0x8000_0000 | (40u32 << 22) | 2;
        let d1 = (0xAB000u32 >> 12 << 12) | (2u32 << 6) | 6u32;
        let d2 = 1279u32 | ((719u32) << 13);
        let d5 = 1u32 << 9;
        let k = decode_fetch_constant([d0, d1, d2, 0, 0, d5]).expect("parsed");
        assert_eq!(k.format, TextureFormat::K8888);
        assert_eq!(k.endian, Endian::Swap8In32);
        assert_eq!(k.base_address, 0xAB000);
        assert_eq!(k.width, 1280);
        assert_eq!(k.height, 720);
        assert_eq!(k.dimension, Dimension::D2);
        assert!(k.tiled);
        assert_eq!(k.pitch_texels, 1280);
    }

    #[test]
    fn decode_k8888_roundtrip_linear() {
        // Build a 4×4 non-tiled image with pitch=32 (one macro-tile row).
        // Each pixel at (x, y) stores ARGB = (0xFF, x, y, y*4+x) as a
        // big-endian dword. After Swap8In32 + B↔R swizzle, out[off..] must
        // be (x, y, y*4+x, 0xFF) in RGBA order.
        let w = 4u32;
        let h = 4u32;
        let pitch = 32u32;
        let mut bytes = vec![0u8; (pitch * h * 4) as usize];
        for y in 0..h {
            for x in 0..w {
                let off = ((y * pitch + x) * 4) as usize;
                let argb = (0xFFu32 << 24) | (x << 16) | (y << 8) | (y * 4 + x);
                bytes[off..off + 4].copy_from_slice(&argb.to_be_bytes());
            }
        }
        let mem = FakeMem::from_vec(bytes);
        let key = TextureKey {
            base_address: 0,
            width: 4,
            height: 4,
            depth_or_slices: 1,
            format: TextureFormat::K8888,
            endian: Endian::Swap8In32,
            dimension: Dimension::D2,
            tiled: false,
            pitch_texels: pitch as u16,
        };
        let out = decode_k8888_tiled(&key, &mem).expect("decode");
        assert_eq!(out.len(), 16 * 4);
        assert_eq!(&out[0..4], &[0, 0, 0, 0xFF]);
        let off = (3 * 4 + 3) * 4;
        assert_eq!(&out[off..off + 4], &[3, 3, 15, 0xFF]);
    }

    // ── First-Pixels M5 format tests ──────────────────────────────

    /// BC2 (DXT2_3) roundtrip: 16-byte blocks, 4×4 image = 1 block.
    /// The first source block holds the bytes 0..16; assert the decoder
    /// returns exactly those bytes (passthrough, no endian swap).
    #[test]
    fn decode_dxt23_small_roundtrip() {
        // 4×4 texture = 1 BC2 block (16 bytes) with pitch_texels=32, i.e.
        // 8 blocks per row before macro-tile alignment. 128 bytes of
        // source covers 8 blocks; if the aligned pitch is larger, FakeMem
        // supplies zeros for the out-of-range tail, which this test never
        // looks at.
        let mut bytes = vec![0u8; 128];
        for (i, b) in bytes.iter_mut().enumerate().take(16) {
            *b = i as u8;
        }
        let mem = FakeMem::from_vec(bytes);
        let key = TextureKey {
            base_address: 0,
            width: 4,
            height: 4,
            depth_or_slices: 1,
            format: TextureFormat::Dxt2_3,
            endian: Endian::None, // no swap — we can eyeball passthrough
            dimension: Dimension::D2,
            tiled: false,
            pitch_texels: 32,
        };
        let out = decode_dxt23_tiled(&key, &mem).expect("decode");
        assert_eq!(out.len(), 16); // 1 block × 16 bytes
        for (i, &byte) in out.iter().enumerate() {
            assert_eq!(byte, i as u8);
        }
    }

    /// BC3 (DXT4_5) uses the same 16-byte block infra as BC2; a
    /// parallel test prevents a regression that sneaks up via the
    /// generic `decode_dxt_tiled`.
    #[test]
    fn decode_dxt45_uses_16byte_blocks() {
        let mem = FakeMem::from_vec(vec![0xAAu8; 256]);
        let key = TextureKey {
            base_address: 0,
            width: 8,
            height: 4, // 2×1 blocks
            depth_or_slices: 1,
            format: TextureFormat::Dxt4_5,
            endian: Endian::None,
            dimension: Dimension::D2,
            tiled: false,
            pitch_texels: 32,
        };
        let out = decode_dxt45_tiled(&key, &mem).expect("decode");
        assert_eq!(out.len(), 2 * 16);
    }

    /// k_5_6_5: a single white texel (all bits set, 0xFFFF) should
    /// expand to RGBA8 white (0xFF, 0xFF, 0xFF, 0xFF). A single pure-red
    /// texel (R=31, G=0, B=0 → word 0xF800) should expand to R=255 G=0
    /// B=0 via the high-bit-replicate convention.
    #[test]
    fn decode_k565_texel_expansion() {
        // Memory layout for a 2×1 non-tiled k_5_6_5 image (pitch=32 texels
        // → 32 × 1 × 2 = 64 bytes). We store texel[0] = 0xFFFF (white),
        // texel[1] = 0xF800 (pure red).
        let mut bytes = vec![0u8; 64];
        // 0xFFFF
        bytes[0] = 0xFF;
        bytes[1] = 0xFF;
        // 0xF800: with Endian::None no swap is applied and the decoder
        // reads each texel as a little-endian u16, so the bytes are laid
        // out low byte first.
        bytes[2] = 0x00;
        bytes[3] = 0xF8;
        let mem = FakeMem::from_vec(bytes);
        let key = TextureKey {
            base_address: 0,
            width: 2,
            height: 1,
            depth_or_slices: 1,
            format: TextureFormat::K565,
            endian: Endian::None,
            dimension: Dimension::D2,
            tiled: false,
            pitch_texels: 32,
        };
        let out = decode_k565_tiled(&key, &mem).expect("decode");
        assert_eq!(out.len(), 2 * 4);
        // Texel 0: white.
        assert_eq!(&out[0..4], &[0xFF, 0xFF, 0xFF, 0xFF]);
        // Texel 1: pure red via 5-bit-expand (0b11111 → 0xFF).
        assert_eq!(&out[4..8], &[0xFF, 0x00, 0x00, 0xFF]);
    }

    #[test]
    fn is_host_supported_covers_m5_formats() {
        assert!(TextureFormat::K8888.is_host_supported());
        assert!(TextureFormat::K565.is_host_supported());
        assert!(TextureFormat::Dxt1.is_host_supported());
        assert!(TextureFormat::Dxt2_3.is_host_supported());
        assert!(TextureFormat::Dxt4_5.is_host_supported());
        // Unsupported formats should still report false.
        assert!(!TextureFormat::K16.is_host_supported());
        assert!(!TextureFormat::K32Float.is_host_supported());
    }

    #[test]
    fn texture_cache_caches_and_reuses() {
        let mut cache = TextureCache::new();
        let mem = FakeMem::from_vec(vec![0u8; 8 * 1024]);
        let key = TextureKey {
            base_address: 0,
            width: 4,
            height: 4,
            depth_or_slices: 1,
            format: TextureFormat::K8888,
            endian: Endian::None,
            dimension: Dimension::D2,
            tiled: false,
            pitch_texels: 32,
        };
        cache.ensure_cached(key, 0, &mem).unwrap();
        assert_eq!(cache.decodes_total, 1);
        // Same version: should hit cache.
        cache.ensure_cached(key, 0, &mem).unwrap();
        assert_eq!(cache.decodes_total, 1);
        // Higher version: stale → re-decode.
        cache.ensure_cached(key, 1, &mem).unwrap();
        assert_eq!(cache.decodes_total, 2);
        assert_eq!(cache.restale_total, 1);
    }

    /// End-to-end P5 test: a 6-dword fetch constant → decoded `TextureKey`
    /// → `ensure_cached` on fresh/version-bumped memory → stale re-decode.
    /// Mirrors what `vd_swap` does per frame.
    #[test]
    fn e2e_fetch_const_to_cache_with_versioning() {
        // 4×4 k_8_8_8_8 2D tiled texture, pitch=32 aligned. The base
        // address field is 4 KiB-granular, so the texture sits at guest
        // address 0 (base field = 0).
        let d0 = 0x8000_0000u32 | (1u32 << 22) | 2; // pitch_5=1, tiled, type=2
        let d1 = 6u32; // format=K8888, endian=none, base field=0
        let d2 = 3u32 | (3u32 << 13); // width-1=3, height-1=3
        let d5 = 1u32 << 9; // 2D
        let key = decode_fetch_constant([d0, d1, d2, 0, 0, d5]).expect("decoded");
        assert_eq!(key.format, TextureFormat::K8888);
        assert_eq!(key.base_address, 0);
        assert_eq!(key.width, 4);

        // FakeMem mutates through `Cell`, so the binding itself stays
        // immutable.
        let mem = FakeMem::from_vec(vec![0xAAu8; 4 * 1024]);
        let mut cache = TextureCache::new();
        // v0 decode.
        let first = cache
            .ensure_cached(key, 0, &mem)
            .expect("initial decode")
            .clone();
        // Same version → cache hit.
        cache.ensure_cached(key, 0, &mem).expect("hit");
        assert_eq!(cache.decodes_total, 1);
        // Simulate the guest writing to the texture's pages: version bumps.
        for b in &mem.0[..16] {
            b.set(0xFF);
        }
        cache.ensure_cached(key, 1, &mem).expect("re-decode");
        assert_eq!(cache.decodes_total, 2);
        assert_eq!(cache.restale_total, 1);
        // Bytes differ from v0 (proof the re-decode happened).
        let second = cache.get(&key).unwrap();
        assert_ne!(first.bytes, second.bytes);
    }

    #[test]
    fn texture_cache_rejects_unsupported_format() {
        let mut cache = TextureCache::new();
        let mem = FakeMem::from_vec(vec![0u8; 1024]);
        let key = TextureKey {
            base_address: 0,
            width: 4,
            height: 4,
            depth_or_slices: 1,
            format: TextureFormat::K16,
            endian: Endian::None,
            dimension: Dimension::D2,
            tiled: false,
            pitch_texels: 32,
        };
        assert!(matches!(
            cache.ensure_cached(key, 0, &mem),
            Err(DecodeError::UnsupportedFormat)
        ));
    }
}