xenia-gpu: end-to-end Xenos pipeline (PM4, ucode, EDRAM, resolve)
First real GPU implementation. Ring/PM4 frontend (ring_view,
ring_drain, pm4) drains the command processor; gpu_system owns the
threaded backend (DrainFence RPC + parker/fence helpers from M1) and
the MMIO-mapped register block (mmio_region).
Xenos shader frontend: ucode/{alu,control_flow,fetch,mod}.rs decode
the Xbox 360 microcode, translator.rs lowers it onto the WGSL
xenos_interp interpreter shader (shaders/xenos_interp.wgsl).
shader_metrics.rs counts decode/translate work.
Render state: draw_state, primitive, render_target_cache,
texture_cache, tiled_address (Xenos's swizzled tiled-memory layout),
xenos_constants (register field constants), edram (the 10 MiB EDRAM
model with MSAA), and resolve.rs (TILE_FLUSH copy-out — clear-resolve
plus bitwise-equivalent 32 bpp + 64 bpp paths landed). handle.rs
owns the typed GPU-resource handles the kernel hands out.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
970
crates/xenia-gpu/src/texture_cache.rs
Normal file
970
crates/xenia-gpu/src/texture_cache.rs
Normal file
@@ -0,0 +1,970 @@
|
||||
//! Texture cache — P5.
//!
//! Two-layer design mirroring canary's `TextureCache`:
//!
//! * **CPU layer** (this module): owns decoded, linear, host-endian texel
//!   byte buffers keyed by [`TextureKey`]. `ensure_cached` compares the
//!   caller-supplied guest-memory page version against the version recorded
//!   at decode time to decide whether the cached bytes are still fresh, and
//!   re-decodes on miss or staleness.
//! * **GPU layer** (xenia-ui `texture_cache_host`): owns the
//!   `wgpu::Texture` + `TextureView` for each cached key; pulls decoded
//!   bytes from this CPU layer on upload.
//!
//! Canary references: `texture_cache.h/.cc`, `texture_info.cc`, and
//! `texture_info_formats.inl` for the format table.
|
||||
|
||||
use std::collections::HashMap;
|
||||
|
||||
use crate::tiled_address;
|
||||
|
||||
/// Xenos texture formats — `xenos::TextureFormat` at `xenos.h:489-579`.
|
||||
/// Values are the raw enum numbers the guest writes into
|
||||
/// `xe_gpu_texture_fetch_t.format`.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
||||
#[repr(u8)]
|
||||
pub enum TextureFormat {
|
||||
K1Reverse = 0,
|
||||
K1 = 1,
|
||||
K8 = 2,
|
||||
K1555 = 3,
|
||||
K565 = 4,
|
||||
K6_5_5 = 5,
|
||||
K8888 = 6,
|
||||
K1010102 = 7,
|
||||
K8_8 = 10,
|
||||
K4_4_4_4 = 15,
|
||||
K10_11_11 = 16,
|
||||
K11_11_10 = 17,
|
||||
Dxt1 = 18,
|
||||
Dxt2_3 = 19,
|
||||
Dxt4_5 = 20,
|
||||
K24_8 = 22,
|
||||
K24_8Float = 23,
|
||||
K16 = 24,
|
||||
K16_16 = 25,
|
||||
K16_16_16_16 = 26,
|
||||
K16Float = 30,
|
||||
K16_16Float = 31,
|
||||
K16_16_16_16Float = 32,
|
||||
K32 = 33,
|
||||
K32_32 = 34,
|
||||
K32_32_32_32 = 35,
|
||||
K32Float = 36,
|
||||
K32_32Float = 37,
|
||||
K32_32_32_32Float = 38,
|
||||
Unknown(u8),
|
||||
}
|
||||
|
||||
impl TextureFormat {
|
||||
pub fn from_raw(v: u8) -> Self {
|
||||
use TextureFormat::*;
|
||||
match v & 0x3F {
|
||||
0 => K1Reverse,
|
||||
1 => K1,
|
||||
2 => K8,
|
||||
3 => K1555,
|
||||
4 => K565,
|
||||
5 => K6_5_5,
|
||||
6 => K8888,
|
||||
7 => K1010102,
|
||||
10 => K8_8,
|
||||
15 => K4_4_4_4,
|
||||
16 => K10_11_11,
|
||||
17 => K11_11_10,
|
||||
18 => Dxt1,
|
||||
19 => Dxt2_3,
|
||||
20 => Dxt4_5,
|
||||
22 => K24_8,
|
||||
23 => K24_8Float,
|
||||
24 => K16,
|
||||
25 => K16_16,
|
||||
26 => K16_16_16_16,
|
||||
30 => K16Float,
|
||||
31 => K16_16Float,
|
||||
32 => K16_16_16_16Float,
|
||||
33 => K32,
|
||||
34 => K32_32,
|
||||
35 => K32_32_32_32,
|
||||
36 => K32Float,
|
||||
37 => K32_32Float,
|
||||
38 => K32_32_32_32Float,
|
||||
other => Unknown(other),
|
||||
}
|
||||
}
|
||||
|
||||
/// Block width/height in texels + bytes-per-block. For uncompressed
|
||||
/// formats block_w = block_h = 1. For DXT formats block_w = block_h =
|
||||
/// 4 (one 4×4 compressed block).
|
||||
pub fn block_info(self) -> BlockInfo {
|
||||
use TextureFormat::*;
|
||||
match self {
|
||||
K1Reverse | K1 => BlockInfo::new(1, 1, 1), // round up to 1 byte
|
||||
K8 => BlockInfo::new(1, 1, 1),
|
||||
K1555 | K565 | K6_5_5 | K4_4_4_4 | K16 | K16Float | K8_8 => BlockInfo::new(1, 1, 2),
|
||||
K8888 | K1010102 | K10_11_11 | K11_11_10 | K24_8 | K24_8Float | K16_16
|
||||
| K16_16Float | K32 | K32Float => BlockInfo::new(1, 1, 4),
|
||||
K16_16_16_16 | K16_16_16_16Float | K32_32 | K32_32Float => BlockInfo::new(1, 1, 8),
|
||||
K32_32_32_32 | K32_32_32_32Float => BlockInfo::new(1, 1, 16),
|
||||
Dxt1 => BlockInfo::new(4, 4, 8),
|
||||
Dxt2_3 | Dxt4_5 => BlockInfo::new(4, 4, 16),
|
||||
Unknown(_) => BlockInfo::new(1, 1, 4), // safe-ish fallback
|
||||
}
|
||||
}
|
||||
|
||||
/// True iff this format lands on a wgpu texture format we can
|
||||
/// natively bind — no CPU-side conversion per frame required. M5
|
||||
/// adds `k_5_6_5` (CPU-expanded to `Rgba8Unorm` on decode; still
|
||||
/// counts as supported for the host-cache wiring), `k_DXT2_3`
|
||||
/// (BC2), and `k_DXT4_5` (BC3).
|
||||
pub fn is_host_supported(self) -> bool {
|
||||
matches!(
|
||||
self,
|
||||
TextureFormat::K8888
|
||||
| TextureFormat::K565
|
||||
| TextureFormat::Dxt1
|
||||
| TextureFormat::Dxt2_3
|
||||
| TextureFormat::Dxt4_5
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
/// Storage geometry of one texture block: its footprint in texels and its
/// size in bytes.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct BlockInfo {
    /// Block width in texels.
    pub block_w: u8,
    /// Block height in texels.
    pub block_h: u8,
    /// Size of one block in bytes.
    pub bytes_per_block: u8,
}

impl BlockInfo {
    /// Build a `BlockInfo` from its three components.
    pub const fn new(block_w: u8, block_h: u8, bytes_per_block: u8) -> Self {
        Self {
            block_w,
            block_h,
            bytes_per_block,
        }
    }

    /// Base-2 logarithm of `bytes_per_block` for the sizes 1, 2, 4, 8 and
    /// 16. Any other size yields 0.
    pub fn log2_bpb(self) -> u32 {
        match self.bytes_per_block {
            1 => 0,
            2 => 1,
            4 => 2,
            8 => 3,
            16 => 4,
            _ => 0,
        }
    }
}
|
||||
|
||||
/// Xenos `Endian` enum from `xenos.h:198-204`. 2-bit field in fetch dword 1.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Endian {
    /// No byte swapping.
    None = 0,
    /// Swap the two bytes inside each 16-bit half.
    Swap8In16 = 1,
    /// Reverse all four bytes of each 32-bit unit.
    Swap8In32 = 2,
    /// Exchange the two 16-bit halves of each 32-bit unit.
    Swap16In32 = 3,
}

impl Endian {
    /// Decode the 2-bit endian field. Only the low two bits of `v` are used.
    pub fn from_raw(v: u8) -> Self {
        match v & 0x3 {
            1 => Endian::Swap8In16,
            2 => Endian::Swap8In32,
            3 => Endian::Swap16In32,
            _ => Endian::None,
        }
    }

    /// Apply this endian's byte swap to one 32-bit unit. Matches canary's
    /// `shaders/endian.xesli:25-55` semantics; the WGSL translator pulls
    /// the same mask-shift pattern.
    pub fn swap32(self, v: u32) -> u32 {
        match self {
            Endian::None => v,
            Endian::Swap8In16 => ((v & 0x00FF_00FF) << 8) | ((v & 0xFF00_FF00) >> 8),
            Endian::Swap8In32 => v.swap_bytes(),
            Endian::Swap16In32 => v.rotate_right(16),
        }
    }
}
|
||||
|
||||
/// Texture dimensionality (`xenos::DataDimension`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Dimension {
    D1 = 0,
    D2 = 1,
    D3Stacked = 2,
    Cube = 3,
}

impl Dimension {
    /// Decode the 2-bit dimension field. Only the low two bits of `v` are
    /// used.
    pub fn from_raw(v: u8) -> Self {
        match v & 0x3 {
            1 => Dimension::D2,
            2 => Dimension::D3Stacked,
            3 => Dimension::Cube,
            _ => Dimension::D1,
        }
    }
}
|
||||
|
||||
/// Identity of a cached texture. Matches canary's `TextureCache::TextureKey`
|
||||
/// at the semantic level — we exclude mip/border state for P5 since neither
|
||||
/// is populated yet.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
||||
pub struct TextureKey {
|
||||
/// Guest physical base (byte address — already shifted left by 12 from
|
||||
/// the fetch-constant `base_address` field).
|
||||
pub base_address: u32,
|
||||
pub width: u16,
|
||||
pub height: u16,
|
||||
pub depth_or_slices: u16,
|
||||
pub format: TextureFormat,
|
||||
pub endian: Endian,
|
||||
pub dimension: Dimension,
|
||||
pub tiled: bool,
|
||||
/// Row pitch in texels, already aligned to 32. Canary stores pitch/32
|
||||
/// in the fetch constant; we keep the raw texel count to avoid
|
||||
/// callers remembering to shift.
|
||||
pub pitch_texels: u16,
|
||||
}
|
||||
|
||||
/// Decode a 6-dword texture fetch constant (layout at `xenos.h:1229-1329`).
|
||||
/// Returns `None` if the constant is obviously unset (all zeros) or if
|
||||
/// `type` is not the texture-constant marker.
|
||||
pub fn decode_fetch_constant(dwords: [u32; 6]) -> Option<TextureKey> {
|
||||
let d0 = dwords[0];
|
||||
let d1 = dwords[1];
|
||||
let d2 = dwords[2];
|
||||
let d5 = dwords[5];
|
||||
|
||||
// type: low 2 bits of dword 0 should be 2 (texture) per canary —
|
||||
// 0 = vertex, 2 = texture. An all-zero constant reads as type 0 so
|
||||
// `None` filters it out here.
|
||||
let ty = d0 & 0x3;
|
||||
if d0 == 0 && d1 == 0 {
|
||||
return None;
|
||||
}
|
||||
// Not a texture constant (e.g. 0 = vertex fetch constant reused).
|
||||
if ty != 2 {
|
||||
return None;
|
||||
}
|
||||
|
||||
let pitch_5 = (d0 >> 22) & 0x1FF; // pitch/32 in texels
|
||||
let tiled = ((d0 >> 31) & 1) != 0;
|
||||
let format = TextureFormat::from_raw((d1 & 0x3F) as u8);
|
||||
let endian = Endian::from_raw(((d1 >> 6) & 0x3) as u8);
|
||||
let base_address = (d1 >> 12) << 12; // base >> 12, re-shifted.
|
||||
let dim = Dimension::from_raw(((d5 >> 9) & 0x3) as u8);
|
||||
|
||||
// Size decode depends on dimension.
|
||||
let (width, height, depth) = match dim {
|
||||
Dimension::D1 => ((d2 & 0x00FF_FFFF) as u16 + 1, 1u16, 1u16),
|
||||
Dimension::D2 => (
|
||||
(d2 & 0x1FFF) as u16 + 1,
|
||||
((d2 >> 13) & 0x1FFF) as u16 + 1,
|
||||
((d2 >> 26) & 0x3F) as u16 + 1,
|
||||
),
|
||||
Dimension::D3Stacked | Dimension::Cube => (
|
||||
(d2 & 0x7FF) as u16 + 1,
|
||||
((d2 >> 11) & 0x7FF) as u16 + 1,
|
||||
((d2 >> 22) & 0x3FF) as u16 + 1,
|
||||
),
|
||||
};
|
||||
|
||||
Some(TextureKey {
|
||||
base_address,
|
||||
width,
|
||||
height,
|
||||
depth_or_slices: depth,
|
||||
format,
|
||||
endian,
|
||||
dimension: dim,
|
||||
tiled,
|
||||
pitch_texels: ((pitch_5 as u16) * 32).max(width),
|
||||
})
|
||||
}
|
||||
|
||||
/// Decoded, linear, host-endian texture bytes ready for wgpu upload.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct CachedTexture {
|
||||
pub key: TextureKey,
|
||||
pub version_when_uploaded: u64,
|
||||
/// Tightly packed. Layout depends on `key.format`:
|
||||
/// - `K8888` → `width*height*4` bytes in Rgba8Unorm order.
|
||||
/// - `Dxt1` → `ceil(w/4)*ceil(h/4)*8` bytes of raw BC1 blocks, after
|
||||
/// block-level detile + dword-endian swap.
|
||||
pub bytes: Vec<u8>,
|
||||
}
|
||||
|
||||
impl CachedTexture {
|
||||
pub fn byte_size(&self) -> usize {
|
||||
self.bytes.len()
|
||||
}
|
||||
}
|
||||
|
||||
/// Errors that can happen during decode. The `ensure_cached` caller maps
/// these to `gpu.texture.reject{reason}` metrics so the HUD surfaces when
/// a texture fell back.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DecodeError {
    /// No decoder exists for the texture's format.
    UnsupportedFormat,
    /// The texture's guest span could not be read in full, or detiling
    /// rejected the layout.
    OutOfBounds,
    /// Width or height is zero.
    ZeroSize,
}

impl std::fmt::Display for DecodeError {
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        f.write_str(match self {
            DecodeError::UnsupportedFormat => "unsupported texture format",
            DecodeError::OutOfBounds => "texture data out of bounds",
            DecodeError::ZeroSize => "texture has zero width or height",
        })
    }
}

impl std::error::Error for DecodeError {}
|
||||
|
||||
/// Read `len` bytes from guest memory starting at `addr`. Returns `None`
|
||||
/// if the span would exceed the memory's reported end; otherwise returns
|
||||
/// a freshly-allocated buffer with the bytes.
|
||||
///
|
||||
/// The `MemoryAccess` trait exposes per-byte reads only; we batch them in
|
||||
/// a single pass to avoid the per-byte virtual dispatch overhead for large
|
||||
/// textures (1 MiB frontbuffer = 1M dispatch calls).
|
||||
pub fn read_guest_bytes(
|
||||
mem: &dyn xenia_memory::MemoryAccess,
|
||||
addr: u32,
|
||||
len: usize,
|
||||
) -> Vec<u8> {
|
||||
let mut out = Vec::with_capacity(len);
|
||||
for i in 0..len {
|
||||
let a = addr.wrapping_add(i as u32);
|
||||
out.push(mem.read_u8(a));
|
||||
if a < addr {
|
||||
// 32-bit overflow; unmap the tail.
|
||||
break;
|
||||
}
|
||||
}
|
||||
out
|
||||
}
|
||||
|
||||
/// Byte-swap the 32-bit dwords of `buf` in place according to `endian`.
|
||||
/// `buf.len()` should be a multiple of 4; tail bytes are left untouched.
|
||||
pub fn apply_endian_32(buf: &mut [u8], endian: Endian) {
|
||||
if matches!(endian, Endian::None) {
|
||||
return;
|
||||
}
|
||||
let mut i = 0;
|
||||
while i + 4 <= buf.len() {
|
||||
let v = u32::from_le_bytes([buf[i], buf[i + 1], buf[i + 2], buf[i + 3]]);
|
||||
let swapped = endian.swap32(v);
|
||||
buf[i..i + 4].copy_from_slice(&swapped.to_le_bytes());
|
||||
i += 4;
|
||||
}
|
||||
}
|
||||
|
||||
/// Decode a k_8_8_8_8 texture out of guest memory into `Rgba8Unorm` bytes.
|
||||
/// Applies Xenos→host channel swizzle (Xbox 360 stores BGRA in memory →
|
||||
/// we emit RGBA for wgpu) and the declared endian swap, then detiles via
|
||||
/// the Xenos Tiled2D formula.
|
||||
pub fn decode_k8888_tiled(
|
||||
key: &TextureKey,
|
||||
mem: &dyn xenia_memory::MemoryAccess,
|
||||
) -> Result<Vec<u8>, DecodeError> {
|
||||
if key.width == 0 || key.height == 0 {
|
||||
return Err(DecodeError::ZeroSize);
|
||||
}
|
||||
let w = key.width as u32;
|
||||
let h = key.height as u32;
|
||||
let pitch_aligned = tiled_address::align_pitch_to_macro_tile(key.pitch_texels as u32);
|
||||
let total_bytes = (pitch_aligned * h * 4) as usize;
|
||||
let mut raw = read_guest_bytes(mem, key.base_address, total_bytes);
|
||||
if raw.len() < total_bytes {
|
||||
return Err(DecodeError::OutOfBounds);
|
||||
}
|
||||
apply_endian_32(&mut raw, key.endian);
|
||||
let mut linear = vec![0u8; (w * h * 4) as usize];
|
||||
if key.tiled {
|
||||
if tiled_address::detile_2d(&raw, &mut linear, w, h, pitch_aligned, 4).is_err() {
|
||||
return Err(DecodeError::OutOfBounds);
|
||||
}
|
||||
} else {
|
||||
// Non-tiled copy row-by-row honoring pitch.
|
||||
for y in 0..h as usize {
|
||||
let src = y * (pitch_aligned as usize) * 4;
|
||||
let dst = y * (w as usize) * 4;
|
||||
linear[dst..dst + (w as usize) * 4]
|
||||
.copy_from_slice(&raw[src..src + (w as usize) * 4]);
|
||||
}
|
||||
}
|
||||
// Xenos stores `k_8_8_8_8` in ARGB byte order (high nibble = A). After
|
||||
// endian.Swap8In32 guests' typical per-dword byte order becomes BGRA
|
||||
// in little-endian host bytes. Swap B↔R so we hand Rgba8Unorm to wgpu.
|
||||
for px in linear.chunks_exact_mut(4) {
|
||||
px.swap(0, 2);
|
||||
}
|
||||
Ok(linear)
|
||||
}
|
||||
|
||||
/// Decode a DXT-compressed texture to raw block bytes (no format
|
||||
/// conversion — wgpu understands `Bc{1,2,3}RgbaUnorm` natively so the
|
||||
/// GPU does the actual decompression on upload).
|
||||
///
|
||||
/// Xenos stores DXT blocks in 4×4 block-tiled order using the Tiled2D
|
||||
/// formula, with stride counted in blocks. `bytes_per_block` is 8 for
|
||||
/// BC1 (DXT1), 16 for BC2 (DXT2_3) and BC3 (DXT4_5).
|
||||
pub fn decode_dxt_tiled(
|
||||
key: &TextureKey,
|
||||
mem: &dyn xenia_memory::MemoryAccess,
|
||||
bytes_per_block: u32,
|
||||
) -> Result<Vec<u8>, DecodeError> {
|
||||
if key.width == 0 || key.height == 0 {
|
||||
return Err(DecodeError::ZeroSize);
|
||||
}
|
||||
let block_w = 4u32;
|
||||
let block_h = 4u32;
|
||||
let w_blocks = (key.width as u32).div_ceil(block_w);
|
||||
let h_blocks = (key.height as u32).div_ceil(block_h);
|
||||
let pitch_blocks = tiled_address::align_pitch_to_macro_tile(
|
||||
(key.pitch_texels as u32).div_ceil(block_w),
|
||||
);
|
||||
let total_bytes = (pitch_blocks * h_blocks * bytes_per_block) as usize;
|
||||
let mut raw = read_guest_bytes(mem, key.base_address, total_bytes);
|
||||
if raw.len() < total_bytes {
|
||||
return Err(DecodeError::OutOfBounds);
|
||||
}
|
||||
// DXT blocks are stored as 4×u16 + 4×u8-indices (BC1) or similar
|
||||
// u16/u32-width fields for BC2/BC3; the Xbox 360's big-endian word
|
||||
// order requires an endian swap at the u16/u32 level regardless of
|
||||
// which BC-family format.
|
||||
apply_endian_32(&mut raw, key.endian);
|
||||
|
||||
let mut out = vec![0u8; (w_blocks * h_blocks * bytes_per_block) as usize];
|
||||
if key.tiled {
|
||||
if tiled_address::detile_2d(
|
||||
&raw,
|
||||
&mut out,
|
||||
w_blocks,
|
||||
h_blocks,
|
||||
pitch_blocks,
|
||||
bytes_per_block,
|
||||
)
|
||||
.is_err()
|
||||
{
|
||||
return Err(DecodeError::OutOfBounds);
|
||||
}
|
||||
} else {
|
||||
for y in 0..h_blocks as usize {
|
||||
let src = y * (pitch_blocks as usize) * (bytes_per_block as usize);
|
||||
let dst = y * (w_blocks as usize) * (bytes_per_block as usize);
|
||||
out[dst..dst + (w_blocks as usize) * (bytes_per_block as usize)]
|
||||
.copy_from_slice(&raw[src..src + (w_blocks as usize) * (bytes_per_block as usize)]);
|
||||
}
|
||||
}
|
||||
Ok(out)
|
||||
}
|
||||
|
||||
/// BC1 / DXT1 — 8-byte blocks.
|
||||
pub fn decode_dxt1_tiled(
|
||||
key: &TextureKey,
|
||||
mem: &dyn xenia_memory::MemoryAccess,
|
||||
) -> Result<Vec<u8>, DecodeError> {
|
||||
decode_dxt_tiled(key, mem, 8)
|
||||
}
|
||||
|
||||
/// BC2 / DXT2_3 — 16-byte blocks.
|
||||
pub fn decode_dxt23_tiled(
|
||||
key: &TextureKey,
|
||||
mem: &dyn xenia_memory::MemoryAccess,
|
||||
) -> Result<Vec<u8>, DecodeError> {
|
||||
decode_dxt_tiled(key, mem, 16)
|
||||
}
|
||||
|
||||
/// BC3 / DXT4_5 — 16-byte blocks.
|
||||
pub fn decode_dxt45_tiled(
|
||||
key: &TextureKey,
|
||||
mem: &dyn xenia_memory::MemoryAccess,
|
||||
) -> Result<Vec<u8>, DecodeError> {
|
||||
decode_dxt_tiled(key, mem, 16)
|
||||
}
|
||||
|
||||
/// **k_5_6_5** — 16-bit R:5 G:6 B:5 per texel (Xbox stores R in the high
|
||||
/// 5 bits of the 16-bit word). We unpack each texel into 4 bytes of
|
||||
/// `Rgba8Unorm` (A = 0xFF). wgpu doesn't ship `R5G6B5Unorm` as a
|
||||
/// sampled texture format on every backend, so CPU-side conversion is
|
||||
/// the safe path even if it's 2× the texture memory.
|
||||
///
|
||||
/// Tiling: Tiled2D at the **texel** level (block = 1 texel = 2 bytes),
|
||||
/// then we expand each 2-byte u16 into the 4-byte Rgba8 in the linear
|
||||
/// output buffer.
|
||||
pub fn decode_k565_tiled(
|
||||
key: &TextureKey,
|
||||
mem: &dyn xenia_memory::MemoryAccess,
|
||||
) -> Result<Vec<u8>, DecodeError> {
|
||||
if key.width == 0 || key.height == 0 {
|
||||
return Err(DecodeError::ZeroSize);
|
||||
}
|
||||
let w = key.width as u32;
|
||||
let h = key.height as u32;
|
||||
// Pitch/block counts — block = 1 texel here, 2 bytes.
|
||||
let pitch_aligned = tiled_address::align_pitch_to_macro_tile(key.pitch_texels as u32);
|
||||
let total_bytes = (pitch_aligned * h * 2) as usize;
|
||||
let mut raw = read_guest_bytes(mem, key.base_address, total_bytes);
|
||||
if raw.len() < total_bytes {
|
||||
return Err(DecodeError::OutOfBounds);
|
||||
}
|
||||
// 16-bit word order is endian-swap-sensitive.
|
||||
apply_endian_32(&mut raw, key.endian);
|
||||
// Step 1: detile (bytes_per_block=2, tile in blocks=texels).
|
||||
let mut linear_u16 = vec![0u8; (w * h * 2) as usize];
|
||||
if key.tiled {
|
||||
if tiled_address::detile_2d(&raw, &mut linear_u16, w, h, pitch_aligned, 2).is_err() {
|
||||
return Err(DecodeError::OutOfBounds);
|
||||
}
|
||||
} else {
|
||||
for y in 0..h as usize {
|
||||
let src = y * (pitch_aligned as usize) * 2;
|
||||
let dst = y * (w as usize) * 2;
|
||||
linear_u16[dst..dst + (w as usize) * 2]
|
||||
.copy_from_slice(&raw[src..src + (w as usize) * 2]);
|
||||
}
|
||||
}
|
||||
// Step 2: expand each 16-bit RGB565 to Rgba8Unorm. The in-memory u16
|
||||
// is little-endian after `apply_endian_32` has normalized the word
|
||||
// order (we keep host-native byte ordering post-swap).
|
||||
let mut rgba = vec![0u8; (w * h * 4) as usize];
|
||||
for y in 0..h as usize {
|
||||
for x in 0..w as usize {
|
||||
let off = (y * w as usize + x) * 2;
|
||||
let lo = linear_u16[off];
|
||||
let hi = linear_u16[off + 1];
|
||||
let word = u16::from_le_bytes([lo, hi]);
|
||||
// 5 bits R (bits 11-15), 6 bits G (5-10), 5 bits B (0-4).
|
||||
// Expand to full-range u8: replicate high bits into low
|
||||
// (so 0b11111 → 0xFF, matching the standard 565→888 convention).
|
||||
let r5 = ((word >> 11) & 0x1F) as u8;
|
||||
let g6 = ((word >> 5) & 0x3F) as u8;
|
||||
let b5 = (word & 0x1F) as u8;
|
||||
let r = (r5 << 3) | (r5 >> 2);
|
||||
let g = (g6 << 2) | (g6 >> 4);
|
||||
let b = (b5 << 3) | (b5 >> 2);
|
||||
let o = (y * w as usize + x) * 4;
|
||||
rgba[o] = r;
|
||||
rgba[o + 1] = g;
|
||||
rgba[o + 2] = b;
|
||||
rgba[o + 3] = 0xFF;
|
||||
}
|
||||
}
|
||||
Ok(rgba)
|
||||
}
|
||||
|
||||
/// Version-aware CPU-side texture cache. Entries are keyed on
|
||||
/// `TextureKey.hash` and carry a `version_when_uploaded` watermark against
|
||||
/// the guest memory's page-version counter. `ensure_cached` queries
|
||||
/// `GuestMemory::max_page_version` over the texture's byte span; if the
|
||||
/// span has been written since cache time, the entry is re-decoded.
|
||||
pub struct TextureCache {
|
||||
entries: HashMap<TextureKey, CachedTexture>,
|
||||
/// Monotonic counter of decodes performed — HUD surface.
|
||||
pub decodes_total: u64,
|
||||
/// Count of stale-miss re-decodes.
|
||||
pub restale_total: u64,
|
||||
}
|
||||
|
||||
impl Default for TextureCache {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
impl TextureCache {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
entries: HashMap::new(),
|
||||
decodes_total: 0,
|
||||
restale_total: 0,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn len(&self) -> usize {
|
||||
self.entries.len()
|
||||
}
|
||||
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.entries.is_empty()
|
||||
}
|
||||
|
||||
pub fn get(&self, key: &TextureKey) -> Option<&CachedTexture> {
|
||||
self.entries.get(key)
|
||||
}
|
||||
|
||||
/// Return a cached (or freshly-decoded) texture. The caller supplies
|
||||
/// the current guest-memory page version covering the texture span;
|
||||
/// see [`max_page_version_for`].
|
||||
pub fn ensure_cached(
|
||||
&mut self,
|
||||
key: TextureKey,
|
||||
current_version: u64,
|
||||
mem: &dyn xenia_memory::MemoryAccess,
|
||||
) -> Result<&CachedTexture, DecodeError> {
|
||||
// Fast path: fresh entry exists.
|
||||
if let Some(e) = self.entries.get(&key) {
|
||||
if e.version_when_uploaded >= current_version {
|
||||
return Ok(self.entries.get(&key).unwrap());
|
||||
}
|
||||
self.restale_total += 1;
|
||||
}
|
||||
let bytes = match key.format {
|
||||
TextureFormat::K8888 => decode_k8888_tiled(&key, mem)?,
|
||||
TextureFormat::K565 => decode_k565_tiled(&key, mem)?,
|
||||
TextureFormat::Dxt1 => decode_dxt1_tiled(&key, mem)?,
|
||||
TextureFormat::Dxt2_3 => decode_dxt23_tiled(&key, mem)?,
|
||||
TextureFormat::Dxt4_5 => decode_dxt45_tiled(&key, mem)?,
|
||||
_ => return Err(DecodeError::UnsupportedFormat),
|
||||
};
|
||||
self.decodes_total += 1;
|
||||
let entry = CachedTexture {
|
||||
key,
|
||||
version_when_uploaded: current_version,
|
||||
bytes,
|
||||
};
|
||||
self.entries.insert(key, entry);
|
||||
Ok(self.entries.get(&key).unwrap())
|
||||
}
|
||||
|
||||
pub fn byte_budget(&self) -> usize {
|
||||
self.entries.values().map(|e| e.byte_size()).sum()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    use std::cell::Cell;

    /// Flat in-memory guest RAM for tests. Reads past the end yield 0 and
    /// writes past the end are dropped. Multi-byte accessors are big-endian.
    struct FakeMem(Box<[Cell<u8>]>);

    impl FakeMem {
        fn from_vec(v: Vec<u8>) -> Self {
            FakeMem(v.into_iter().map(Cell::new).collect())
        }
    }

    impl xenia_memory::MemoryAccess for FakeMem {
        fn read_u8(&self, a: u32) -> u8 {
            self.0.get(a as usize).map(|c| c.get()).unwrap_or(0)
        }
        fn read_u16(&self, a: u32) -> u16 {
            u16::from_be_bytes([self.read_u8(a), self.read_u8(a + 1)])
        }
        fn read_u32(&self, a: u32) -> u32 {
            u32::from_be_bytes([
                self.read_u8(a),
                self.read_u8(a + 1),
                self.read_u8(a + 2),
                self.read_u8(a + 3),
            ])
        }
        fn read_u64(&self, a: u32) -> u64 {
            u64::from_be_bytes([
                self.read_u8(a),
                self.read_u8(a + 1),
                self.read_u8(a + 2),
                self.read_u8(a + 3),
                self.read_u8(a + 4),
                self.read_u8(a + 5),
                self.read_u8(a + 6),
                self.read_u8(a + 7),
            ])
        }
        fn write_u8(&self, a: u32, v: u8) {
            if let Some(slot) = self.0.get(a as usize) {
                slot.set(v);
            }
        }
        fn write_u16(&self, a: u32, v: u16) {
            let b = v.to_be_bytes();
            self.write_u8(a, b[0]);
            self.write_u8(a + 1, b[1]);
        }
        fn write_u32(&self, a: u32, v: u32) {
            for (offset, byte) in v.to_be_bytes().iter().enumerate() {
                self.write_u8(a + offset as u32, *byte);
            }
        }
        fn write_u64(&self, a: u32, v: u64) {
            for (offset, byte) in v.to_be_bytes().iter().enumerate() {
                self.write_u8(a + offset as u32, *byte);
            }
        }
        fn translate(&self, _: u32) -> Option<*const u8> {
            None
        }
        fn translate_mut(&self, _: u32) -> Option<*mut u8> {
            None
        }
    }

    #[test]
    fn format_block_info_matches_canary_expectations() {
        assert_eq!(
            TextureFormat::K8888.block_info(),
            BlockInfo::new(1, 1, 4)
        );
        assert_eq!(TextureFormat::Dxt1.block_info(), BlockInfo::new(4, 4, 8));
        assert_eq!(
            TextureFormat::Dxt4_5.block_info(),
            BlockInfo::new(4, 4, 16)
        );
    }

    #[test]
    fn endian_swap_variants() {
        assert_eq!(Endian::None.swap32(0x11223344), 0x11223344);
        assert_eq!(Endian::Swap8In16.swap32(0x11223344), 0x22114433);
        assert_eq!(Endian::Swap8In32.swap32(0x11223344), 0x44332211);
        assert_eq!(Endian::Swap16In32.swap32(0x11223344), 0x33441122);
    }

    #[test]
    fn decode_fetch_constant_rejects_empty() {
        let z = [0u32; 6];
        assert!(decode_fetch_constant(z).is_none());
    }

    #[test]
    fn decode_fetch_constant_parses_2d_k8888() {
        // Build a synthetic k_8_8_8_8 2D texture fetch constant:
        //   dword0: pitch_5=40 (1280/32), tiled=1, type=2
        //   dword1: format=6 (K8888), endian=2 (Swap8In32), base=0xAB000
        //   dword2: width-1=1279, height-1=719
        //   dword5: dimension=1 (2D)
        let d0 = 0x8000_0000 | (40u32 << 22) | 2;
        let d1 = (0xAB000u32 >> 12 << 12) | (2u32 << 6) | 6u32;
        let d2 = 1279u32 | (719u32 << 13);
        let d5 = 1u32 << 9;
        let k = decode_fetch_constant([d0, d1, d2, 0, 0, d5]).expect("parsed");
        assert_eq!(k.format, TextureFormat::K8888);
        assert_eq!(k.endian, Endian::Swap8In32);
        assert_eq!(k.width, 1280);
        assert_eq!(k.height, 720);
        assert_eq!(k.dimension, Dimension::D2);
        assert!(k.tiled);
        assert_eq!(k.pitch_texels, 1280);
    }

    #[test]
    fn decode_k8888_roundtrip_linear() {
        // Build a 4×4 non-tiled image with pitch=32 (one macro-tile row).
        // Each pixel at (x, y) stores ARGB = (0xFF, x, y, y*4+x) as a
        // big-endian dword. After Swap8In32 + B↔R swizzle, out[off..] must
        // be (x, y, y*4+x, 0xFF) in RGBA order.
        let w = 4u32;
        let h = 4u32;
        let pitch = 32u32;
        let mut bytes = vec![0u8; (pitch * h * 4) as usize];
        for y in 0..h {
            for x in 0..w {
                let off = ((y * pitch + x) * 4) as usize;
                let argb = (0xFFu32 << 24) | (x << 16) | (y << 8) | (y * 4 + x);
                bytes[off..off + 4].copy_from_slice(&argb.to_be_bytes());
            }
        }
        let mem = FakeMem::from_vec(bytes);
        let key = TextureKey {
            base_address: 0,
            width: 4,
            height: 4,
            depth_or_slices: 1,
            format: TextureFormat::K8888,
            endian: Endian::Swap8In32,
            dimension: Dimension::D2,
            tiled: false,
            pitch_texels: pitch as u16,
        };
        let out = decode_k8888_tiled(&key, &mem).expect("decode");
        assert_eq!(out.len(), 16 * 4);
        assert_eq!(&out[0..4], &[0, 0, 0, 0xFF]);
        let off = ((3 * 4 + 3) * 4) as usize;
        assert_eq!(&out[off..off + 4], &[3, 3, 15, 0xFF]);
    }

    // ── First-Pixels M5 format tests ──────────────────────────────

    /// BC2 (DXT2_3) roundtrip: 16-byte blocks, 4×4 image = 1 block.
    /// The first block of the source holds the bytes 0..16; assert the
    /// decoder returns them unchanged (passthrough with no endian swap).
    #[test]
    fn decode_dxt23_small_roundtrip() {
        // 4×4 texture = 1 BC2 block (16 bytes). With pitch_texels=32 the
        // block pitch is at least 8 (=32/4) before macro-tile alignment;
        // we provide 128 bytes of source and `FakeMem` reads past that
        // as zeros.
        let mut bytes = vec![0u8; 128];
        for (i, b) in bytes.iter_mut().enumerate().take(16) {
            *b = i as u8;
        }
        let mem = FakeMem::from_vec(bytes);
        let key = TextureKey {
            base_address: 0,
            width: 4,
            height: 4,
            depth_or_slices: 1,
            format: TextureFormat::Dxt2_3,
            endian: Endian::None, // no swap — we can eyeball passthrough
            dimension: Dimension::D2,
            tiled: false,
            pitch_texels: 32,
        };
        let out = decode_dxt23_tiled(&key, &mem).expect("decode");
        assert_eq!(out.len(), 16); // 1 block × 16 bytes
        for (i, byte) in out.iter().enumerate() {
            assert_eq!(*byte, i as u8);
        }
    }

    /// BC3 (DXT4_5) uses the same 16-byte block infra as BC2; a
    /// parallel test prevents a regression that sneaks up via the
    /// generic `decode_dxt_tiled`.
    #[test]
    fn decode_dxt45_uses_16byte_blocks() {
        let mem = FakeMem::from_vec(vec![0xAAu8; 256]);
        let key = TextureKey {
            base_address: 0,
            width: 8,
            height: 4, // 2×1 blocks
            depth_or_slices: 1,
            format: TextureFormat::Dxt4_5,
            endian: Endian::None,
            dimension: Dimension::D2,
            tiled: false,
            pitch_texels: 32,
        };
        let out = decode_dxt45_tiled(&key, &mem).expect("decode");
        assert_eq!(out.len(), 2 * 16);
    }

    /// k_5_6_5: a single white texel (all bits set, 0xFFFF) should
    /// expand to RGBA8 white (0xFF, 0xFF, 0xFF, 0xFF). A single pure-red
    /// texel (R=31, G=0, B=0 → word 0xF800) should expand to R=255 G=0
    /// B=0 via the high-bit-replicate convention.
    #[test]
    fn decode_k565_texel_expansion() {
        // Memory layout for a 2×1 non-tiled k_5_6_5 image (pitch=32 texels
        // → 32 × 1 × 2 = 64 bytes). We store texel[0] = 0xFFFF (white),
        // texel[1] = 0xF800 (pure red).
        let mut bytes = vec![0u8; 64];
        // 0xFFFF
        bytes[0] = 0xFF;
        bytes[1] = 0xFF;
        // 0xF800: with Endian::None no swap happens and the decoder reads
        // each texel as a little-endian u16, so memory must carry the
        // bytes in LE order (low byte 0x00 first, then 0xF8).
        bytes[2] = 0x00;
        bytes[3] = 0xF8;
        let mem = FakeMem::from_vec(bytes);
        let key = TextureKey {
            base_address: 0,
            width: 2,
            height: 1,
            depth_or_slices: 1,
            format: TextureFormat::K565,
            endian: Endian::None,
            dimension: Dimension::D2,
            tiled: false,
            pitch_texels: 32,
        };
        let out = decode_k565_tiled(&key, &mem).expect("decode");
        assert_eq!(out.len(), 2 * 4);
        // Texel 0: white.
        assert_eq!(&out[0..4], &[0xFF, 0xFF, 0xFF, 0xFF]);
        // Texel 1: pure red via 5-bit-expand (0b11111 → 0xFF).
        assert_eq!(&out[4..8], &[0xFF, 0x00, 0x00, 0xFF]);
    }

    #[test]
    fn is_host_supported_covers_m5_formats() {
        assert!(TextureFormat::K8888.is_host_supported());
        assert!(TextureFormat::K565.is_host_supported());
        assert!(TextureFormat::Dxt1.is_host_supported());
        assert!(TextureFormat::Dxt2_3.is_host_supported());
        assert!(TextureFormat::Dxt4_5.is_host_supported());
        // Unsupported formats should still report false.
        assert!(!TextureFormat::K16.is_host_supported());
        assert!(!TextureFormat::K32Float.is_host_supported());
    }

    #[test]
    fn texture_cache_caches_and_reuses() {
        let mut cache = TextureCache::new();
        let mem = FakeMem::from_vec(vec![0u8; 8 * 1024]);
        let key = TextureKey {
            base_address: 0,
            width: 4,
            height: 4,
            depth_or_slices: 1,
            format: TextureFormat::K8888,
            endian: Endian::None,
            dimension: Dimension::D2,
            tiled: false,
            pitch_texels: 32,
        };
        cache.ensure_cached(key, 0, &mem).unwrap();
        assert_eq!(cache.decodes_total, 1);
        // Same version: should hit cache.
        cache.ensure_cached(key, 0, &mem).unwrap();
        assert_eq!(cache.decodes_total, 1);
        // Higher version: stale → re-decode.
        cache.ensure_cached(key, 1, &mem).unwrap();
        assert_eq!(cache.decodes_total, 2);
        assert_eq!(cache.restale_total, 1);
    }

    /// End-to-end P5 test: a 6-dword fetch constant → decoded `TextureKey`
    /// → `ensure_cached` on fresh/version-bumped memory → stale re-decode.
    /// Mirrors what `vd_swap` does per frame.
    #[test]
    fn e2e_fetch_const_to_cache_with_versioning() {
        // 4×4 k_8_8_8_8 2D tiled texture, pitch=32 aligned. The base field
        // has 4 KiB granularity, so `0x100 >> 12 << 12` yields base 0.
        let d0 = 0x8000_0000u32 | (1u32 << 22) | 2; // pitch_5=1, tiled, type=2
        let d1 = (0x100u32 >> 12 << 12) | (0u32 << 6) | 6; // K8888, endian=none
        let d2 = 3u32 | (3u32 << 13); // width-1=3, height-1=3
        let d5 = 1u32 << 9; // 2D
        let key = decode_fetch_constant([d0, d1, d2, 0, 0, d5]).expect("decoded");
        assert_eq!(key.format, TextureFormat::K8888);
        assert_eq!(key.width, 4);

        let mem = FakeMem::from_vec(vec![0xAAu8; 4 * 1024]);
        let mut cache = TextureCache::new();
        // v0 decode.
        let first = cache
            .ensure_cached(key, 0, &mem)
            .expect("initial decode")
            .clone();
        // Same version → cache hit.
        cache.ensure_cached(key, 0, &mem).expect("hit");
        assert_eq!(cache.decodes_total, 1);
        // Simulate the guest writing to the texture's pages: version bumps.
        for b in &mem.0[..16] {
            b.set(0xFF);
        }
        cache.ensure_cached(key, 1, &mem).expect("re-decode");
        assert_eq!(cache.decodes_total, 2);
        assert_eq!(cache.restale_total, 1);
        // Bytes differ from v0 (proof the re-decode happened).
        let second = cache.get(&key).unwrap();
        assert_ne!(first.bytes, second.bytes);
    }

    #[test]
    fn texture_cache_rejects_unsupported_format() {
        let mut cache = TextureCache::new();
        let mem = FakeMem::from_vec(vec![0u8; 1024]);
        let key = TextureKey {
            base_address: 0,
            width: 4,
            height: 4,
            depth_or_slices: 1,
            format: TextureFormat::K16,
            endian: Endian::None,
            dimension: Dimension::D2,
            tiled: false,
            pitch_texels: 32,
        };
        assert!(matches!(
            cache.ensure_cached(key, 0, &mem),
            Err(DecodeError::UnsupportedFormat)
        ));
    }
}
|
||||
Reference in New Issue
Block a user