xenia-gpu: end-to-end Xenos pipeline (PM4, ucode, EDRAM, resolve)

First real GPU implementation. Ring/PM4 frontend (ring_view, ring_drain, pm4) drains the command processor; gpu_system owns the threaded backend (DrainFence RPC + parker/fence helpers from M1) and the MMIO-mapped register block (mmio_region). Xenos shader frontend: ucode/{alu,control_flow,fetch,mod}.rs decode the Xbox 360 microcode, translator.rs lowers it onto the WGSL xenos_interp interpreter shader (shaders/xenos_interp.wgsl). shader_metrics.rs counts decode/translate work. Render state: draw_state, primitive, render_target_cache, texture_cache, tiled_address (Xenos's swizzled tiled-memory layout), xenos_constants (register field constants), edram (the 10 MiB EDRAM model with MSAA), and resolve.rs (TILE_FLUSH copy-out — clear-resolve plus bitwise-equivalent 32 bpp + 64 bpp paths landed). handle.rs owns the typed GPU-resource handles the kernel hands out. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-01 16:29:38 +02:00
parent 5f0d6487ea
commit 79eb52c378
24 changed files with 10984 additions and 18 deletions
--- a/crates/xenia-gpu/src/tiled_address.rs
+++ b/crates/xenia-gpu/src/tiled_address.rs
@@ -0,0 +1,178 @@
+//! Xenos tiled-texture address formula (2D, Tiled2D layout).
+//!
+//! Port of `xenia-canary/src/xenia/gpu/texture_address.h:190-208` Tiled2D /
+//! `TiledCombine` helpers. The Xbox 360 GPU stores textures in a 32×32-block
+//! macro-tile pattern with bank+pipe interleave for its internal DRAM
+//! banks; this formula inverts that so we can read pixels out in linear
+//! order, given the tiled source buffer.
+//!
+//! We use this in two places during P4:
+//!  - `vd_swap` frontbuffer detile (1280×720 k_8_8_8_8 usually).
+//!  - Any place we need to read tiled guest memory into a host-linear
+//!    buffer for CPU-side conversion before upload.
+
+/// Tile size constants from canary, stored as log2 of the edge length.
+pub const MACRO_TILE_WIDTH_LOG2: u32 = 5; // 1 << 5 = 32 px
+pub const MACRO_TILE_HEIGHT_2D_LOG2: u32 = 5; // 1 << 5 = 32 px
+
+/// Canary's `TiledCombine` helper. Builds the final tiled byte address by
+/// placing `y_lsb` at bit 4, `pipe` from bit 6 and `bank` from bit 11, and
+/// spreading the bits of `outer_inner_bytes` over the remaining positions.
+#[inline]
+fn tiled_combine(outer_inner_bytes: u32, bank: u32, pipe: u32, y_lsb: u32) -> u32 {
+    let interleave = (y_lsb << 4) | (pipe << 6) | (bank << 11);
+    let bits_0_3 = outer_inner_bytes & 0xF; // stay at bits 0-3
+    let bit_4 = (outer_inner_bytes >> 4) & 0x1; // moves to bit 5
+    let bits_5_7 = (outer_inner_bytes >> 5) & 0x7; // move to bits 8-10
+    let bits_8_up = outer_inner_bytes >> 8; // move to bits 12 and above
+    interleave | bits_0_3 | (bit_4 << 5) | (bits_5_7 << 8) | (bits_8_up << 12)
+}
+
+/// Byte offset of block `(x, y)` inside a 2D-tiled surface.
+///
+/// `pitch_aligned` is the surface pitch already rounded up to the macro-tile
+/// width, and `bytes_per_block_log2` is log2 of the block size in bytes
+/// (RGBA8888 has 4-byte blocks, so it passes 2). The result is a `u32`
+/// instead of canary's signed `int` because callers here work in unsigned
+/// arithmetic throughout.
+///
+/// Canonical formula from `texture_address.h:190-208`: the bit interleave is
+/// not a linear function of `(x, y)`, so re-read canary before touching it.
+pub fn tiled_2d_offset(x: u32, y: u32, pitch_aligned: u32, bytes_per_block_log2: u32) -> u32 {
+    // Row-major index of the 32×32 macro tile, shifted left by 6 to leave
+    // room for the inner-block bits below.
+    let tiles_per_row = pitch_aligned >> MACRO_TILE_WIDTH_LOG2;
+    let tile_x = x >> MACRO_TILE_WIDTH_LOG2;
+    let tile_y = y >> MACRO_TILE_HEIGHT_2D_LOG2;
+    let macro_part = (tile_y * tiles_per_row + tile_x) << 6;
+    // Position inside the tile: x bits 0-2 and y bits 1-3. y bit 0 is left
+    // out here because it is passed separately as `y_lsb`.
+    let micro_part = (((y >> 1) & 0b111) << 3) | (x & 0b111);
+    let byte_index = (macro_part | micro_part) << bytes_per_block_log2;
+
+    let y_lsb = y & 1;
+    let bank = (y >> 4) & 0b1;
+    let pipe = ((x >> 3) & 0b11) ^ (((y >> 3) & 0b1) << 1);
+    tiled_combine(byte_index, bank, pipe, y_lsb)
+}
+
+/// Pads `pitch_pixels` up to the next multiple of the 32 px macro-tile width;
+/// values that are already a multiple of 32 are returned unchanged.
+#[inline]
+pub fn align_pitch_to_macro_tile(pitch_pixels: u32) -> u32 {
+    let tile_width = 1u32 << MACRO_TILE_WIDTH_LOG2;
+    (pitch_pixels + (tile_width - 1)) & !(tile_width - 1)
+}
+
+/// Detile a 2D tiled surface into a linear destination buffer.
+///
+/// Reads `width × height` blocks of `bpp` bytes each from the tiled `src`
+/// and writes them row-major into `dst` with a row stride of `width * bpp`
+/// bytes. `bpp` is bytes per block (4 for RGBA8888, 2 for a 16-bit format)
+/// and must be a power of two, because the address formula takes its log2.
+/// `pitch_pixels` is rounded up to the macro-tile width before use.
+///
+/// Returns `Err(())` if `bpp` is zero or not a power of two, if `dst` is
+/// shorter than `width * height * bpp` bytes, or if `src` is too short for
+/// an offset the formula produces. `dst` is left untouched in the first two
+/// cases; a short `src` may leave it partially written.
+pub fn detile_2d(
+    src: &[u8],
+    dst: &mut [u8],
+    width: u32,
+    height: u32,
+    pitch_pixels: u32,
+    bpp: u32,
+) -> Result<(), ()> {
+    // `trailing_zeros` is only log2 for powers of two (and is 32 for zero).
+    if !bpp.is_power_of_two() {
+        return Err(());
+    }
+    let (bpp_log2, block_len) = (bpp.trailing_zeros(), bpp as usize);
+    let pitch_aligned = align_pitch_to_macro_tile(pitch_pixels);
+    // Size the output in `usize` so `width * bpp` cannot wrap in `u32`.
+    let row_len = (width as usize).checked_mul(block_len).ok_or(())?;
+    if row_len.checked_mul(height as usize).ok_or(())? > dst.len() {
+        return Err(());
+    }
+    for (y, row) in (0..height).zip(dst.chunks_exact_mut(row_len.max(1))) {
+        for (x, out) in (0..width).zip(row.chunks_exact_mut(block_len)) {
+            let start = tiled_2d_offset(x, y, pitch_aligned, bpp_log2) as usize;
+            let block = src.get(start..).and_then(|tail| tail.get(..block_len));
+            out.copy_from_slice(block.ok_or(())?);
+        }
+    }
+    Ok(())
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Block (0, 0) sits at byte offset 0 (checked for pitch 64, 4-byte blocks).
+    #[test]
+    fn origin_is_zero() {
+        assert_eq!(tiled_2d_offset(0, 0, 64, 2), 0);
+    }
+
+    /// Round-trip: a tiled buffer filled through `tiled_2d_offset` must come
+    /// back out of `detile_2d` as the original linear image.
+    #[test]
+    fn roundtrip_small_pattern() {
+        let (w, h, bpp) = (32u32, 16u32, 4u32);
+        let bpp_log2 = bpp.trailing_zeros();
+        let block_len = bpp as usize;
+        let pitch = align_pitch_to_macro_tile(w);
+        // Every block is tagged with (x, y, x ^ y, 0xFF) so that a misplaced
+        // block is recognisable in the output.
+        let tag = |x: u32, y: u32| [x as u8, y as u8, (x ^ y) as u8, 0xFF];
+
+        // The tiled buffer has to reach the end of the furthest block.
+        let tiled_len = (0..h)
+            .flat_map(|y| (0..w).map(move |x| tiled_2d_offset(x, y, pitch, bpp_log2)))
+            .map(|off| off as usize + block_len)
+            .max()
+            .unwrap();
+        let mut tiled = vec![0u8; tiled_len];
+        for y in 0..h {
+            for x in 0..w {
+                let off = tiled_2d_offset(x, y, pitch, bpp_log2) as usize;
+                tiled[off..off + block_len].copy_from_slice(&tag(x, y));
+            }
+        }
+
+        // The linear buffer is exactly `w * h` blocks long.
+        let mut linear = vec![0u8; (w * h * bpp) as usize];
+        detile_2d(&tiled, &mut linear, w, h, pitch, bpp).expect("detile ok");
+
+        // Each block must now sit at its row-major position.
+        for y in 0..h {
+            for x in 0..w {
+                let lin = ((y * w + x) * bpp) as usize;
+                assert_eq!(linear[lin..lin + block_len], tag(x, y), "block ({}, {})", x, y);
+            }
+        }
+    }
+
+    /// Offsets must be unique across a 32×16 region of 4-byte blocks, i.e.
+    /// no two blocks in that region alias the same tiled location.
+    #[test]
+    fn neighbouring_pixels_have_distinct_offsets() {
+        let mut seen = std::collections::HashSet::new();
+        for y in 0..16 {
+            for x in 0..32 {
+                let off = tiled_2d_offset(x, y, 32, 2);
+                assert!(seen.insert(off), "offset {} reused at ({}, {})", off, x, y);
+            }
+        }
+    }
+
+    /// Multiples of 32 are unchanged; everything else rounds up to the next
+    /// multiple of 32.
+    #[test]
+    fn align_pitch_rounds_up_to_32() {
+        assert_eq!(align_pitch_to_macro_tile(1280), 1280);
+        assert_eq!(align_pitch_to_macro_tile(1281), 1312);
+        assert_eq!(align_pitch_to_macro_tile(31), 32);
+    }
+}