xenia-gpu: end-to-end Xenos pipeline (PM4, ucode, EDRAM, resolve)

First real GPU implementation. Ring/PM4 frontend (ring_view,
ring_drain, pm4) drains the command processor; gpu_system owns the
threaded backend (DrainFence RPC + parker/fence helpers from M1) and
the MMIO-mapped register block (mmio_region).

Xenos shader frontend: ucode/{alu,control_flow,fetch,mod}.rs decode
the Xbox 360 microcode, translator.rs lowers it onto the WGSL
xenos_interp interpreter shader (shaders/xenos_interp.wgsl).
shader_metrics.rs counts decode/translate work.

Render state: draw_state, primitive, render_target_cache,
texture_cache, tiled_address (Xenos's swizzled tiled-memory layout),
xenos_constants (register field constants), edram (the 10 MiB EDRAM
model with MSAA), and resolve.rs (TILE_FLUSH copy-out — clear-resolve
plus bitwise-equivalent 32 bpp + 64 bpp paths landed). handle.rs
owns the typed GPU-resource handles the kernel hands out.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
MechaCat02
2026-05-01 16:29:38 +02:00
parent 5f0d6487ea
commit 79eb52c378
24 changed files with 10984 additions and 18 deletions

View File

@@ -0,0 +1,178 @@
//! Xenos tiled-texture address formula (2D, Tiled2D layout).
//!
//! Port of `xenia-canary/src/xenia/gpu/texture_address.h:190-208` Tiled2D /
//! `TiledCombine` helpers. The Xbox 360 GPU stores textures in a 32×32-block
//! macro-tile pattern with bank+pipe interleave for its internal DRAM
//! banks; this formula inverts that so we can read pixels out in linear
//! order, given the tiled source buffer.
//!
//! We use this in two places during P4:
//! - `vd_swap` frontbuffer detile (1280×720 k_8_8_8_8 usually).
//! - Any place we need to read tiled guest memory into a host-linear
//! buffer for CPU-side conversion before upload.
/// Tile size constants from canary.
///
/// log2 of the macro-tile width in pixels (`1 << 5` = 32).
pub const MACRO_TILE_WIDTH_LOG2: u32 = 5; // 32 px
/// log2 of the macro-tile height in pixels for 2D surfaces (`1 << 5` = 32).
pub const MACRO_TILE_HEIGHT_2D_LOG2: u32 = 5; // 32 px
/// Port of canary's `TiledCombine`: builds the final tiled byte address by
/// interleaving slices of `outer_inner_bytes` with the `y_lsb`, `pipe` and
/// `bank` fields.
///
/// Resulting bit layout (LSB first):
/// - bits 0..=3  — `outer_inner_bytes` bits 0..=3
/// - bit  4      — `y_lsb`
/// - bit  5      — `outer_inner_bytes` bit 4
/// - bits 6..    — `pipe` (shifted left by 6)
/// - bits 8..=10 — `outer_inner_bytes` bits 5..=7
/// - bits 11..   — `bank` (shifted left by 11)
/// - bits 12..   — `outer_inner_bytes` bits 8 and up
///
/// The interleave fields are OR-ed in unmasked; callers are expected to
/// pass values that already fit their slot.
#[inline]
fn tiled_combine(outer_inner_bytes: u32, bank: u32, pipe: u32, y_lsb: u32) -> u32 {
    // Carve the outer/inner byte offset into the four slices that end up
    // in non-contiguous positions of the final address.
    let low_nibble = outer_inner_bytes & 0xF;
    let bit_4 = (outer_inner_bytes >> 4) & 0x1;
    let bits_5_to_7 = (outer_inner_bytes >> 5) & 0x7;
    let upper = outer_inner_bytes >> 8;

    low_nibble
        | (y_lsb << 4)
        | (bit_4 << 5)
        | (pipe << 6)
        | (bits_5_to_7 << 8)
        | (bank << 11)
        | (upper << 12)
}
/// Byte offset of block `(x, y)` inside a 2D-tiled surface.
///
/// `pitch_aligned` is the surface pitch in pixels, already rounded up to
/// the macro-tile width. `bytes_per_block_log2` is log2 of the block size
/// in bytes (e.g. 4-byte RGBA8888 blocks → 2). The result is returned as
/// `u32` where canary uses a signed `int`, since callers here stay in
/// unsigned arithmetic.
///
/// This is the canonical formula from `texture_address.h:190-208`. The
/// bit interleave is not a linear function of `(x, y)`, so do not try to
/// simplify it without re-reading the reference.
pub fn tiled_2d_offset(x: u32, y: u32, pitch_aligned: u32, bytes_per_block_log2: u32) -> u32 {
    // Outer part: index of the 32×32 macro tile containing (x, y), scaled
    // by 64 to leave room for the 6-bit inner index below.
    let tiles_per_row = pitch_aligned >> MACRO_TILE_WIDTH_LOG2;
    let tile_row = y >> MACRO_TILE_HEIGHT_2D_LOG2;
    let tile_col = x >> MACRO_TILE_WIDTH_LOG2;
    let outer = (tile_row * tiles_per_row + tile_col) << 6;

    // Inner part: y bits 1..=3 and x bits 0..=2. y bit 0 is left out here
    // because it is passed separately as the `y_lsb` interleave bit.
    let inner_y = (y >> 1) & 0b111;
    let inner_x = x & 0b111;
    let inner = (inner_y << 3) | inner_x;

    let byte_offset = (outer | inner) << bytes_per_block_log2;

    // Interleave fields derived from the remaining coordinate bits.
    let y_lsb = y & 1;
    let bank = (y >> 4) & 0b1;
    let pipe_from_x = (x >> 3) & 0b11;
    let pipe_from_y = ((y >> 3) & 0b1) << 1;
    let pipe = pipe_from_x ^ pipe_from_y;

    tiled_combine(byte_offset, bank, pipe, y_lsb)
}
/// Rounds `pitch_pixels` up to the next multiple of the macro-tile width
/// (32 px). Values that are already a multiple are returned unchanged.
/// Xenos requires tile-aligned pitches; non-aligned values pad.
#[inline]
pub fn align_pitch_to_macro_tile(pitch_pixels: u32) -> u32 {
    let tile_width = 1u32 << MACRO_TILE_WIDTH_LOG2;
    // Bump past the next boundary, then drop the remainder. The tile width
    // is a power of two, so this equals masking off the low bits.
    let bumped = pitch_pixels + (tile_width - 1);
    bumped - (bumped % tile_width)
}
/// Detile a 2D tiled surface into a linear destination buffer.
///
/// For every pixel `(x, y)` in the `width` × `height` rectangle, one block
/// of `bpp` bytes is read from `src` at the offset given by
/// [`tiled_2d_offset`] and written to `dst` at `(y * width + x) * bpp`,
/// i.e. `dst` is tightly packed with a row pitch of `width * bpp` bytes.
///
/// - `bpp` is the bytes-per-block (4 for RGBA8888, 2 for a 16-bit format
///   such as a1r5g5b5, etc.). It must be a non-zero power of two because
///   the address formula works on `log2(bpp)`.
/// - `pitch_pixels` is the source pitch in pixels; it is rounded up with
///   [`align_pitch_to_macro_tile`] before use.
/// - `dst` must be at least `width * height * bpp` bytes long; any bytes
///   past that are left untouched.
///
/// # Errors
///
/// Returns `Err(())` (defensive — callers can downgrade silently) when:
/// - `bpp` is zero or not a power of two;
/// - `width * height * bpp` overflows `usize` or exceeds `dst.len()`
///   (checked up front, before anything is written);
/// - `src` is too short for an offset the formula produces (detected per
///   block, so `dst` may already be partially written).
pub fn detile_2d(
    src: &[u8],
    dst: &mut [u8],
    width: u32,
    height: u32,
    pitch_pixels: u32,
    bpp: u32,
) -> Result<(), ()> {
    // `trailing_zeros` only equals log2 for powers of two; anything else
    // (including 0, which would yield a shift of 32) would address the
    // source with the wrong block size.
    if !bpp.is_power_of_two() {
        return Err(());
    }
    let bpp_log2 = bpp.trailing_zeros();
    let block_len = bpp as usize;
    let pitch_aligned = align_pitch_to_macro_tile(pitch_pixels);

    // Size the destination in `usize` with checked arithmetic so large
    // surfaces cannot wrap, and validate it once instead of per pixel.
    let row_len = (width as usize).checked_mul(block_len).ok_or(())?;
    let needed = row_len.checked_mul(height as usize).ok_or(())?;
    if dst.len() < needed {
        return Err(());
    }
    if row_len == 0 {
        // Zero-width surface: nothing to copy (and `chunks_exact_mut`
        // rejects a chunk size of 0).
        return Ok(());
    }

    for (y, dst_row) in (0..height).zip(dst.chunks_exact_mut(row_len)) {
        for (x, dst_block) in (0..width).zip(dst_row.chunks_exact_mut(block_len)) {
            let src_off = tiled_2d_offset(x, y, pitch_aligned, bpp_log2) as usize;
            let src_block = src_off
                .checked_add(block_len)
                .and_then(|src_end| src.get(src_off..src_end))
                .ok_or(())?;
            dst_block.copy_from_slice(src_block);
        }
    }
    Ok(())
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Pixel (0, 0) maps to byte offset 0.
    #[test]
    fn origin_is_zero() {
        let offset = tiled_2d_offset(0, 0, 64, 2);
        assert_eq!(offset, 0);
    }

    /// Round-trip: scatter a known texel pattern into a tiled buffer using
    /// the address formula, detile it, and check that every texel shows up
    /// at its linear position.
    #[test]
    fn roundtrip_small_pattern() {
        let (w, h, bpp) = (32u32, 16u32, 4u32);
        let pitch = align_pitch_to_macro_tile(w);

        // Recognisable 4-byte texel for a logical position.
        let texel = |x: u32, y: u32| [x as u8, y as u8, (x ^ y) as u8, 0xFF];
        // Row-major walk over every (x, y) in the surface.
        let coords = move || (0..h).flat_map(move |y| (0..w).map(move |x| (x, y)));

        // The tiled buffer must reach the end of the farthest block.
        let tiled_len = coords()
            .map(|(x, y)| tiled_2d_offset(x, y, pitch, 2) as usize + 4)
            .max()
            .unwrap();
        let mut tiled = vec![0u8; tiled_len];
        for (x, y) in coords() {
            let off = tiled_2d_offset(x, y, pitch, 2) as usize;
            tiled[off..off + 4].copy_from_slice(&texel(x, y));
        }

        let mut linear = vec![0u8; (w * h * bpp) as usize];
        detile_2d(&tiled, &mut linear, w, h, pitch, bpp).expect("detile ok");

        for (x, y) in coords() {
            let lin = ((y * w + x) * bpp) as usize;
            assert_eq!(&linear[lin..lin + 4], texel(x, y).as_slice());
        }
    }

    /// Every (x, y) in a 32×16 region must map to its own offset — no two
    /// pixels may alias.
    #[test]
    fn neighbouring_pixels_have_distinct_offsets() {
        let mut visited = std::collections::HashSet::new();
        for (x, y) in (0..16u32).flat_map(|y| (0..32u32).map(move |x| (x, y))) {
            let offset = tiled_2d_offset(x, y, 32, 2);
            assert!(visited.insert(offset));
        }
    }

    /// Pitch alignment rounds up to a multiple of 32: 1280 stays 1280,
    /// 1281 becomes 1312, 31 becomes 32.
    #[test]
    fn align_pitch_rounds_up_to_32() {
        for (input, expected) in [(1280u32, 1280u32), (1281, 1312), (31, 32)] {
            assert_eq!(align_pitch_to_macro_tile(input), expected);
        }
    }
}