First real GPU implementation. Ring/PM4 frontend (ring_view,
ring_drain, pm4) drains the command processor; gpu_system owns the
threaded backend (DrainFence RPC + parker/fence helpers from M1) and
the MMIO-mapped register block (mmio_region).
Xenos shader frontend: ucode/{alu,control_flow,fetch,mod}.rs decode
the Xbox 360 microcode, translator.rs lowers it onto the WGSL
xenos_interp interpreter shader (shaders/xenos_interp.wgsl).
shader_metrics.rs counts decode/translate work.
Render state: draw_state, primitive, render_target_cache,
texture_cache, tiled_address (Xenos's swizzled tiled-memory layout),
xenos_constants (register field constants), edram (the 10 MiB EDRAM
model with MSAA), and resolve.rs (TILE_FLUSH copy-out: the clear-resolve
path plus bitwise-equivalent 32 bpp and 64 bpp paths have landed). handle.rs
owns the typed GPU-resource handles the kernel hands out.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
179 lines
6.8 KiB
Rust
179 lines
6.8 KiB
Rust
//! Xenos tiled-texture address formula (2D, Tiled2D layout).
//!
//! Port of the `Tiled2D` / `TiledCombine` helpers from
//! `xenia-canary/src/xenia/gpu/texture_address.h:190-208`. The Xbox 360 GPU
//! stores textures in a 32×32-block macro-tile pattern with bank+pipe
//! interleave for its internal DRAM banks; this formula inverts that so we
//! can read pixels out in linear order, given the tiled source buffer.
//!
//! We use this in two places during P4:
//! - `vd_swap` frontbuffer detile (usually 1280×720 `k_8_8_8_8`).
//! - Any place we need to read tiled guest memory into a host-linear
//!   buffer for CPU-side conversion before upload.

// Tile size constants from canary.

/// log2 of the macro-tile width: `1 << 5` = 32 px.
pub const MACRO_TILE_WIDTH_LOG2: u32 = 5;

/// log2 of the macro-tile height for the 2D layout: `1 << 5` = 32 px.
pub const MACRO_TILE_HEIGHT_2D_LOG2: u32 = 5;

/// Canary's `TiledCombine` helper — reassembles the DRAM address from the
/// outer-tile byte offset plus the bank/pipe/y-LSB interleave bits.
///
/// Resulting bit layout (LSB first):
///
/// | result bits | source                        |
/// |-------------|-------------------------------|
/// | 0..=3       | `outer_inner_bytes` bits 0..=3 |
/// | 4           | `y_lsb`                       |
/// | 5           | `outer_inner_bytes` bit 4     |
/// | 6..=7       | `pipe`                        |
/// | 8..=10      | `outer_inner_bytes` bits 5..=7 |
/// | 11          | `bank`                        |
/// | 12..        | `outer_inner_bytes` bits 8..  |
///
/// `bank`, `pipe` and `y_lsb` are not masked here; the caller is expected
/// to pass values that already fit in 1, 2 and 1 bits respectively,
/// otherwise they bleed into neighbouring fields.
#[inline]
fn tiled_combine(outer_inner_bytes: u32, bank: u32, pipe: u32, y_lsb: u32) -> u32 {
    (y_lsb << 4)
        | (pipe << 6)
        | (bank << 11)
        | (outer_inner_bytes & 0b1111)
        | (((outer_inner_bytes >> 4) & 0b1) << 5)
        | (((outer_inner_bytes >> 5) & 0b111) << 8)
        | ((outer_inner_bytes >> 8) << 12)
}

/// 2D tiled offset in bytes from (x, y) into a tiled surface with
|
||
/// `pitch_aligned` pixel pitch (rounded to the macro-tile width) and
|
||
/// `bytes_per_block_log2` bytes-per-element (e.g. 2 for RGBA8888 → 4-byte
|
||
/// blocks → log2 = 2). Always non-negative for in-bounds inputs; returns
|
||
/// `u32` rather than canary's signed `int` since our callers stay in
|
||
/// unsigned arithmetic.
|
||
///
|
||
/// This is the canonical formula — do not simplify without re-reading
|
||
/// `texture_address.h:190-208`; the bit-interleave cannot be expressed
|
||
/// as a linear function.
|
||
pub fn tiled_2d_offset(x: u32, y: u32, pitch_aligned: u32, bytes_per_block_log2: u32) -> u32 {
|
||
let macro_tile_cols = pitch_aligned >> MACRO_TILE_WIDTH_LOG2;
|
||
// Outer: which 32×32 macro tile we're in.
|
||
let outer_blocks = (((y >> MACRO_TILE_HEIGHT_2D_LOG2) * macro_tile_cols)
|
||
+ (x >> MACRO_TILE_WIDTH_LOG2))
|
||
<< 6;
|
||
// Inner: where we are within the 32×32 macro tile (Y dropped by 1 bit
|
||
// because that bit becomes the `y_lsb` interleave bit below).
|
||
let inner_blocks = (((y >> 1) & 0b111) << 3) | (x & 0b111);
|
||
let outer_inner_bytes = (outer_blocks | inner_blocks) << bytes_per_block_log2;
|
||
|
||
let bank = (y >> 4) & 0b1;
|
||
let pipe = ((x >> 3) & 0b11) ^ (((y >> 3) & 0b1) << 1);
|
||
let y_lsb = y & 1;
|
||
|
||
tiled_combine(outer_inner_bytes, bank, pipe, y_lsb)
|
||
}
|
||
|
||
/// Round `pitch_pixels` up to the nearest multiple of the macro-tile width
|
||
/// (32 px). Xenos requires tile-aligned pitches; non-aligned values pad.
|
||
#[inline]
|
||
pub fn align_pitch_to_macro_tile(pitch_pixels: u32) -> u32 {
|
||
let mask = (1u32 << MACRO_TILE_WIDTH_LOG2) - 1;
|
||
(pitch_pixels + mask) & !mask
|
||
}
|
||
|
||
/// Detile a 2D tiled surface into a linear destination buffer. The
|
||
/// closure `block_bytes(src_offset) -> &[u8]` returns a slice pointing at
|
||
/// one block in the tiled source, and the detiler writes it into `dst`
|
||
/// at the linear (x, y) position.
|
||
///
|
||
/// `bpp` is the bytes-per-block (4 for RGBA8888, 1 for a1r5g5b5 stored as
|
||
/// a single 16-bit block, etc.). `dst` must be at least
|
||
/// `width * height * bpp` bytes long.
|
||
///
|
||
/// Returns `Err(())` if the source doesn't contain enough bytes for the
|
||
/// largest offset the formula would produce (defensive — callers can
|
||
/// downgrade silently).
|
||
pub fn detile_2d(
|
||
src: &[u8],
|
||
dst: &mut [u8],
|
||
width: u32,
|
||
height: u32,
|
||
pitch_pixels: u32,
|
||
bpp: u32,
|
||
) -> Result<(), ()> {
|
||
let bpp_log2 = bpp.trailing_zeros();
|
||
let pitch_aligned = align_pitch_to_macro_tile(pitch_pixels);
|
||
let dst_pitch_bytes = (width * bpp) as usize;
|
||
let bpp_u = bpp as usize;
|
||
|
||
for y in 0..height {
|
||
for x in 0..width {
|
||
let src_off = tiled_2d_offset(x, y, pitch_aligned, bpp_log2) as usize;
|
||
if src_off + bpp_u > src.len() {
|
||
return Err(());
|
||
}
|
||
let dst_off = (y as usize) * dst_pitch_bytes + (x as usize) * bpp_u;
|
||
if dst_off + bpp_u > dst.len() {
|
||
return Err(());
|
||
}
|
||
dst[dst_off..dst_off + bpp_u].copy_from_slice(&src[src_off..src_off + bpp_u]);
|
||
}
|
||
}
|
||
Ok(())
|
||
}
|
||
|
||
#[cfg(test)]
mod tests {
    use super::*;

    /// The (0, 0) pixel is always at byte offset 0 regardless of pitch.
    #[test]
    fn origin_is_zero() {
        assert_eq!(tiled_2d_offset(0, 0, 64, 2), 0);
    }

    /// Round-trip: detiling a tiled buffer that was filled using the same
    /// formula produces the identity linear image.
    #[test]
    fn roundtrip_small_pattern() {
        let w = 32u32;
        let h = 16u32;
        let bpp = 4u32;
        let bpp_log2 = 2u32;
        let block = bpp as usize;
        let pitch = align_pitch_to_macro_tile(w);

        // The recognisable 4-byte pattern stored at logical (x, y).
        let pattern = |x: u32, y: u32| [x as u8, y as u8, (x ^ y) as u8, 0xFF];

        // Allocate a tiled buffer large enough for the largest offset.
        let tiled_len = (0..h)
            .flat_map(|y| {
                (0..w).map(move |x| tiled_2d_offset(x, y, pitch, bpp_log2) as usize + block)
            })
            .max()
            .unwrap();
        let mut tiled = vec![0u8; tiled_len];

        // Write the pattern into each logical (x, y) position in the
        // tiled buffer.
        for y in 0..h {
            for x in 0..w {
                let off = tiled_2d_offset(x, y, pitch, bpp_log2) as usize;
                tiled[off..off + block].copy_from_slice(&pattern(x, y));
            }
        }

        let mut linear = vec![0u8; (w * h * bpp) as usize];
        detile_2d(&tiled, &mut linear, w, h, pitch, bpp).expect("detile ok");

        // Verify every logical pixel landed at the right linear offset.
        for y in 0..h {
            for x in 0..w {
                let lin = ((y * w + x) * bpp) as usize;
                assert_eq!(
                    linear[lin..lin + block],
                    pattern(x, y),
                    "pixel ({}, {})",
                    x,
                    y
                );
            }
        }
    }

    /// Within a single macro-tile row, stepping `x` by 1 changes the low
    /// 3 bits of `x` which feed the `inner_blocks` field — different
    /// offsets are expected (no aliasing).
    #[test]
    fn neighbouring_pixels_have_distinct_offsets() {
        let mut seen = std::collections::HashSet::new();
        for y in 0..16 {
            for x in 0..32 {
                assert!(
                    seen.insert(tiled_2d_offset(x, y, 32, 2)),
                    "offset for ({}, {}) aliases an earlier pixel",
                    x,
                    y
                );
            }
        }
    }

    /// Pitch alignment is a power-of-two rounding: 1280 stays 1280, 1281
    /// rounds to 1312.
    #[test]
    fn align_pitch_rounds_up_to_32() {
        assert_eq!(align_pitch_to_macro_tile(1280), 1280);
        assert_eq!(align_pitch_to_macro_tile(1281), 1312);
        assert_eq!(align_pitch_to_macro_tile(31), 32);
    }
}