Files
xenia-rs/crates/xenia-gpu/src/edram.rs
MechaCat02 79eb52c378 xenia-gpu: end-to-end Xenos pipeline (PM4, ucode, EDRAM, resolve)
First real GPU implementation. Ring/PM4 frontend (ring_view,
ring_drain, pm4) drains the command processor; gpu_system owns the
threaded backend (DrainFence RPC + parker/fence helpers from M1) and
the MMIO-mapped register block (mmio_region).

Xenos shader frontend: ucode/{alu,control_flow,fetch,mod}.rs decode
the Xbox 360 microcode, translator.rs lowers it onto the WGSL
xenos_interp interpreter shader (shaders/xenos_interp.wgsl).
shader_metrics.rs counts decode/translate work.

Render state: draw_state, primitive, render_target_cache,
texture_cache, tiled_address (Xenos's swizzled tiled-memory layout),
xenos_constants (register field constants), edram (the 10 MiB EDRAM
model with MSAA), and resolve.rs (TILE_FLUSH copy-out — clear-resolve
plus bitwise-equivalent 32 bpp + 64 bpp paths landed). handle.rs
owns the typed GPU-resource handles the kernel hands out.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-01 16:29:38 +02:00

507 lines
18 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
//! CPU-side shadow of the Xenos GPU's 10 MiB EDRAM.
//!
//! The real console has 10 MiB of embedded DRAM organised as 2048 tiles,
//! each 80 × 16 samples (width × height) at 32 bits per sample
//! (`xenos.h:223-285`, `kEdramTileCount = 2048`). 64-bpp formats occupy two
//! adjacent 32-bpp EDRAM tiles per logical tile.
//!
//! xenia-rs does not currently render through a real EDRAM (host draws go
//! straight to wgpu attachments), but the resolve path still needs a
//! concrete byte source. We keep a linear 10 MiB `Vec<u8>` here so:
//!
//! * clear-resolves can paint `RB_COLOR_CLEAR` / `RB_DEPTH_CLEAR` into the
//! source tiles, which the resolve loop then copies into guest memory
//! (this is the Sylpheed-first-pixels path);
//! * future host→EDRAM readback code has a place to deposit pixels without
//! touching the resolve API.
//!
//! Byte layout inside one tile: row-major, `80 * 16 * (bpp / 8)` bytes. At
//! 32bpp, offset `= y * 80 * 4 + x * 4` from the tile base. Samples are stored
//! as little-endian u32 words; any Xenon big-endian vs little-endian shuffling
//! happens at the resolve write boundary, not inside EDRAM.
//!
//! Indexing wraps mod 2048 (`XE_GPU_REGISTER` `RB_COLOR_INFO.color_base` is
//! 11-bit). Canary relies on this wraparound for tall surfaces that
//! exceed the 10 MiB region.
/// Number of tiles in EDRAM. `xenos::kEdramTileCount`.
pub const EDRAM_TILE_COUNT: u32 = 2048;
/// Samples per tile along X. `xenos::kEdramTileWidthSamples`.
pub const EDRAM_TILE_WIDTH_SAMPLES: u32 = 80;
/// Samples per tile along Y. `xenos::kEdramTileHeightSamples`.
pub const EDRAM_TILE_HEIGHT_SAMPLES: u32 = 16;
/// Bytes per tile at 32bpp: 80 × 16 × 4 = 5120.
pub const EDRAM_TILE_BYTES_32BPP: u32 =
    EDRAM_TILE_WIDTH_SAMPLES * EDRAM_TILE_HEIGHT_SAMPLES * 4;
/// Bytes per tile at 64bpp: 80 × 16 × 8 = 10_240 (two adjacent 32bpp tiles).
pub const EDRAM_TILE_BYTES_64BPP: u32 = EDRAM_TILE_BYTES_32BPP * 2;
/// Total EDRAM size in bytes: 2048 × 5120 = 10_485_760 (exactly 10 MiB).
pub const EDRAM_SIZE_BYTES: usize = (EDRAM_TILE_COUNT * EDRAM_TILE_BYTES_32BPP) as usize;

/// Size in bytes of one 32bpp sample (one EDRAM word).
const WORD_BYTES: usize = 4;

/// 10 MiB shadow of the console's EDRAM. Owned by `GpuSystem` and lives for
/// the lifetime of the GPU; no per-frame allocation.
///
/// All sample accessors address the buffer through a `(base, pitch, x, y)`
/// tuple. The tile index always wraps mod [`EDRAM_TILE_COUNT`], and the
/// address arithmetic is carried out in 64 bits, so no combination of `u32`
/// arguments can overflow: out-of-range coordinates alias another tile
/// instead of panicking.
pub struct ShadowEdram {
    bytes: Vec<u8>,
}

impl Default for ShadowEdram {
    fn default() -> Self {
        Self::new()
    }
}

impl ShadowEdram {
    /// Creates a zero-filled shadow buffer of [`EDRAM_SIZE_BYTES`] bytes.
    pub fn new() -> Self {
        Self {
            bytes: vec![0u8; EDRAM_SIZE_BYTES],
        }
    }

    /// Raw byte offset of a tile within the shadow buffer, wrapped mod 2048.
    #[inline]
    fn tile_byte_offset(tile_index: u32) -> usize {
        ((tile_index % EDRAM_TILE_COUNT) * EDRAM_TILE_BYTES_32BPP) as usize
    }

    /// The whole shadow buffer as raw bytes.
    pub fn as_bytes(&self) -> &[u8] {
        &self.bytes
    }

    /// The [`EDRAM_TILE_BYTES_32BPP`] bytes of one 32bpp tile. `tile_index`
    /// wraps mod 2048.
    pub fn tile(&self, tile_index: u32) -> &[u8] {
        let off = Self::tile_byte_offset(tile_index);
        &self.bytes[off..off + EDRAM_TILE_BYTES_32BPP as usize]
    }

    /// Mutable counterpart of [`Self::tile`].
    pub fn tile_mut(&mut self, tile_index: u32) -> &mut [u8] {
        let off = Self::tile_byte_offset(tile_index);
        &mut self.bytes[off..off + EDRAM_TILE_BYTES_32BPP as usize]
    }

    /// Byte offset within the shadow buffer of the 32bpp word at `(x, y)` in
    /// a surface whose EDRAM origin is `base_tiles` and whose row pitch is
    /// `pitch_tiles` 32bpp tiles. Returns `None` for a zero pitch.
    ///
    /// Tile layout: a surface of pitch `P` tiles is laid out as a row of
    /// `P` tiles followed by the next 16-sample-tall row, etc. Word
    /// `(x, y)` lives in tile `(y/16)*P + (x/80)`, at row `y % 16` and
    /// column `x % 80` within that tile.
    ///
    /// Everything is computed in `u64` so that values derived from `u32`
    /// inputs (including doubled 64bpp coordinates and `x + dx` sums) never
    /// overflow; the tile index is then reduced mod 2048.
    #[inline]
    fn word_offset(base_tiles: u64, pitch_tiles: u64, x: u64, y: u64) -> Option<usize> {
        if pitch_tiles == 0 {
            return None;
        }
        let tile_w = u64::from(EDRAM_TILE_WIDTH_SAMPLES);
        let tile_h = u64::from(EDRAM_TILE_HEIGHT_SAMPLES);
        // Callers pass base < 2^18, pitch < 2^33, x < 2^35 and y < 2^33, so
        // the sum below stays far under 2^64.
        let tile = base_tiles + (y / tile_h) * pitch_tiles + x / tile_w;
        let tile = (tile % u64::from(EDRAM_TILE_COUNT)) as u32;
        let within = ((y % tile_h) * tile_w + x % tile_w) as usize * WORD_BYTES;
        Some(Self::tile_byte_offset(tile) + within)
    }

    /// Reads the little-endian word addressed by `(base, pitch, x, y)`;
    /// yields 0 when the address is degenerate (zero pitch).
    #[inline]
    fn read_word_at(&self, base: u64, pitch: u64, x: u64, y: u64) -> u32 {
        Self::word_offset(base, pitch, x, y)
            .and_then(|off| self.bytes.get(off..off + WORD_BYTES))
            .map_or(0, |w| u32::from_le_bytes([w[0], w[1], w[2], w[3]]))
    }

    /// Stores `word` little-endian at `(base, pitch, x, y)`; a degenerate
    /// address (zero pitch) is silently ignored.
    #[inline]
    fn write_word_at(&mut self, base: u64, pitch: u64, x: u64, y: u64, word: u32) {
        let Some(off) = Self::word_offset(base, pitch, x, y) else {
            return;
        };
        if let Some(dst) = self.bytes.get_mut(off..off + WORD_BYTES) {
            dst.copy_from_slice(&word.to_le_bytes());
        }
    }

    /// Fill a `(w × h)`-sample rectangle at `(x, y)` with a constant 32bpp
    /// pattern. Coordinates are in *sample space* (already scaled through
    /// `sample_count_log2_x/y` for MSAA). Wraps mod 2048 tiles via
    /// `tile_byte_offset`. A zero `pitch_tiles`, `w` or `h` makes this a
    /// no-op.
    ///
    /// The pattern is written as little-endian bytes — the endian swap in
    /// [`crate::resolve::apply_endian_128`] converts to the byte order
    /// expected by the destination.
    // The argument list mirrors the surface registers one-to-one.
    #[allow(clippy::too_many_arguments)]
    pub fn fill_rect_32bpp(
        &mut self,
        base_tiles: u16,
        pitch_tiles: u32,
        x: u32,
        y: u32,
        w: u32,
        h: u32,
        pattern: u32,
    ) {
        if w == 0 || h == 0 {
            return;
        }
        let (base, pitch) = (u64::from(base_tiles), u64::from(pitch_tiles));
        let (left, top) = (u64::from(x), u64::from(y));
        for row in top..top + u64::from(h) {
            for col in left..left + u64::from(w) {
                self.write_word_at(base, pitch, col, row, pattern);
            }
        }
    }

    /// Read one 32bpp sample at `(x, y)` in sample coordinates. Returns 0
    /// if the surface pitch is zero (degenerate; caller should skip the
    /// resolve).
    pub fn read_sample_32bpp(
        &self,
        base_tiles: u16,
        pitch_tiles: u32,
        x: u32,
        y: u32,
    ) -> u32 {
        self.read_word_at(
            u64::from(base_tiles),
            u64::from(pitch_tiles),
            u64::from(x),
            u64::from(y),
        )
    }

    /// Write one 32bpp sample at `(x, y)` in sample coordinates. Mirror of
    /// [`Self::read_sample_32bpp`]. Used by the wgpu→ShadowEdram readback
    /// retile path and unit tests. A zero pitch makes this a no-op.
    pub fn write_sample_32bpp(
        &mut self,
        base_tiles: u16,
        pitch_tiles: u32,
        x: u32,
        y: u32,
        sample: u32,
    ) {
        self.write_word_at(
            u64::from(base_tiles),
            u64::from(pitch_tiles),
            u64::from(x),
            u64::from(y),
            sample,
        );
    }

    /// Bulk write a `(w × h)`-sample rectangle at `(x, y)` from a row-major
    /// linear `samples` buffer. The buffer length must be at least `w * h`
    /// (checked with a `debug_assert!`; in release a short buffer writes
    /// only the samples it contains); extra entries are ignored. Order:
    /// `samples[dy * w + dx]` lands at (x + dx, y + dy). This is the format
    /// the wgpu→ShadowEdram readback path uses after stripping wgpu's
    /// 256-byte row alignment.
    // The argument list mirrors the surface registers one-to-one.
    #[allow(clippy::too_many_arguments)]
    pub fn write_rect_32bpp(
        &mut self,
        base_tiles: u16,
        pitch_tiles: u32,
        x: u32,
        y: u32,
        w: u32,
        h: u32,
        samples: &[u32],
    ) {
        if w == 0 || h == 0 {
            return;
        }
        let needed = (w as usize).saturating_mul(h as usize);
        debug_assert!(samples.len() >= needed, "write_rect_32bpp: samples too short");
        let (base, pitch) = (u64::from(base_tiles), u64::from(pitch_tiles));
        // One chunk per destination row; `take(h)` drops surplus entries.
        let rows = samples.chunks(w as usize).take(h as usize);
        for (row, dst_y) in rows.zip(u64::from(y)..) {
            for (&sample, dst_x) in row.iter().zip(u64::from(x)..) {
                self.write_word_at(base, pitch, dst_x, dst_y, sample);
            }
        }
    }

    // --- 64bpp helpers ----------------------------------------------------
    //
    // 64bpp formats (`k_16_16_16_16`, `k_16_16_16_16_FLOAT`, `k_32_32_FLOAT`)
    // occupy two adjacent EDRAM tiles per logical tile, doubling the row
    // pitch in tiles. Per Canary `xenos.h:321-325 IsColorRenderTargetFormat64bpp`
    // and `draw_util.cc:1260-1262` (`pitch_tiles = surface_pitch_tiles << is_64bpp`).
    //
    // Convention: callers pass the *32bpp-equivalent* `base_tiles` and
    // `pitch_tiles_32bpp` (i.e. the `RB_COLOR_INFO.color_base` and
    // `surface_pitch_tiles` decoded from registers). The 64bpp helpers
    // multiply both by 2 internally so the lo/hi pair lands in adjacent
    // tiles. `lo` is the lower-addressed 32bpp word; `hi` is the upper.

    /// Doubled `(base, pitch)` used to address a 64bpp surface in 32bpp
    /// words.
    #[inline]
    fn doubled_for_64bpp(base_tiles: u16, pitch_tiles_32bpp: u32) -> (u64, u64) {
        (u64::from(base_tiles) * 2, u64::from(pitch_tiles_32bpp) * 2)
    }

    /// Stores the `(lo, hi)` pair of the 64bpp sample at logical `(x, y)`:
    /// `lo` at word column `2x`, `hi` at `2x + 1`. `base`/`pitch` are the
    /// already-doubled values.
    #[inline]
    fn write_pair_at(&mut self, base: u64, pitch: u64, x: u64, y: u64, lo: u32, hi: u32) {
        self.write_word_at(base, pitch, x * 2, y, lo);
        self.write_word_at(base, pitch, x * 2 + 1, y, hi);
    }

    /// Read one 64bpp sample as `(lo, hi)` u32 pair. Doubled-tile addressing
    /// per Canary's `is_64bpp` convention. Returns `(0, 0)` for a zero pitch.
    pub fn read_sample_64bpp(
        &self,
        base_tiles: u16,
        pitch_tiles_32bpp: u32,
        x: u32,
        y: u32,
    ) -> (u32, u32) {
        let (base, pitch) = Self::doubled_for_64bpp(base_tiles, pitch_tiles_32bpp);
        let word_x = u64::from(x) * 2;
        let row = u64::from(y);
        (
            self.read_word_at(base, pitch, word_x, row),
            self.read_word_at(base, pitch, word_x + 1, row),
        )
    }

    /// Write one 64bpp sample as `(lo, hi)` u32 pair.
    pub fn write_sample_64bpp(
        &mut self,
        base_tiles: u16,
        pitch_tiles_32bpp: u32,
        x: u32,
        y: u32,
        lo: u32,
        hi: u32,
    ) {
        let (base, pitch) = Self::doubled_for_64bpp(base_tiles, pitch_tiles_32bpp);
        self.write_pair_at(base, pitch, u64::from(x), u64::from(y), lo, hi);
    }

    /// Bulk write a 64bpp rectangle from a row-major `(lo, hi)` linear
    /// buffer. `samples[dy * w + dx]` lands at (x + dx, y + dy); a buffer
    /// shorter than `w * h` writes only the samples it contains, and extra
    /// entries are ignored.
    // The argument list mirrors the surface registers one-to-one.
    #[allow(clippy::too_many_arguments)]
    pub fn write_rect_64bpp(
        &mut self,
        base_tiles: u16,
        pitch_tiles_32bpp: u32,
        x: u32,
        y: u32,
        w: u32,
        h: u32,
        samples: &[(u32, u32)],
    ) {
        if w == 0 || h == 0 {
            return;
        }
        let (base, pitch) = Self::doubled_for_64bpp(base_tiles, pitch_tiles_32bpp);
        // One chunk per destination row; `take(h)` drops surplus entries.
        let rows = samples.chunks(w as usize).take(h as usize);
        for (row, dst_y) in rows.zip(u64::from(y)..) {
            for (&(lo, hi), dst_x) in row.iter().zip(u64::from(x)..) {
                self.write_pair_at(base, pitch, dst_x, dst_y, lo, hi);
            }
        }
    }

    /// Fill a `(w × h)`-sample rectangle with a constant 64bpp pattern.
    /// `lo` lands at the low-addressed 32bpp word, `hi` at the high one
    /// — i.e. for clears, callers pass `(lo = RB_COLOR_CLEAR_LO,
    /// hi = RB_COLOR_CLEAR)` per Canary `draw_util.cc:1302-1303`.
    // The argument list mirrors the surface registers one-to-one.
    #[allow(clippy::too_many_arguments)]
    pub fn fill_rect_64bpp(
        &mut self,
        base_tiles: u16,
        pitch_tiles_32bpp: u32,
        x: u32,
        y: u32,
        w: u32,
        h: u32,
        lo: u32,
        hi: u32,
    ) {
        if w == 0 || h == 0 {
            return;
        }
        let (base, pitch) = Self::doubled_for_64bpp(base_tiles, pitch_tiles_32bpp);
        let (left, top) = (u64::from(x), u64::from(y));
        for row in top..top + u64::from(h) {
            for col in left..left + u64::from(w) {
                self.write_pair_at(base, pitch, col, row, lo, hi);
            }
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that every 32bpp word of tile `index` equals `word` (stored
    /// little-endian).
    fn assert_tile_filled(edram: &ShadowEdram, index: u32, word: u32) {
        let want = word.to_le_bytes();
        for got in edram.tile(index).chunks_exact(4) {
            assert_eq!(got, want);
        }
    }

    /// Asserts that tile `index` still holds only zero bytes.
    fn assert_tile_untouched(edram: &ShadowEdram, index: u32) {
        assert!(edram.tile(index).iter().all(|&b| b == 0));
    }

    /// Every `(x, y)` pair of a `w × h` grid, x-major.
    fn grid(w: u32, h: u32) -> impl Iterator<Item = (u32, u32)> {
        (0..w).flat_map(move |x| (0..h).map(move |y| (x, y)))
    }

    #[test]
    fn shadow_edram_is_exactly_10_mib() {
        const TEN_MIB: usize = 10 * 1024 * 1024;
        assert_eq!(EDRAM_SIZE_BYTES, TEN_MIB);
        assert_eq!(ShadowEdram::new().as_bytes().len(), TEN_MIB);
    }

    #[test]
    fn fill_rect_writes_the_whole_first_tile() {
        let mut edram = ShadowEdram::new();
        edram.fill_rect_32bpp(0, 1, 0, 0, 80, 16, 0x11223344);
        // A full 80x16 fill must reach every word of tile 0.
        assert_tile_filled(&edram, 0, 0x11223344);
    }

    #[test]
    fn fill_rect_respects_pitch_and_base() {
        let mut edram = ShadowEdram::new();
        // base=5, pitch=2: a 160x16 fill covers exactly tiles 5 and 6 and
        // must not spill into tile 4, tile 7 or tile 0.
        edram.fill_rect_32bpp(5, 2, 0, 0, 160, 16, 0xAABBCCDD);
        for covered in [5, 6] {
            assert_tile_filled(&edram, covered, 0xAABBCCDD);
        }
        for outside in [4, 7, 0] {
            assert_tile_untouched(&edram, outside);
        }
    }

    #[test]
    fn fill_rect_wraps_mod_2048() {
        let mut edram = ShadowEdram::new();
        // base=2047, pitch=2: the second tile of the row wraps round to 0.
        edram.fill_rect_32bpp(2047, 2, 0, 0, 160, 16, 0xDEAD_BEEF);
        assert_tile_filled(&edram, 2047, 0xDEAD_BEEF);
        assert_tile_filled(&edram, 0, 0xDEAD_BEEF);
    }

    #[test]
    fn read_sample_roundtrips_fill_rect() {
        let mut edram = ShadowEdram::new();
        edram.fill_rect_32bpp(3, 1, 0, 0, 80, 16, 0xCAFE_F00D);
        // Opposite corners of the filled tile.
        for (x, y) in [(0, 0), (79, 15)] {
            assert_eq!(edram.read_sample_32bpp(3, 1, x, y), 0xCAFE_F00D);
        }
        // The neighbouring tile was never written.
        assert_eq!(edram.read_sample_32bpp(4, 1, 0, 0), 0);
    }

    #[test]
    fn zero_pitch_is_a_noop_read() {
        assert_eq!(ShadowEdram::new().read_sample_32bpp(0, 0, 10, 10), 0);
    }

    /// Values stored by `write_sample_32bpp` come back unchanged from
    /// `read_sample_32bpp`.
    #[test]
    fn write_sample_32bpp_round_trips() {
        let mut edram = ShadowEdram::new();
        let tag = |x: u32, y: u32| 0xABCD_0000 | (y << 8) | x;
        for (x, y) in grid(80, 16) {
            edram.write_sample_32bpp(0, 1, x, y, tag(x, y));
        }
        for (x, y) in grid(80, 16) {
            assert_eq!(
                edram.read_sample_32bpp(0, 1, x, y),
                tag(x, y),
                "round-trip mismatch at ({x},{y})"
            );
        }
    }

    /// `write_rect_32bpp` places row-major samples at the right sample
    /// offsets even when the rectangle straddles two tiles.
    #[test]
    fn write_rect_32bpp_crosses_tile_boundary() {
        let mut edram = ShadowEdram::new();
        // With pitch = 2 tiles the surface spans x in [0, 160). A 100x4
        // rectangle placed at (40, 4) runs across the tile edge at x=80.
        let (w, h) = (100u32, 4u32);
        let samples: Vec<u32> = (0..h)
            .flat_map(|dy| (0..w).map(move |dx| 0x10000 | (dy << 8) | dx))
            .collect();
        edram.write_rect_32bpp(0, 2, 40, 4, w, h, &samples);
        // First sample sits in tile 0 at (40, 4); the last one, (139, 7),
        // sits in tile 1.
        assert_eq!(edram.read_sample_32bpp(0, 2, 40, 4), 0x1_0000);
        assert_eq!(
            edram.read_sample_32bpp(0, 2, 139, 7),
            0x10000 | (3 << 8) | 99
        );
    }

    /// `write_sample_64bpp` / `read_sample_64bpp` round-trip: the doubled
    /// pitch keeps each lo/hi pair in adjacent 32bpp words.
    #[test]
    fn write_read_sample_64bpp_roundtrips() {
        let mut edram = ShadowEdram::new();
        // 32bpp pitch=1, base=0 becomes 64bpp pitch=2, base=0. A small
        // 16x4 block of logical 64bpp samples is enough to exercise it.
        for (x, y) in grid(16, 4) {
            edram.write_sample_64bpp(0, 1, x, y, 0xAAAA_0000 | x, 0xBBBB_0000 | y);
        }
        for (x, y) in grid(16, 4) {
            let (lo, hi) = edram.read_sample_64bpp(0, 1, x, y);
            assert_eq!(lo, 0xAAAA_0000 | x);
            assert_eq!(hi, 0xBBBB_0000 | y);
        }
    }

    /// `fill_rect_64bpp` stores both clear words for every sample,
    /// following the `RB_COLOR_CLEAR_LO`/`RB_COLOR_CLEAR` convention.
    #[test]
    fn fill_rect_64bpp_writes_both_words() {
        let mut edram = ShadowEdram::new();
        // 16x4 logical 64bpp samples on a surface of 32bpp pitch 1.
        edram.fill_rect_64bpp(0, 1, 0, 0, 16, 4, 0xCAFE_F00D, 0xDEAD_BEEF);
        for (x, y) in grid(16, 4) {
            let (lo, hi) = edram.read_sample_64bpp(0, 1, x, y);
            assert_eq!(lo, 0xCAFE_F00D);
            assert_eq!(hi, 0xDEAD_BEEF);
        }
    }

    /// The 64bpp helpers address with the doubled tile pitch: the two
    /// halves of a logical 64bpp sample occupy adjacent 32bpp samples.
    #[test]
    fn sixty_four_bpp_uses_doubled_pitch() {
        let mut edram = ShadowEdram::new();
        edram.write_sample_64bpp(0, 1, 5, 0, 0x1111_1111, 0x2222_2222);
        // Seen as a 32bpp surface with base=0, pitch=2: lo is at
        // x = 10 (5 << 1) and hi at x = 11.
        let halves = [(10, 0x1111_1111), (11, 0x2222_2222)];
        for (x, want) in halves {
            assert_eq!(edram.read_sample_32bpp(0, 2, x, 0), want);
        }
    }

    /// Rectangle writes and fills with a zero dimension leave EDRAM alone.
    #[test]
    fn write_rect_empty_is_noop() {
        let mut edram = ShadowEdram::new();
        edram.write_rect_32bpp(0, 1, 0, 0, 0, 5, &[1, 2, 3]);
        edram.write_rect_32bpp(0, 1, 0, 0, 5, 0, &[1, 2, 3]);
        edram.fill_rect_64bpp(0, 1, 0, 0, 0, 5, 1, 2);
        edram.fill_rect_64bpp(0, 1, 0, 0, 5, 0, 1, 2);
        // The buffer must still be all zeroes.
        assert!(edram.as_bytes().iter().all(|&b| b == 0));
    }
}