xenia-gpu: end-to-end Xenos pipeline (PM4, ucode, EDRAM, resolve)
First real GPU implementation. Ring/PM4 frontend (ring_view,
ring_drain, pm4) drains the command processor; gpu_system owns the
threaded backend (DrainFence RPC + parker/fence helpers from M1) and
the MMIO-mapped register block (mmio_region).
Xenos shader frontend: ucode/{alu,control_flow,fetch,mod}.rs decode
the Xbox 360 microcode, translator.rs lowers it onto the WGSL
xenos_interp interpreter shader (shaders/xenos_interp.wgsl).
shader_metrics.rs counts decode/translate work.
Render state: draw_state, primitive, render_target_cache,
texture_cache, tiled_address (Xenos's swizzled tiled-memory layout),
xenos_constants (register field constants), edram (the 10 MiB EDRAM
model with MSAA), and resolve.rs (TILE_FLUSH copy-out — clear-resolve
plus bitwise-equivalent 32 bpp + 64 bpp paths landed). handle.rs
owns the typed GPU-resource handles the kernel hands out.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -11,3 +11,11 @@ tracing = { workspace = true }
|
|||||||
thiserror = { workspace = true }
|
thiserror = { workspace = true }
|
||||||
anyhow = { workspace = true }
|
anyhow = { workspace = true }
|
||||||
byteorder = { workspace = true }
|
byteorder = { workspace = true }
|
||||||
|
metrics = { workspace = true }
|
||||||
|
bytemuck = { workspace = true }
|
||||||
|
crossbeam-channel = { workspace = true }
|
||||||
|
|
||||||
|
[dev-dependencies]
|
||||||
|
# Used to validate bundled WGSL placeholders compile cleanly. Matches the
|
||||||
|
# wgpu-22 transitive dep so we don't pull in a second naga version.
|
||||||
|
naga = { version = "22", features = ["wgsl-in"] }
|
||||||
|
|||||||
1113
crates/xenia-gpu/src/draw_state.rs
Normal file
1113
crates/xenia-gpu/src/draw_state.rs
Normal file
File diff suppressed because it is too large
Load Diff
506
crates/xenia-gpu/src/edram.rs
Normal file
506
crates/xenia-gpu/src/edram.rs
Normal file
@@ -0,0 +1,506 @@
|
|||||||
|
//! CPU-side shadow of the Xenos GPU's 10 MiB EDRAM.
|
||||||
|
//!
|
||||||
|
//! The real console has 10 MiB of embedded DRAM organised as 2048 tiles,
|
||||||
|
//! each 80 × 16 samples wide at 32 bits per sample (`xenos.h:223-285`,
|
||||||
|
//! `kEdramTileCount = 2048`). 64-bpp formats pack two adjacent EDRAM tiles
|
||||||
|
//! per color value.
|
||||||
|
//!
|
||||||
|
//! xenia-rs does not currently render through a real EDRAM (host draws go
|
||||||
|
//! straight to wgpu attachments), but the resolve path still needs a
|
||||||
|
//! concrete byte source. We keep a linear 10 MiB `Vec<u8>` here so:
|
||||||
|
//!
|
||||||
|
//! * clear-resolves can paint `RB_COLOR_CLEAR` / `RB_DEPTH_CLEAR` into the
|
||||||
|
//! source tiles, which the resolve loop then copies into guest memory
|
||||||
|
//! (this is the Sylpheed-first-pixels path);
|
||||||
|
//! * future host→EDRAM readback code has a place to deposit pixels without
|
||||||
|
//! touching the resolve API.
|
||||||
|
//!
|
||||||
|
//! Byte layout inside one tile: row-major, `80 * 16 * bpp` bytes. At 32bpp,
|
||||||
|
//! offset `= y * 80 * 4 + x * 4` from the tile base. Samples are stored in
|
||||||
|
//! native-u32 byte order; any Xenon big-endian vs little-endian shuffling
|
||||||
|
//! happens at the resolve write boundary, not inside EDRAM.
|
||||||
|
//!
|
||||||
|
//! Indexing wraps mod 2048 (`XE_GPU_REGISTER` `RB_COLOR_INFO.color_base` is
|
||||||
|
//! 11-bit). Canary relies on this wraparound for tall surfaces that
|
||||||
|
//! exceed the 10 MiB region.
|
||||||
|
|
||||||
|
/// Number of tiles in EDRAM. `xenos::kEdramTileCount`.
pub const EDRAM_TILE_COUNT: u32 = 2048;

/// Samples per tile along X. `xenos::kEdramTileWidthSamples`.
pub const EDRAM_TILE_WIDTH_SAMPLES: u32 = 80;

/// Samples per tile along Y. `xenos::kEdramTileHeightSamples`.
pub const EDRAM_TILE_HEIGHT_SAMPLES: u32 = 16;

/// Bytes per tile at 32bpp: 80 × 16 × 4 = 5120.
pub const EDRAM_TILE_BYTES_32BPP: u32 =
    EDRAM_TILE_WIDTH_SAMPLES * EDRAM_TILE_HEIGHT_SAMPLES * 4;

/// Bytes per tile at 64bpp: 80 × 16 × 8 = 10_240 (two adjacent 32bpp tiles).
pub const EDRAM_TILE_BYTES_64BPP: u32 = EDRAM_TILE_BYTES_32BPP * 2;

/// Total EDRAM size in bytes: 2048 × 5120 = 10_485_760 (exactly 10 MiB).
pub const EDRAM_SIZE_BYTES: usize = (EDRAM_TILE_COUNT * EDRAM_TILE_BYTES_32BPP) as usize;

/// 10 MiB shadow of the console's EDRAM. Owned by `GpuSystem` and lives for
/// the lifetime of the GPU; no per-frame allocation.
pub struct ShadowEdram {
    /// Linear backing store, exactly `EDRAM_SIZE_BYTES` long. Tile `n`
    /// starts at byte `n * EDRAM_TILE_BYTES_32BPP`.
    bytes: Vec<u8>,
}

impl Default for ShadowEdram {
    fn default() -> Self {
        Self::new()
    }
}

impl ShadowEdram {
    /// Allocate a zero-filled shadow of the full EDRAM.
    pub fn new() -> Self {
        Self {
            bytes: vec![0u8; EDRAM_SIZE_BYTES],
        }
    }

    /// Raw byte offset of a tile within the shadow buffer, wrapped mod 2048.
    #[inline]
    fn tile_byte_offset(tile_index: u32) -> usize {
        ((tile_index % EDRAM_TILE_COUNT) * EDRAM_TILE_BYTES_32BPP) as usize
    }

    /// The whole shadow buffer as a byte slice.
    pub fn as_bytes(&self) -> &[u8] {
        &self.bytes
    }

    /// Bytes of one 32bpp tile (`EDRAM_TILE_BYTES_32BPP` long). The index
    /// wraps mod 2048.
    pub fn tile(&self, tile_index: u32) -> &[u8] {
        let off = Self::tile_byte_offset(tile_index);
        &self.bytes[off..off + EDRAM_TILE_BYTES_32BPP as usize]
    }

    /// Mutable counterpart of [`Self::tile`].
    pub fn tile_mut(&mut self, tile_index: u32) -> &mut [u8] {
        let off = Self::tile_byte_offset(tile_index);
        &mut self.bytes[off..off + EDRAM_TILE_BYTES_32BPP as usize]
    }

    /// Sample-space byte offset within the shadow buffer for one 32bpp
    /// sample at `(x, y)` in a surface whose EDRAM origin is `base_tiles`
    /// and whose row pitch is `pitch_tiles` 32bpp tiles. Returns `None`
    /// for a zero pitch.
    ///
    /// Tile layout: a surface of pitch `P` tiles is laid out as a row of
    /// `P` tiles followed by the next 16-sample-tall row, etc. Sample
    /// `(x, y)` lives in tile `(y/16)*P + (x/80)`, at row `y % 16` and
    /// column `x % 80` within that tile.
    ///
    /// The tile index is computed with wrapping arithmetic. Because 2^32
    /// is a multiple of 2048, wrapping in `u32` never changes the result
    /// of the final mod-2048 reduction, and it avoids an overflow panic
    /// in debug builds for very large `y` / `pitch_tiles`.
    #[inline]
    fn sample_offset_32bpp(base_tiles: u16, pitch_tiles: u32, x: u32, y: u32) -> Option<usize> {
        if pitch_tiles == 0 {
            return None;
        }
        let tile_row = y / EDRAM_TILE_HEIGHT_SAMPLES;
        let tile_col = x / EDRAM_TILE_WIDTH_SAMPLES;
        let within_y = y % EDRAM_TILE_HEIGHT_SAMPLES;
        let within_x = x % EDRAM_TILE_WIDTH_SAMPLES;
        let tile_index = u32::from(base_tiles)
            .wrapping_add(tile_row.wrapping_mul(pitch_tiles))
            .wrapping_add(tile_col);
        // Always < EDRAM_TILE_BYTES_32BPP, so this cannot overflow.
        let within_tile = (within_y * EDRAM_TILE_WIDTH_SAMPLES + within_x) * 4;
        Some(Self::tile_byte_offset(tile_index) + within_tile as usize)
    }

    /// Translate 64bpp addressing into the doubled 32bpp addressing used
    /// by the 32bpp helpers. Returns `(base, pitch, x_lo)` where `x_lo`
    /// is the 32bpp column of the low word (the high word is `x_lo + 1`,
    /// which cannot overflow because `x_lo` is even). Returns `None` when
    /// `2 * x` does not fit in `u32`.
    #[inline]
    fn expand_64bpp(base_tiles: u16, pitch_tiles_32bpp: u32, x: u32) -> Option<(u16, u32, u32)> {
        let x_lo = x.checked_mul(2)?;
        // Truncating the doubled base to u16 is harmless: 65536 is a
        // multiple of 2048, so the tile index mod 2048 is unaffected.
        Some((
            base_tiles.wrapping_mul(2),
            pitch_tiles_32bpp.saturating_mul(2),
            x_lo,
        ))
    }

    /// Fill a `(w × h)`-sample rectangle at `(x, y)` with a constant 32bpp
    /// pattern. Coordinates are in *sample space* (already scaled through
    /// `sample_count_log2_x/y` for MSAA). Wraps mod 2048 tiles via
    /// `tile_byte_offset`. A zero pitch or empty rectangle is a no-op;
    /// samples whose coordinate would exceed `u32::MAX` are skipped.
    ///
    /// The pattern is written as host-native little-endian bytes — the
    /// endian swap in [`crate::resolve::apply_endian_128`] converts to the
    /// byte order expected by the destination.
    #[allow(clippy::too_many_arguments)]
    pub fn fill_rect_32bpp(
        &mut self,
        base_tiles: u16,
        pitch_tiles: u32,
        x: u32,
        y: u32,
        w: u32,
        h: u32,
        pattern: u32,
    ) {
        if w == 0 || h == 0 || pitch_tiles == 0 {
            return;
        }
        for dy in 0..h {
            let sy = match y.checked_add(dy) {
                Some(v) => v,
                None => break,
            };
            for dx in 0..w {
                let sx = match x.checked_add(dx) {
                    Some(v) => v,
                    None => break,
                };
                self.write_sample_32bpp(base_tiles, pitch_tiles, sx, sy, pattern);
            }
        }
    }

    /// Read one 32bpp sample at `(x, y)` in sample coordinates. Returns 0
    /// if the surface pitch is zero (degenerate; caller should skip the
    /// resolve).
    pub fn read_sample_32bpp(
        &self,
        base_tiles: u16,
        pitch_tiles: u32,
        x: u32,
        y: u32,
    ) -> u32 {
        Self::sample_offset_32bpp(base_tiles, pitch_tiles, x, y)
            .and_then(|off| self.bytes.get(off..off + 4))
            .map_or(0, |b| u32::from_le_bytes([b[0], b[1], b[2], b[3]]))
    }

    /// Write one 32bpp sample at `(x, y)` in sample coordinates. Mirror of
    /// [`Self::read_sample_32bpp`]. Used by the wgpu→ShadowEdram readback
    /// retile path and unit tests. A zero pitch makes this a no-op.
    pub fn write_sample_32bpp(
        &mut self,
        base_tiles: u16,
        pitch_tiles: u32,
        x: u32,
        y: u32,
        sample: u32,
    ) {
        if let Some(off) = Self::sample_offset_32bpp(base_tiles, pitch_tiles, x, y) {
            if let Some(dst) = self.bytes.get_mut(off..off + 4) {
                dst.copy_from_slice(&sample.to_le_bytes());
            }
        }
    }

    /// Bulk write a `(w × h)`-sample rectangle at `(x, y)` from a row-major
    /// linear `samples` buffer. The buffer length must be at least `w * h`;
    /// extra entries are ignored. Order: `samples[dy * w + dx]` lands at
    /// (x + dx, y + dy). This is the format the wgpu→ShadowEdram readback
    /// path uses after stripping wgpu's 256-byte row alignment.
    ///
    /// A short buffer trips the `debug_assert!` in debug builds; in release
    /// builds only the samples that are present get written.
    #[allow(clippy::too_many_arguments)]
    pub fn write_rect_32bpp(
        &mut self,
        base_tiles: u16,
        pitch_tiles: u32,
        x: u32,
        y: u32,
        w: u32,
        h: u32,
        samples: &[u32],
    ) {
        if w == 0 || h == 0 {
            return;
        }
        let needed = (w as usize).saturating_mul(h as usize);
        debug_assert!(samples.len() >= needed, "write_rect_32bpp: samples too short");
        // One chunk per rectangle row; zipping with `0..h` caps the row count.
        for (dy, row) in (0..h).zip(samples.chunks(w as usize)) {
            let sy = match y.checked_add(dy) {
                Some(v) => v,
                None => break,
            };
            for (dx, &sample) in (0..w).zip(row) {
                let sx = match x.checked_add(dx) {
                    Some(v) => v,
                    None => break,
                };
                self.write_sample_32bpp(base_tiles, pitch_tiles, sx, sy, sample);
            }
        }
    }

    // --- 64bpp helpers ----------------------------------------------------
    //
    // 64bpp formats (`k_16_16_16_16`, `k_16_16_16_16_FLOAT`, `k_32_32_FLOAT`)
    // occupy two adjacent EDRAM tiles per logical tile, doubling the row
    // pitch in tiles. Per Canary `xenos.h:321-325 IsColorRenderTargetFormat64bpp`
    // and `draw_util.cc:1260-1262` (`pitch_tiles = surface_pitch_tiles << is_64bpp`).
    //
    // Convention: callers pass the *32bpp-equivalent* `base_tiles` and
    // `pitch_tiles_32bpp` (i.e. the `RB_COLOR_INFO.color_base` and
    // `surface_pitch_tiles` decoded from registers). The 64bpp helpers
    // multiply both by 2 internally so the lo/hi pair lands in adjacent
    // 32bpp samples. `lo` is the lower-addressed 32bpp word; `hi` is the upper.

    /// Read one 64bpp sample as `(lo, hi)` u32 pair. Doubled-tile addressing
    /// per Canary's `is_64bpp` convention. Returns `(0, 0)` for a zero pitch
    /// or when `2 * x` is not representable in `u32`.
    pub fn read_sample_64bpp(
        &self,
        base_tiles: u16,
        pitch_tiles_32bpp: u32,
        x: u32,
        y: u32,
    ) -> (u32, u32) {
        match Self::expand_64bpp(base_tiles, pitch_tiles_32bpp, x) {
            Some((base64, pitch64, x_lo)) => (
                self.read_sample_32bpp(base64, pitch64, x_lo, y),
                self.read_sample_32bpp(base64, pitch64, x_lo + 1, y),
            ),
            None => (0, 0),
        }
    }

    /// Write one 64bpp sample as `(lo, hi)` u32 pair. No-op for a zero
    /// pitch or when `2 * x` is not representable in `u32`.
    pub fn write_sample_64bpp(
        &mut self,
        base_tiles: u16,
        pitch_tiles_32bpp: u32,
        x: u32,
        y: u32,
        lo: u32,
        hi: u32,
    ) {
        if let Some((base64, pitch64, x_lo)) =
            Self::expand_64bpp(base_tiles, pitch_tiles_32bpp, x)
        {
            self.write_sample_32bpp(base64, pitch64, x_lo, y, lo);
            self.write_sample_32bpp(base64, pitch64, x_lo + 1, y, hi);
        }
    }

    /// Bulk write a 64bpp rectangle from a row-major `(lo, hi)` linear
    /// buffer: `samples[dy * w + dx]` lands at `(x + dx, y + dy)`. If the
    /// buffer holds fewer than `w * h` entries, only those present are
    /// written.
    #[allow(clippy::too_many_arguments)]
    pub fn write_rect_64bpp(
        &mut self,
        base_tiles: u16,
        pitch_tiles_32bpp: u32,
        x: u32,
        y: u32,
        w: u32,
        h: u32,
        samples: &[(u32, u32)],
    ) {
        if w == 0 || h == 0 {
            return;
        }
        for (dy, row) in (0..h).zip(samples.chunks(w as usize)) {
            let sy = match y.checked_add(dy) {
                Some(v) => v,
                None => break,
            };
            for (dx, &(lo, hi)) in (0..w).zip(row) {
                let sx = match x.checked_add(dx) {
                    Some(v) => v,
                    None => break,
                };
                self.write_sample_64bpp(base_tiles, pitch_tiles_32bpp, sx, sy, lo, hi);
            }
        }
    }

    /// Fill a `(w × h)`-sample rectangle with a constant 64bpp pattern.
    /// `lo` lands at the low-addressed 32bpp word, `hi` at the high one
    /// — i.e. for clears, callers pass `(lo = RB_COLOR_CLEAR_LO,
    /// hi = RB_COLOR_CLEAR)` per Canary `draw_util.cc:1302-1303`.
    #[allow(clippy::too_many_arguments)]
    pub fn fill_rect_64bpp(
        &mut self,
        base_tiles: u16,
        pitch_tiles_32bpp: u32,
        x: u32,
        y: u32,
        w: u32,
        h: u32,
        lo: u32,
        hi: u32,
    ) {
        if w == 0 || h == 0 || pitch_tiles_32bpp == 0 {
            return;
        }
        for dy in 0..h {
            let sy = match y.checked_add(dy) {
                Some(v) => v,
                None => break,
            };
            for dx in 0..w {
                let sx = match x.checked_add(dx) {
                    Some(v) => v,
                    None => break,
                };
                self.write_sample_64bpp(base_tiles, pitch_tiles_32bpp, sx, sy, lo, hi);
            }
        }
    }
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Assert that every 32bpp sample in `tile_index` equals `pattern`
    /// (stored little-endian).
    fn assert_tile_filled(edram: &ShadowEdram, tile_index: u32, pattern: u32) {
        let want = pattern.to_le_bytes();
        for px in edram.tile(tile_index).chunks_exact(4) {
            assert_eq!(px, want);
        }
    }

    /// Assert that `tile_index` is still all zero bytes.
    fn assert_tile_untouched(edram: &ShadowEdram, tile_index: u32) {
        assert!(edram.tile(tile_index).iter().all(|&b| b == 0));
    }

    #[test]
    fn shadow_edram_is_exactly_10_mib() {
        const TEN_MIB: usize = 10 * 1024 * 1024;
        assert_eq!(EDRAM_SIZE_BYTES, TEN_MIB);
        assert_eq!(ShadowEdram::new().as_bytes().len(), TEN_MIB);
    }

    #[test]
    fn fill_rect_writes_the_whole_first_tile() {
        let mut edram = ShadowEdram::new();
        edram.fill_rect_32bpp(0, 1, 0, 0, 80, 16, 0x11223344);
        // The fill covers exactly one tile, so all of tile 0 carries the pattern.
        assert_tile_filled(&edram, 0, 0x11223344);
    }

    #[test]
    fn fill_rect_respects_pitch_and_base() {
        let mut edram = ShadowEdram::new();
        // With base=5 and pitch=2, a 160x16 fill covers tiles 5 and 6 only;
        // tiles 4, 7 and 0 must stay zero.
        edram.fill_rect_32bpp(5, 2, 0, 0, 160, 16, 0xAABBCCDD);
        for &filled in &[5u32, 6] {
            assert_tile_filled(&edram, filled, 0xAABBCCDD);
        }
        for &clean in &[4u32, 7, 0] {
            assert_tile_untouched(&edram, clean);
        }
    }

    #[test]
    fn fill_rect_wraps_mod_2048() {
        let mut edram = ShadowEdram::new();
        // Starting at the last tile with pitch=2 pushes the second tile
        // around to index 0.
        edram.fill_rect_32bpp(2047, 2, 0, 0, 160, 16, 0xDEAD_BEEF);
        assert_tile_filled(&edram, 2047, 0xDEAD_BEEF);
        assert_tile_filled(&edram, 0, 0xDEAD_BEEF);
    }

    #[test]
    fn read_sample_roundtrips_fill_rect() {
        let mut edram = ShadowEdram::new();
        edram.fill_rect_32bpp(3, 1, 0, 0, 80, 16, 0xCAFE_F00D);
        // Opposite corners of the filled tile.
        for &(sx, sy) in &[(0u32, 0u32), (79, 15)] {
            assert_eq!(edram.read_sample_32bpp(3, 1, sx, sy), 0xCAFE_F00D);
        }
        // The next tile over was never written.
        assert_eq!(edram.read_sample_32bpp(4, 1, 0, 0), 0);
    }

    #[test]
    fn zero_pitch_is_a_noop_read() {
        assert_eq!(ShadowEdram::new().read_sample_32bpp(0, 0, 10, 10), 0);
    }

    /// Values stored with `write_sample_32bpp` come back unchanged from
    /// `read_sample_32bpp`.
    #[test]
    fn write_sample_32bpp_round_trips() {
        let encode = |x: u32, y: u32| 0xABCD_0000 | (y << 8) | x;
        let mut edram = ShadowEdram::new();
        for x in 0..80u32 {
            for y in 0..16u32 {
                edram.write_sample_32bpp(0, 1, x, y, encode(x, y));
            }
        }
        for x in 0..80u32 {
            for y in 0..16u32 {
                assert_eq!(
                    edram.read_sample_32bpp(0, 1, x, y),
                    encode(x, y),
                    "round-trip mismatch at ({x},{y})"
                );
            }
        }
    }

    /// `write_rect_32bpp` places row-major samples at the correct sample
    /// offsets even when the rectangle straddles two tiles.
    #[test]
    fn write_rect_32bpp_crosses_tile_boundary() {
        let mut edram = ShadowEdram::new();
        // Pitch of 2 tiles gives x in [0, 160). The 100x4 rect placed at
        // (40, 4) spans x = 40..140 and therefore crosses the x=80 seam.
        let (w, h) = (100u32, 4u32);
        let samples: Vec<u32> = (0..h)
            .flat_map(|dy| (0..w).map(move |dx| 0x10000 | (dy << 8) | dx))
            .collect();
        edram.write_rect_32bpp(0, 2, 40, 4, w, h, &samples);
        // First sample sits in tile 0; the last one (139, 7) sits in tile 1.
        assert_eq!(edram.read_sample_32bpp(0, 2, 40, 4), 0x1_0000);
        assert_eq!(
            edram.read_sample_32bpp(0, 2, 139, 7),
            0x10000 | (3 << 8) | 99
        );
    }

    /// `write_sample_64bpp` followed by `read_sample_64bpp` returns the
    /// same `(lo, hi)` pair.
    #[test]
    fn write_read_sample_64bpp_roundtrips() {
        let mut edram = ShadowEdram::new();
        // 32bpp pitch=1 / base=0 becomes pitch=2 / base=0 once doubled.
        // A 16x4 block of logical 64bpp samples stays well inside that.
        for x in 0..16u32 {
            for y in 0..4u32 {
                edram.write_sample_64bpp(0, 1, x, y, 0xAAAA_0000 | x, 0xBBBB_0000 | y);
            }
        }
        for x in 0..16u32 {
            for y in 0..4u32 {
                let (lo, hi) = edram.read_sample_64bpp(0, 1, x, y);
                assert_eq!(lo, 0xAAAA_0000 | x);
                assert_eq!(hi, 0xBBBB_0000 | y);
            }
        }
    }

    /// `fill_rect_64bpp` stores both clear words for every sample,
    /// following the `RB_COLOR_CLEAR_LO`/`RB_COLOR_CLEAR` convention.
    #[test]
    fn fill_rect_64bpp_writes_both_words() {
        let mut edram = ShadowEdram::new();
        // 16x4 logical 64bpp samples on a surface with 32bpp pitch 1.
        edram.fill_rect_64bpp(0, 1, 0, 0, 16, 4, 0xCAFE_F00D, 0xDEAD_BEEF);
        for x in 0..16u32 {
            for y in 0..4u32 {
                let (lo, hi) = edram.read_sample_64bpp(0, 1, x, y);
                assert_eq!(lo, 0xCAFE_F00D);
                assert_eq!(hi, 0xDEAD_BEEF);
            }
        }
    }

    /// The two words of a 64bpp sample must occupy neighbouring 32bpp
    /// samples under the doubled tile pitch.
    #[test]
    fn sixty_four_bpp_uses_doubled_pitch() {
        let mut edram = ShadowEdram::new();
        edram.write_sample_64bpp(0, 1, 5, 0, 0x1111_1111, 0x2222_2222);
        // Logical x=5 maps to 32bpp x=10 (lo) and x=11 (hi) when read back
        // with the doubled pitch of 2.
        let lo_word = edram.read_sample_32bpp(0, 2, 10, 0);
        let hi_word = edram.read_sample_32bpp(0, 2, 11, 0);
        assert_eq!(lo_word, 0x1111_1111);
        assert_eq!(hi_word, 0x2222_2222);
    }

    /// A zero width or zero height turns the rect writers into no-ops.
    #[test]
    fn write_rect_empty_is_noop() {
        let mut edram = ShadowEdram::new();
        let data = [1u32, 2, 3];
        edram.write_rect_32bpp(0, 1, 0, 0, 0, 5, &data);
        edram.write_rect_32bpp(0, 1, 0, 0, 5, 0, &data);
        edram.fill_rect_64bpp(0, 1, 0, 0, 0, 5, 1, 2);
        edram.fill_rect_64bpp(0, 1, 0, 0, 5, 0, 1, 2);
        // The buffer must still be entirely zero.
        assert!(edram.as_bytes().iter().all(|&b| b == 0));
    }
}
|
||||||
1753
crates/xenia-gpu/src/gpu_system.rs
Normal file
1753
crates/xenia-gpu/src/gpu_system.rs
Normal file
File diff suppressed because it is too large
Load Diff
1010
crates/xenia-gpu/src/handle.rs
Normal file
1010
crates/xenia-gpu/src/handle.rs
Normal file
File diff suppressed because it is too large
Load Diff
@@ -1,21 +1,49 @@
|
|||||||
|
//! Xenos GPU emulation for xenia-rs.
|
||||||
|
//!
|
||||||
|
//! Modules:
|
||||||
|
//! - [`pm4`]: packet format decoder + Type-3 opcode set.
|
||||||
|
//! - [`ring_view`]: ring-buffer bookkeeping (base/size/read/write pointers).
|
||||||
|
//! - [`register_file`]: 0x6000-entry register array backing the CP + state.
|
||||||
|
//! - [`gpu_system`]: top-level `GpuSystem` + PM4 executor running one packet
|
||||||
|
//! per call (see the plan's P2 for the design rationale).
|
||||||
|
//!
|
||||||
|
//! Legacy modules `ring_drain` and `command_processor` are retained while P3+
|
||||||
|
//! migrations finish; they will be removed once every caller is on
|
||||||
|
//! [`gpu_system::GpuSystem`].
|
||||||
|
|
||||||
pub mod command_processor;
|
pub mod command_processor;
|
||||||
|
pub mod draw_state;
|
||||||
|
pub mod edram;
|
||||||
|
pub mod gpu_system;
|
||||||
|
pub mod handle;
|
||||||
|
pub mod mmio_region;
|
||||||
|
pub mod pm4;
|
||||||
|
pub mod primitive;
|
||||||
pub mod register_file;
|
pub mod register_file;
|
||||||
|
pub mod ring_drain;
|
||||||
|
pub mod ring_view;
|
||||||
|
pub mod render_target_cache;
|
||||||
|
pub mod resolve;
|
||||||
|
pub mod shader_metrics;
|
||||||
|
pub mod shaders;
|
||||||
|
pub mod texture_cache;
|
||||||
|
pub mod tiled_address;
|
||||||
|
pub mod translator;
|
||||||
|
pub mod ucode;
|
||||||
|
pub mod xenos_constants;
|
||||||
|
|
||||||
/// Stub GPU system for initial implementation.
|
pub use gpu_system::{
|
||||||
pub struct GpuSystem {
|
ExecOutcome, GpuBlock, GpuMmio, GpuStats, GpuSystem, InterruptSource, PendingInterrupt,
|
||||||
pub register_file: register_file::RegisterFile,
|
ShaderBlob, SwapNotification, WaitCmp,
|
||||||
}
|
};
|
||||||
|
pub use handle::{
|
||||||
impl GpuSystem {
|
DrainReply, GpuBackend, GpuCommand, GpuDigestSnapshot, GpuHandle, GpuWorker,
|
||||||
pub fn new() -> Self {
|
shutdown_and_join_with_timeout, spawn_gpu_worker, spawn_noop_worker,
|
||||||
Self {
|
};
|
||||||
register_file: register_file::RegisterFile::new(),
|
pub use mmio_region::build_region as build_mmio_region;
|
||||||
}
|
pub use pm4::{
|
||||||
}
|
PacketHeader, PacketKind, PM4_INTERRUPT, PM4_NOP, PM4_XE_SWAP, SWAP_SIGNATURE,
|
||||||
}
|
type3_opcode_name,
|
||||||
|
};
|
||||||
impl Default for GpuSystem {
|
pub use ring_drain::{DrainResult, drain};
|
||||||
fn default() -> Self {
|
pub use ring_view::RingBufferView;
|
||||||
Self::new()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|||||||
217
crates/xenia-gpu/src/mmio_region.rs
Normal file
217
crates/xenia-gpu/src/mmio_region.rs
Normal file
@@ -0,0 +1,217 @@
|
|||||||
|
//! Construct an `xenia_memory::MmioRegion` that backs the Xenos GPU register
|
||||||
|
//! aperture at guest physical `0x7FC80000` (per canary
|
||||||
|
//! `graphics_system.cc:141-144` — `memory_->AddVirtualMappedRange(0x7FC80000,
|
||||||
|
//! 0xFFFF0000, 0x0000FFFF, …)`).
|
||||||
|
//!
|
||||||
|
//! Only a handful of registers need a round-trip over the bus; everything
|
||||||
|
//! else (the ALU / fetch constants, the RBBM state machine, …) lives inside
|
||||||
|
//! `GpuSystem::register_file` and is driven by PM4 packets from the CP on
|
||||||
|
//! the same host thread.
|
||||||
|
//!
|
||||||
|
//! The read/write closures capture `Arc<AtomicU32>` mailboxes cloned from
|
||||||
|
//! [`crate::GpuMmio`]; [`crate::GpuSystem::sync_with_mmio`] samples them
|
||||||
|
//! each scheduler round.
|
||||||
|
|
||||||
|
use std::sync::atomic::Ordering;
|
||||||
|
|
||||||
|
use xenia_memory::MmioRegion;
|
||||||
|
|
||||||
|
use crate::gpu_system::{reg, GpuMmio};
|
||||||
|
|
||||||
|
/// Xenos GPU register aperture base (guest physical address). Matches
|
||||||
|
/// canary's `graphics_system.cc:141`.
|
||||||
|
pub const APERTURE_BASE: u32 = 0x7FC8_0000;
|
||||||
|
/// Mask used by `MmioRegion::contains` so any `0x7FC8xxxx` address hits.
|
||||||
|
pub const APERTURE_MASK: u32 = 0xFFFF_0000;
|
||||||
|
/// Total aperture size in bytes (enough for the low 16-bit register window).
|
||||||
|
pub const APERTURE_SIZE: u32 = 0x0001_0000;
|
||||||
|
|
||||||
|
/// Build the [`MmioRegion`] to install on the guest memory.
|
||||||
|
pub fn build_region(mmio: &GpuMmio) -> MmioRegion {
|
||||||
|
let read_wptr = mmio.cp_rb_wptr.clone();
|
||||||
|
let read_rptr = mmio.cp_rb_rptr.clone();
|
||||||
|
let read_int_status = mmio.cp_int_status.clone();
|
||||||
|
let read_int_ack = mmio.cp_int_ack.clone();
|
||||||
|
let read_vblank_status = mmio.d1mode_vblank_vline_status.clone();
|
||||||
|
let write_wptr = mmio.cp_rb_wptr.clone();
|
||||||
|
let write_int_ack = mmio.cp_int_ack.clone();
|
||||||
|
let write_vblank_status = mmio.d1mode_vblank_vline_status.clone();
|
||||||
|
// M1.7 parker — captured into the WPTR write closure to wake a
|
||||||
|
// parked GPU worker on every guest WPTR write. In inline mode the
|
||||||
|
// mutex holds `None`, so the unpark site is a brief lock + no-op.
|
||||||
|
let wake_pending = mmio.wake_pending.clone();
|
||||||
|
let worker_thread = mmio.worker_thread.clone();
|
||||||
|
|
||||||
|
MmioRegion {
|
||||||
|
base_address: APERTURE_BASE,
|
||||||
|
mask: APERTURE_MASK,
|
||||||
|
size: APERTURE_SIZE,
|
||||||
|
read_callback: Box::new(move |addr: u32| {
|
||||||
|
let reg_index = (addr & 0xFFFF) / 4;
|
||||||
|
match reg_index {
|
||||||
|
reg::CP_RB_WPTR => read_wptr.load(Ordering::Relaxed),
|
||||||
|
reg::CP_RB_RPTR => read_rptr.load(Ordering::Relaxed),
|
||||||
|
reg::CP_INT_STATUS => read_int_status.load(Ordering::Relaxed),
|
||||||
|
// Games sometimes read-back the ack register to check interrupt ownership
|
||||||
|
// — serve the last-written value.
|
||||||
|
reg::CP_INT_ACK => read_int_ack.load(Ordering::Relaxed),
|
||||||
|
reg::D1MODE_VBLANK_VLINE_STATUS => {
|
||||||
|
read_vblank_status.load(Ordering::Relaxed)
|
||||||
|
}
|
||||||
|
_ => {
|
||||||
|
tracing::trace!(
|
||||||
|
reg = format_args!("{reg_index:#x}"),
|
||||||
|
addr = format_args!("{addr:#010x}"),
|
||||||
|
"gpu mmio: unmapped read (returning 0)"
|
||||||
|
);
|
||||||
|
0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}),
|
||||||
|
write_callback: Box::new(move |addr: u32, value: u32| {
|
||||||
|
let reg_index = (addr & 0xFFFF) / 4;
|
||||||
|
match reg_index {
|
||||||
|
reg::CP_RB_WPTR => {
|
||||||
|
// Release: any prior writes to ring memory the guest
|
||||||
|
// performed before bumping WPTR must be visible to
|
||||||
|
// the GPU consumer that Acquire-loads this atomic.
|
||||||
|
write_wptr.store(value, Ordering::Release);
|
||||||
|
// M1.7 parker wake: set the pending bit (Release) so
|
||||||
|
// a worker swapping it on its way to `park_timeout`
|
||||||
|
// sees `was_pending == true` and skips the park; AND
|
||||||
|
// unpark the worker if it's already parked. Both are
|
||||||
|
// necessary to defend against the race window between
|
||||||
|
// the worker's `swap(false)` and `park_timeout()`.
|
||||||
|
wake_pending.store(true, Ordering::Release);
|
||||||
|
if let Ok(g) = worker_thread.lock() {
|
||||||
|
if let Some(t) = g.as_ref() {
|
||||||
|
t.unpark();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
tracing::trace!(
|
||||||
|
value,
|
||||||
|
addr = format_args!("{addr:#010x}"),
|
||||||
|
"gpu mmio: CP_RB_WPTR write"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
// CP_INT_ACK clears interrupt bits; we just echo the value.
|
||||||
|
reg::CP_INT_ACK => {
|
||||||
|
write_int_ack.store(value, Ordering::Relaxed);
|
||||||
|
}
|
||||||
|
// D1MODE_VBLANK_VLINE_STATUS is write-1-to-clear per the
|
||||||
|
// AMD M56 display-controller ref. Clear any bit the guest
|
||||||
|
// writes a 1 to (leaving other bits untouched).
|
||||||
|
reg::D1MODE_VBLANK_VLINE_STATUS => {
|
||||||
|
let prev = write_vblank_status.load(Ordering::Relaxed);
|
||||||
|
write_vblank_status.store(prev & !value, Ordering::Relaxed);
|
||||||
|
}
|
||||||
|
_ => {
|
||||||
|
tracing::trace!(
|
||||||
|
reg = format_args!("{reg_index:#x}"),
|
||||||
|
addr = format_args!("{addr:#010x}"),
|
||||||
|
value = format_args!("{value:#x}"),
|
||||||
|
"gpu mmio: unmapped write (dropping)"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Unit tests for the MMIO region callbacks. Every test builds a fresh
// `GpuMmio` + `MmioRegion` pair and drives `read_callback` /
// `write_callback` at `APERTURE_BASE + register_index * 4`.
#[cfg(test)]
mod tests {
    use super::*;

    /// Test fixture: a fresh `GpuMmio` plus the `MmioRegion` built from it,
    /// so a test can poke the region callbacks and inspect the atomics.
    fn build() -> (GpuMmio, MmioRegion) {
        let mmio = GpuMmio::new();
        let region = build_region(&mmio);
        (mmio, region)
    }

    /// `D1MODE_VBLANK_VLINE_STATUS` read must surface the atomic's current
    /// value — Sylpheed's graphics-interrupt callback reads bit 0 to decide
    /// whether vblank actually fired; if we always return 0 the callback
    /// silently skips every frame's work.
    #[test]
    fn vblank_status_read_returns_stored_value() {
        let (mmio, region) = build();
        mmio.d1mode_vblank_vline_status
            .store(0x1, Ordering::Relaxed);
        let offset = APERTURE_BASE + reg::D1MODE_VBLANK_VLINE_STATUS * 4;
        assert_eq!((region.read_callback)(offset), 0x1);
    }

    /// Guest clears the flag by writing 1 back. Classic write-1-to-clear —
    /// AMD M56 display-controller ref and Canary's behavior. We preserve
    /// unrelated bits so higher-bit status (VLINE_INT_OCCURRED etc.) can
    /// coexist with a concurrent clear of bit 0.
    #[test]
    fn vblank_status_write_1_to_clear() {
        let (mmio, region) = build();
        mmio.d1mode_vblank_vline_status
            .store(0b11, Ordering::Relaxed);
        let offset = APERTURE_BASE + reg::D1MODE_VBLANK_VLINE_STATUS * 4;
        (region.write_callback)(offset, 0b01);
        assert_eq!(
            mmio.d1mode_vblank_vline_status.load(Ordering::Relaxed),
            0b10,
            "bit 0 cleared, bit 1 preserved"
        );
    }

    /// Write-0-to-a-bit must NOT clear that bit — classic W1TC semantics.
    #[test]
    fn vblank_status_write_0_is_noop() {
        let (mmio, region) = build();
        mmio.d1mode_vblank_vline_status
            .store(0b11, Ordering::Relaxed);
        let offset = APERTURE_BASE + reg::D1MODE_VBLANK_VLINE_STATUS * 4;
        (region.write_callback)(offset, 0x0);
        assert_eq!(
            mmio.d1mode_vblank_vline_status.load(Ordering::Relaxed),
            0b11
        );
    }

    /// Regression: prior to the fix, `reg::CP_RB_WPTR` held a byte offset
    /// (`0x0714`) while the match arm compared against a *register index*
    /// (`(addr & 0xFFFF) / 4 == 0x01C5`). Guest MMIO writes to the WPTR
    /// therefore fell through to "unmapped" and the atomic never moved;
    /// only `VdInitializeRingBuffer` / `extend_write_ptr` paths worked.
    ///
    /// Verify every CP register lands in its atomic when the guest writes
    /// at the canonical `APERTURE_BASE + index*4` byte address.
    #[test]
    fn cp_rb_wptr_write_via_mmio_bus_reaches_atomic() {
        let (mmio, region) = build();
        let offset = APERTURE_BASE + reg::CP_RB_WPTR * 4;
        // Pins both the aperture base and the register index in one check.
        assert_eq!(offset, 0x7FC8_0714, "byte offset must match Canary CP_RB_WPTR");
        (region.write_callback)(offset, 0x1234_5678);
        assert_eq!(mmio.cp_rb_wptr.load(Ordering::Relaxed), 0x1234_5678);
    }

    /// A write to `CP_INT_ACK` at its byte address must be stored verbatim
    /// in the `cp_int_ack` atomic.
    #[test]
    fn cp_int_ack_write_via_mmio_bus_reaches_atomic() {
        let (mmio, region) = build();
        let offset = APERTURE_BASE + reg::CP_INT_ACK * 4;
        assert_eq!(offset, 0x7FC8_07D0, "byte offset must match Canary CP_INT_ACK");
        (region.write_callback)(offset, 0xDEAD_BEEF);
        assert_eq!(mmio.cp_int_ack.load(Ordering::Relaxed), 0xDEAD_BEEF);
    }

    /// A read of `CP_RB_RPTR` must return whatever the `cp_rb_rptr` atomic
    /// currently holds.
    #[test]
    fn cp_rb_rptr_read_via_mmio_bus_returns_atomic() {
        let (mmio, region) = build();
        mmio.cp_rb_rptr.store(0xCAFE_F00D, Ordering::Relaxed);
        let offset = APERTURE_BASE + reg::CP_RB_RPTR * 4;
        assert_eq!((region.read_callback)(offset), 0xCAFE_F00D);
    }

    /// A read of `CP_INT_STATUS` must return whatever the `cp_int_status`
    /// atomic currently holds.
    #[test]
    fn cp_int_status_read_via_mmio_bus_returns_atomic() {
        let (mmio, region) = build();
        mmio.cp_int_status.store(0x0000_0001, Ordering::Relaxed);
        let offset = APERTURE_BASE + reg::CP_INT_STATUS * 4;
        assert_eq!((region.read_callback)(offset), 0x0000_0001);
    }
}
|
||||||
232
crates/xenia-gpu/src/pm4.rs
Normal file
232
crates/xenia-gpu/src/pm4.rs
Normal file
@@ -0,0 +1,232 @@
|
|||||||
|
//! PM4 packet format — header decoding + Type-3 opcode set.
|
||||||
|
//!
|
||||||
|
//! Xenos PM4 packet layout mirrors `xenia-canary/src/xenia/gpu/packet_disassembler.cc`:
|
||||||
|
//!
|
||||||
|
//! - **Type 0** (`packet >> 30 == 0`): register-write run.
|
||||||
|
//! `count = ((packet >> 16) & 0x3FFF) + 1`. Total dwords = `1 + count`.
|
||||||
|
//! With `(packet >> 15) & 1 == 1`, all writes target the same register.
|
||||||
|
//! - **Type 1** (`packet >> 30 == 1`): two-register write. Total dwords = 3.
|
||||||
|
//! - **Type 2** (`packet >> 30 == 2`): NOP — a single skipped dword.
|
||||||
|
//! - **Type 3** (`packet >> 30 == 3`): command.
|
||||||
|
//! `opcode = (packet >> 8) & 0x7F`,
|
||||||
|
//! `count = ((packet >> 16) & 0x3FFF) + 1`.
|
||||||
|
//! Total dwords = `1 + count`.
|
||||||
|
|
||||||
|
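/// The cookie canary writes alongside `PM4_XE_SWAP` so tooling can recognize
/// swap packets (`kSwapSignature`). Read most-significant byte first the value
/// is `'X','N','E','X'`, i.e. the ASCII string "XENX" laid out as a
/// little-endian word. NOTE(review): this comment used to say big-endian —
/// confirm the intended byte order against canary.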
|
||||||
|
pub const SWAP_SIGNATURE: u32 = 0x584E_4558;
|
||||||
|
|
||||||
|
// ── Named Type-3 opcodes (from xenia-canary/src/xenia/gpu/xenos.h:1617-1679) ──
//
// Each constant is the 7-bit opcode carried in bits 14:8 of a Type-3 header.

pub const PM4_ME_INIT: u8 = 0x48;
pub const PM4_NOP: u8 = 0x10;
pub const PM4_INDIRECT_BUFFER: u8 = 0x3F;
pub const PM4_INDIRECT_BUFFER_PFD: u8 = 0x37;
pub const PM4_WAIT_FOR_IDLE: u8 = 0x26;
pub const PM4_WAIT_REG_MEM: u8 = 0x3C;
pub const PM4_REG_RMW: u8 = 0x21;
pub const PM4_REG_TO_MEM: u8 = 0x3E;
pub const PM4_MEM_WRITE: u8 = 0x3D;
pub const PM4_COND_WRITE: u8 = 0x45;
pub const PM4_EVENT_WRITE: u8 = 0x46;
pub const PM4_EVENT_WRITE_SHD: u8 = 0x58;
pub const PM4_EVENT_WRITE_EXT: u8 = 0x5A;
pub const PM4_EVENT_WRITE_ZPD: u8 = 0x5B;
pub const PM4_DRAW_INDX: u8 = 0x22;
pub const PM4_DRAW_INDX_2: u8 = 0x36;
pub const PM4_VIZ_QUERY: u8 = 0x23;
pub const PM4_SET_CONSTANT: u8 = 0x2D;
pub const PM4_SET_CONSTANT2: u8 = 0x55;
pub const PM4_SET_SHADER_CONSTANTS: u8 = 0x56;
pub const PM4_LOAD_ALU_CONSTANT: u8 = 0x2F;
pub const PM4_IM_LOAD: u8 = 0x27;
pub const PM4_IM_LOAD_IMMEDIATE: u8 = 0x2B;
pub const PM4_LOAD_CONSTANT_CONTEXT: u8 = 0x2E;
pub const PM4_INVALIDATE_STATE: u8 = 0x3B;
pub const PM4_INTERRUPT: u8 = 0x54;
pub const PM4_SET_SHADER_BASES: u8 = 0x4A;
pub const PM4_SET_BIN_MASK_LO: u8 = 0x60;
pub const PM4_SET_BIN_MASK_HI: u8 = 0x61;
pub const PM4_SET_BIN_SELECT_LO: u8 = 0x62;
pub const PM4_SET_BIN_SELECT_HI: u8 = 0x63;
pub const PM4_SET_BIN_MASK: u8 = 0x50;
pub const PM4_SET_BIN_SELECT: u8 = 0x51;
pub const PM4_CONTEXT_UPDATE: u8 = 0x5E;
/// Xenia-specific: `VdSwap` writes this to trigger a present.
pub const PM4_XE_SWAP: u8 = 0x64;

/// Map a Type-3 opcode to its mnemonic (the constant's name without the
/// `PM4_` prefix) for use in tracing spans.
///
/// Any opcode without a named constant above yields `"UNKNOWN"`.
pub fn type3_opcode_name(op: u8) -> &'static str {
    // Arms are listed in ascending opcode order so a raw value read from a
    // trace is easy to locate.
    match op {
        PM4_NOP => "NOP",
        PM4_REG_RMW => "REG_RMW",
        PM4_DRAW_INDX => "DRAW_INDX",
        PM4_VIZ_QUERY => "VIZ_QUERY",
        PM4_WAIT_FOR_IDLE => "WAIT_FOR_IDLE",
        PM4_IM_LOAD => "IM_LOAD",
        PM4_IM_LOAD_IMMEDIATE => "IM_LOAD_IMMEDIATE",
        PM4_SET_CONSTANT => "SET_CONSTANT",
        PM4_LOAD_CONSTANT_CONTEXT => "LOAD_CONSTANT_CONTEXT",
        PM4_LOAD_ALU_CONSTANT => "LOAD_ALU_CONSTANT",
        PM4_DRAW_INDX_2 => "DRAW_INDX_2",
        PM4_INDIRECT_BUFFER_PFD => "INDIRECT_BUFFER_PFD",
        PM4_INVALIDATE_STATE => "INVALIDATE_STATE",
        PM4_WAIT_REG_MEM => "WAIT_REG_MEM",
        PM4_MEM_WRITE => "MEM_WRITE",
        PM4_REG_TO_MEM => "REG_TO_MEM",
        PM4_INDIRECT_BUFFER => "INDIRECT_BUFFER",
        PM4_COND_WRITE => "COND_WRITE",
        PM4_EVENT_WRITE => "EVENT_WRITE",
        PM4_ME_INIT => "ME_INIT",
        PM4_SET_SHADER_BASES => "SET_SHADER_BASES",
        PM4_SET_BIN_MASK => "SET_BIN_MASK",
        PM4_SET_BIN_SELECT => "SET_BIN_SELECT",
        PM4_INTERRUPT => "INTERRUPT",
        PM4_SET_CONSTANT2 => "SET_CONSTANT2",
        PM4_SET_SHADER_CONSTANTS => "SET_SHADER_CONSTANTS",
        PM4_EVENT_WRITE_SHD => "EVENT_WRITE_SHD",
        PM4_EVENT_WRITE_EXT => "EVENT_WRITE_EXT",
        PM4_EVENT_WRITE_ZPD => "EVENT_WRITE_ZPD",
        PM4_CONTEXT_UPDATE => "CONTEXT_UPDATE",
        PM4_SET_BIN_MASK_LO => "SET_BIN_MASK_LO",
        PM4_SET_BIN_MASK_HI => "SET_BIN_MASK_HI",
        PM4_SET_BIN_SELECT_LO => "SET_BIN_SELECT_LO",
        PM4_SET_BIN_SELECT_HI => "SET_BIN_SELECT_HI",
        PM4_XE_SWAP => "XE_SWAP",
        _ => "UNKNOWN",
    }
}
|
||||||
|
|
||||||
|
/// A PM4 packet header after decoding: what kind of packet it is and how
/// many ring dwords the whole packet spans.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct PacketHeader {
    pub kind: PacketKind,
    /// Packet length in dwords, counting the header dword itself.
    pub total_dwords: u32,
}

/// The four PM4 packet types, selected by header bits 31:30.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum PacketKind {
    /// Type-0 register-write run. `base_index` (header bits 14:0) is the
    /// first register index; `count` is the number of data dwords;
    /// `write_one` (header bit 15) means every data dword targets the same
    /// register.
    Type0 {
        base_index: u32,
        count: u32,
        write_one: bool,
    },
    /// Type-1 two-register write: header bits 10:0 and 21:11 name the two
    /// registers.
    Type1 { reg_index_1: u32, reg_index_2: u32 },
    /// Type-2 NOP (a single skipped dword).
    Type2,
    /// Type-3 command: `opcode` from header bits 14:8, `count` data dwords,
    /// `predicated` from header bit 0.
    Type3 {
        opcode: u8,
        count: u32,
        predicated: bool,
    },
}

/// Decode one PM4 header dword into its [`PacketKind`] and total length.
///
/// Never fails: every 32-bit value maps to one of the four packet types.
pub fn decode(header: u32) -> PacketHeader {
    // Bits 29:16 store `count - 1` for both Type-0 and Type-3 packets, so
    // compute it once up front; Type-1 and Type-2 simply ignore it.
    let data_dwords = ((header >> 16) & 0x3FFF) + 1;

    let (kind, total_dwords) = match header >> 30 {
        0 => (
            PacketKind::Type0 {
                base_index: header & 0x7FFF,
                count: data_dwords,
                write_one: (header & (1 << 15)) != 0,
            },
            1 + data_dwords,
        ),
        1 => (
            PacketKind::Type1 {
                reg_index_1: header & 0x7FF,
                reg_index_2: (header >> 11) & 0x7FF,
            },
            3,
        ),
        2 => (PacketKind::Type2, 1),
        3 => (
            PacketKind::Type3 {
                opcode: ((header >> 8) & 0x7F) as u8,
                count: data_dwords,
                predicated: (header & 1) != 0,
            },
            1 + data_dwords,
        ),
        // A u32 shifted right by 30 can only be 0..=3.
        _ => unreachable!(),
    };

    PacketHeader { kind, total_dwords }
}
|
||||||
|
|
||||||
|
// Unit tests for PM4 header decoding and opcode naming.
#[cfg(test)]
mod tests {
    use super::*;

    /// A Type-2 header decodes to a one-dword packet.
    #[test]
    fn type2_is_one_dword() {
        // 0x80000000 == type 2 header (bits 31:30 = 10)
        let hdr = decode(0x8000_0000);
        assert_eq!(hdr.kind, PacketKind::Type2);
        assert_eq!(hdr.total_dwords, 1);
    }

    /// The Type-0 count field stores `count - 1`, so a field value of 5
    /// means 6 data dwords and 7 dwords in total.
    #[test]
    fn type0_count_is_inclusive() {
        // count field (bits 29:16) = 5 → 6 data dwords. base_index = 0x100.
        // write_one = 0.
        let hdr = decode((5 << 16) | 0x100);
        match hdr.kind {
            PacketKind::Type0 {
                base_index,
                count,
                write_one,
            } => {
                assert_eq!(base_index, 0x100);
                assert_eq!(count, 6);
                assert!(!write_one);
            }
            _ => panic!("expected Type0"),
        }
        assert_eq!(hdr.total_dwords, 7);
    }

    /// The swap packet header round-trips through `decode` with the
    /// expected opcode, payload length and predicate bit.
    #[test]
    fn type3_swap_packet() {
        // Build the exact header canary's VdSwap emits:
        // MakePacketType3(PM4_XE_SWAP, 4) → ((3<<30) | ((4-1)<<16) | (0x64<<8))
        let hdr_word = (3u32 << 30) | ((4u32 - 1) << 16) | ((PM4_XE_SWAP as u32) << 8);
        let hdr = decode(hdr_word);
        match hdr.kind {
            PacketKind::Type3 {
                opcode,
                count,
                predicated,
            } => {
                assert_eq!(opcode, PM4_XE_SWAP);
                assert_eq!(count, 4);
                assert!(!predicated);
            }
            _ => panic!("expected Type3"),
        }
        assert_eq!(hdr.total_dwords, 5);
    }

    /// Spot-check a few named opcodes and the `"UNKNOWN"` fallback.
    #[test]
    fn opcode_names_are_present_for_common_ops() {
        assert_eq!(type3_opcode_name(PM4_NOP), "NOP");
        assert_eq!(type3_opcode_name(PM4_DRAW_INDX), "DRAW_INDX");
        assert_eq!(type3_opcode_name(PM4_XE_SWAP), "XE_SWAP");
        assert_eq!(type3_opcode_name(PM4_WAIT_REG_MEM), "WAIT_REG_MEM");
        assert_eq!(type3_opcode_name(0xFE), "UNKNOWN");
    }
}
|
||||||
229
crates/xenia-gpu/src/primitive.rs
Normal file
229
crates/xenia-gpu/src/primitive.rs
Normal file
@@ -0,0 +1,229 @@
|
|||||||
|
//! Primitive processor — normalize Xenos primitives into host-GPU forms.
|
||||||
|
//!
|
||||||
|
//! wgpu only exposes `PrimitiveTopology::{PointList, LineList, LineStrip,
|
||||||
|
//! TriangleList, TriangleStrip}`. For everything else (fans, quads,
|
||||||
|
//! rectangles) we rewrite indices on the CPU side so the host just sees a
|
||||||
|
//! triangle list. Ground truth: `xenia-canary/src/xenia/gpu/primitive_processor.h/cc`.
|
||||||
|
//!
|
||||||
|
//! P3 scope: only the shapes Sylpheed's UI + early gameplay paths need
//! (list, strip, fan, quad). Rectangle-list expansion is still a stub: it
//! logs via `tracing::warn!` and rejects the draw.
|
||||||
|
|
||||||
|
use crate::draw_state::{IndexSize, PrimitiveType};
|
||||||
|
|
||||||
|
/// Host primitive topology — a subset of wgpu's that we commit to.
///
/// Every draw leaving [`process`] is tagged with one of these; shapes with no
/// direct equivalent (fans, quads) are rewritten into `TriangleList`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum HostTopology {
    PointList,
    LineList,
    LineStrip,
    TriangleList,
    TriangleStrip,
}
|
||||||
|
|
||||||
|
/// Result of primitive processing.
#[derive(Debug, Clone)]
pub struct ProcessedPrimitive {
    /// Topology the host should draw with.
    pub topology: HostTopology,
    /// When the Xenos primitive needed client-side rewriting (fans, quads),
    /// this buffer holds the rewritten index sequence, always as `u32`.
    /// `None` means the input index buffer is usable as-is.
    pub rewritten_indices: Option<Vec<u32>>,
    /// Post-processing vertex count — equals the input count when indices
    /// pass through unchanged, and the length of `rewritten_indices` when
    /// the draw was rewritten.
    pub host_vertex_count: u32,
    /// `true` if we rejected the primitive (unsupported shape) and the
    /// caller should skip this draw. Logged via `tracing::warn!`.
    pub rejected: bool,
}
|
||||||
|
|
||||||
|
/// Normalize a draw.
|
||||||
|
///
|
||||||
|
/// `indices` is `None` for `AutoIndex` draws; otherwise it's the decoded
|
||||||
|
/// index stream (already endian-converted / widened to u32 by the caller).
|
||||||
|
pub fn process(
|
||||||
|
primitive: PrimitiveType,
|
||||||
|
vertex_count: u32,
|
||||||
|
indices: Option<&[u32]>,
|
||||||
|
) -> ProcessedPrimitive {
|
||||||
|
match primitive {
|
||||||
|
PrimitiveType::PointList => pass_through(HostTopology::PointList, vertex_count),
|
||||||
|
PrimitiveType::LineList => pass_through(HostTopology::LineList, vertex_count),
|
||||||
|
PrimitiveType::LineStrip => pass_through(HostTopology::LineStrip, vertex_count),
|
||||||
|
PrimitiveType::TriangleList => pass_through(HostTopology::TriangleList, vertex_count),
|
||||||
|
PrimitiveType::TriangleStrip => pass_through(HostTopology::TriangleStrip, vertex_count),
|
||||||
|
PrimitiveType::TriangleFan => expand_fan(indices, vertex_count),
|
||||||
|
PrimitiveType::RectangleList => expand_rectangles(indices, vertex_count),
|
||||||
|
PrimitiveType::QuadList => expand_quads(indices, vertex_count),
|
||||||
|
PrimitiveType::None | PrimitiveType::Unknown(_) => {
|
||||||
|
tracing::warn!(?primitive, "gpu: rejecting unsupported primitive");
|
||||||
|
metrics::counter!("gpu.primitive.rejected").increment(1);
|
||||||
|
ProcessedPrimitive {
|
||||||
|
topology: HostTopology::TriangleList,
|
||||||
|
rewritten_indices: None,
|
||||||
|
host_vertex_count: 0,
|
||||||
|
rejected: true,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn pass_through(topology: HostTopology, vertex_count: u32) -> ProcessedPrimitive {
|
||||||
|
ProcessedPrimitive {
|
||||||
|
topology,
|
||||||
|
rewritten_indices: None,
|
||||||
|
host_vertex_count: vertex_count,
|
||||||
|
rejected: false,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Convert a triangle fan to a triangle list. Fan indices `[0, 1, 2, 3, 4]`
|
||||||
|
/// expand to triangles `(0,1,2), (0,2,3), (0,3,4)` — 3 × (n-2) host indices.
|
||||||
|
fn expand_fan(indices: Option<&[u32]>, vertex_count: u32) -> ProcessedPrimitive {
|
||||||
|
if vertex_count < 3 {
|
||||||
|
return ProcessedPrimitive {
|
||||||
|
topology: HostTopology::TriangleList,
|
||||||
|
rewritten_indices: Some(Vec::new()),
|
||||||
|
host_vertex_count: 0,
|
||||||
|
rejected: false,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
let mut out = Vec::with_capacity(3 * (vertex_count as usize - 2));
|
||||||
|
let get = |i: u32| -> u32 {
|
||||||
|
match indices {
|
||||||
|
Some(buf) => buf[i as usize],
|
||||||
|
None => i,
|
||||||
|
}
|
||||||
|
};
|
||||||
|
let apex = get(0);
|
||||||
|
for i in 1..vertex_count.saturating_sub(1) {
|
||||||
|
out.push(apex);
|
||||||
|
out.push(get(i));
|
||||||
|
out.push(get(i + 1));
|
||||||
|
}
|
||||||
|
let host_vertex_count = out.len() as u32;
|
||||||
|
ProcessedPrimitive {
|
||||||
|
topology: HostTopology::TriangleList,
|
||||||
|
rewritten_indices: Some(out),
|
||||||
|
host_vertex_count,
|
||||||
|
rejected: false,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Convert a quad list (groups of 4) to a triangle list (groups of 6).
|
||||||
|
fn expand_quads(indices: Option<&[u32]>, vertex_count: u32) -> ProcessedPrimitive {
|
||||||
|
let quad_count = vertex_count / 4;
|
||||||
|
let mut out = Vec::with_capacity(6 * quad_count as usize);
|
||||||
|
let get = |i: u32| -> u32 {
|
||||||
|
match indices {
|
||||||
|
Some(buf) => buf[i as usize],
|
||||||
|
None => i,
|
||||||
|
}
|
||||||
|
};
|
||||||
|
for q in 0..quad_count {
|
||||||
|
let base = q * 4;
|
||||||
|
let a = get(base);
|
||||||
|
let b = get(base + 1);
|
||||||
|
let c = get(base + 2);
|
||||||
|
let d = get(base + 3);
|
||||||
|
out.extend_from_slice(&[a, b, c, a, c, d]);
|
||||||
|
}
|
||||||
|
let host_vertex_count = out.len() as u32;
|
||||||
|
ProcessedPrimitive {
|
||||||
|
topology: HostTopology::TriangleList,
|
||||||
|
rewritten_indices: Some(out),
|
||||||
|
host_vertex_count,
|
||||||
|
rejected: false,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Rectangle lists: a Xenos-specific primitive where each group of 3
|
||||||
|
/// vertices defines a right-angle rectangle by its three non-repeated
|
||||||
|
/// corners (the 4th is derived). The uber-shader doesn't support this yet;
|
||||||
|
/// the ucode translator will emulate it as a geometry-stage fake. For P3
|
||||||
|
/// we emit an empty draw.
|
||||||
|
fn expand_rectangles(_indices: Option<&[u32]>, _vertex_count: u32) -> ProcessedPrimitive {
|
||||||
|
tracing::warn!("gpu: rectangle list primitive not yet implemented (P3 stub)");
|
||||||
|
metrics::counter!("gpu.primitive.rejected", "reason" => "rectangle_list").increment(1);
|
||||||
|
ProcessedPrimitive {
|
||||||
|
topology: HostTopology::TriangleList,
|
||||||
|
rewritten_indices: Some(Vec::new()),
|
||||||
|
host_vertex_count: 0,
|
||||||
|
rejected: true,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Widen a u16 index buffer to u32. The primitive processor normalizes to
|
||||||
|
/// u32 so downstream wgpu pipeline descriptors stay simple.
|
||||||
|
pub fn widen_indices(raw: &[u8], size: IndexSize, count: u32) -> Vec<u32> {
|
||||||
|
let mut out = Vec::with_capacity(count as usize);
|
||||||
|
match size {
|
||||||
|
IndexSize::Sixteen => {
|
||||||
|
for i in 0..count as usize {
|
||||||
|
let off = i * 2;
|
||||||
|
if off + 2 > raw.len() {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
// Xenos indices are big-endian on the wire.
|
||||||
|
let be = u16::from_be_bytes([raw[off], raw[off + 1]]);
|
||||||
|
out.push(be as u32);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
IndexSize::ThirtyTwo => {
|
||||||
|
for i in 0..count as usize {
|
||||||
|
let off = i * 4;
|
||||||
|
if off + 4 > raw.len() {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
let be = u32::from_be_bytes([raw[off], raw[off + 1], raw[off + 2], raw[off + 3]]);
|
||||||
|
out.push(be);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
out
|
||||||
|
}
|
||||||
|
|
||||||
|
// Unit tests for primitive normalization and index widening.
#[cfg(test)]
mod tests {
    use super::*;

    /// A natively supported topology is forwarded without an index rewrite.
    #[test]
    fn triangle_list_passes_through() {
        let p = process(PrimitiveType::TriangleList, 6, None);
        assert_eq!(p.topology, HostTopology::TriangleList);
        assert!(p.rewritten_indices.is_none());
        assert_eq!(p.host_vertex_count, 6);
        assert!(!p.rejected);
    }

    /// An auto-indexed fan is rewritten into a triangle list sharing vertex 0.
    #[test]
    fn fan_to_list_expands_correctly() {
        // Fan of 5 vertices → triangles (0,1,2), (0,2,3), (0,3,4)
        let p = process(PrimitiveType::TriangleFan, 5, None);
        let idx = p.rewritten_indices.unwrap();
        assert_eq!(idx, vec![0, 1, 2, 0, 2, 3, 0, 3, 4]);
        assert_eq!(p.topology, HostTopology::TriangleList);
        assert_eq!(p.host_vertex_count, 9);
    }

    /// Two auto-indexed quads become four triangles, split along a–c.
    #[test]
    fn quad_list_expansion() {
        let p = process(PrimitiveType::QuadList, 8, None);
        let idx = p.rewritten_indices.unwrap();
        assert_eq!(idx, vec![0, 1, 2, 0, 2, 3, 4, 5, 6, 4, 6, 7]);
    }

    /// 16-bit indices are read big-endian and widened to u32.
    #[test]
    fn widen_u16_indices_big_endian() {
        // 3 indices [1, 2, 0x1234] in BE u16.
        let raw = [0, 1, 0, 2, 0x12, 0x34];
        let out = widen_indices(&raw, IndexSize::Sixteen, 3);
        assert_eq!(out, vec![1, 2, 0x1234]);
    }

    /// An unrecognized primitive type is flagged as rejected.
    #[test]
    fn rejects_unknown_primitive() {
        let p = process(PrimitiveType::Unknown(0x2A), 3, None);
        assert!(p.rejected);
    }
}
|
||||||
384
crates/xenia-gpu/src/render_target_cache.rs
Normal file
384
crates/xenia-gpu/src/render_target_cache.rs
Normal file
@@ -0,0 +1,384 @@
|
|||||||
|
//! EDRAM tile book + render-target key bookkeeping.
|
||||||
|
//!
|
||||||
|
//! Mirrors `xenia-canary/src/xenia/gpu/render_target_cache.h` at the data-
|
||||||
|
//! structure level. Xenos's 10 MiB EDRAM is divided into 2048 "tiles" of
|
||||||
|
//! 80×16 samples each; render targets claim a contiguous range of those
|
||||||
|
//! tiles based on `(base_tiles, pitch_tiles_at_32bpp, msaa_samples, format,
|
||||||
|
//! is_depth)`. Two render targets with overlapping tile ranges share the
|
||||||
|
//! underlying EDRAM — canary tracks this with per-tile "Host vs Shared"
|
||||||
|
//! ownership, which is what this module's `TileOwner` captures.
|
||||||
|
//!
|
||||||
|
//! P4 ships the **bookkeeping**. Actual host texture allocation per key (so
|
||||||
|
//! the host can draw into a wgpu texture matching the guest's RT) is left to
|
||||||
|
//! a future host-side cache built on top of this module; the same for
|
||||||
|
//! format-conversion compute shaders (the plan's P5 territory).
|
||||||
|
|
||||||
|
use std::collections::HashMap;
|
||||||
|
|
||||||
|
/// Number of EDRAM tiles on Xenos. Matches canary's `xenos::kEdramTileCount`.
|
||||||
|
pub const EDRAM_TILE_COUNT: usize = 2048;
|
||||||
|
|
||||||
|
/// MSAA sample count encoded into [`RenderTargetKey`]. Canary uses this as
/// `xenos::MsaaSamples` (1×/2×/4×). The discriminant is log2 of the sample
/// count.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum MsaaSamples {
    X1 = 0,
    X2 = 1,
    X4 = 2,
}

impl MsaaSamples {
    /// Decode the low two bits of `raw`. `1` is 2×, `2` is 4×; anything else
    /// (including the unused encoding `3`) falls back to 1×.
    pub fn from_raw(raw: u32) -> Self {
        match raw & 0x3 {
            1 => MsaaSamples::X2,
            2 => MsaaSamples::X4,
            _ => MsaaSamples::X1,
        }
    }

    /// Number of samples per pixel: 1, 2 or 4.
    pub fn count(self) -> u32 {
        1u32 << (self as u32)
    }
}

/// The packed EDRAM render-target identity. Bit layout matches
/// `render_target_cache.h:251-321`'s `RenderTargetKey` union (26 bits used,
/// stored as a single `u32` so it hashes cheaply). `pitch_tiles_at_32bpp`
/// is always the 32bpp-equivalent pitch — a 64bpp target needs twice as many
/// EDRAM tiles per row, which [`RenderTargetKey::tile_pitch`] accounts for
/// (canary's `GetPitchTiles()`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct RenderTargetKey {
    pub base_tiles: u16, // [0..2048)
    pub pitch_tiles_at_32bpp: u16, // packed into 8 bits, so 0..=255 survives `pack`
    pub msaa_samples: MsaaSamples,
    pub is_depth: bool,
    /// Color format: `xenos::ColorRenderTargetFormat` when !is_depth.
    /// Depth format: `xenos::DepthRenderTargetFormat` when is_depth.
    pub resource_format: u8, // 4 bits
}

impl RenderTargetKey {
    /// Pack into canary's 26-bit layout. Useful for compact storage /
    /// hashing when we add a LRU cache later on.
    ///
    /// Layout, low to high: 11 bits `base_tiles`, 8 bits
    /// `pitch_tiles_at_32bpp`, 2 bits `msaa_samples`, 1 bit `is_depth`,
    /// 4 bits `resource_format`. Field bits beyond those widths are dropped.
    pub fn pack(&self) -> u32 {
        (self.base_tiles as u32 & 0x7FF)
            | (((self.pitch_tiles_at_32bpp as u32) & 0xFF) << 11)
            | (((self.msaa_samples as u32) & 0x3) << 19)
            | ((self.is_depth as u32) << 21)
            | (((self.resource_format as u32) & 0xF) << 22)
    }

    /// Inverse of [`RenderTargetKey::pack`]; bits above bit 25 are ignored.
    pub fn unpack(raw: u32) -> Self {
        Self {
            base_tiles: (raw & 0x7FF) as u16,
            pitch_tiles_at_32bpp: ((raw >> 11) & 0xFF) as u16,
            msaa_samples: MsaaSamples::from_raw((raw >> 19) & 0x3),
            is_depth: ((raw >> 21) & 1) != 0,
            resource_format: ((raw >> 22) & 0xF) as u8,
        }
    }

    /// EDRAM tiles spanned by one row of tiles of this surface.
    ///
    /// An EDRAM tile holds 80×16 samples at 32bpp, so a 64bpp surface of the
    /// same sample width needs twice as many tiles per row. This mirrors
    /// canary's `GetPitchTiles()`, which shifts the 32bpp pitch left by one
    /// for 64bpp formats. The key carries no height; callers that know it
    /// use [`RenderTargetKey::tile_footprint_with_height`].
    pub fn tile_pitch(&self) -> u16 {
        if self.is_64bpp() {
            self.pitch_tiles_at_32bpp.saturating_mul(2)
        } else {
            self.pitch_tiles_at_32bpp
        }
    }

    /// Whether each sample occupies 64 bits of EDRAM. Depth targets never do.
    pub fn is_64bpp(&self) -> bool {
        if self.is_depth {
            false
        } else {
            // Canary: `ColorRenderTargetFormat::{k_16_16_16_16,
            // k_16_16_16_16_FLOAT, k_32_32_FLOAT}` are 64bpp; those are
            // enum values 5, 7 and 15 (value 4 is the 32bpp `k_16_16`).
            matches!(self.resource_format, 5 | 7 | 15)
        }
    }

    /// Tiles claimed by this RT if its surface height is `rows_of_tiles`
    /// (i.e. `ceil(height_in_samples / 16)`). Saturates at `u16::MAX`.
    pub fn tile_footprint_with_height(&self, rows_of_tiles: u16) -> u16 {
        self.tile_pitch().saturating_mul(rows_of_tiles)
    }
}
|
||||||
|
|
||||||
|
/// Ownership state of a single EDRAM tile.
///
/// * `None` — untouched; free to claim. This is the default.
/// * `Host(idx)` — render target `idx` has exclusive ownership.
/// * `Shared(idx)` — two or more RT keys map to the same tile (usually after
///   a format change without an intervening clear); `idx` is the most recent
///   owner, whose format should be honored for readback.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub enum TileOwner {
    #[default]
    None,
    Host(u32),
    Shared(u32),
}
|
||||||
|
|
||||||
|
|
||||||
|
/// Bookkeeping across the 2048 EDRAM tiles. Not a GPU resource by itself —
|
||||||
|
/// tracks which render target (by index) currently owns each tile.
|
||||||
|
pub struct EdramTileBook {
|
||||||
|
tiles: Vec<TileOwner>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Default for EdramTileBook {
|
||||||
|
fn default() -> Self {
|
||||||
|
Self::new()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl EdramTileBook {
|
||||||
|
pub fn new() -> Self {
|
||||||
|
Self {
|
||||||
|
tiles: vec![TileOwner::None; EDRAM_TILE_COUNT],
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn who_owns(&self, tile: u16) -> TileOwner {
|
||||||
|
self.tiles
|
||||||
|
.get(tile as usize)
|
||||||
|
.copied()
|
||||||
|
.unwrap_or(TileOwner::None)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Mark `[base, base+count)` as owned by `rt_idx`. Pre-existing owners
|
||||||
|
/// in the range are demoted to `Shared` (format reinterpretation).
|
||||||
|
/// Returns the number of tiles newly claimed (not previously the same
|
||||||
|
/// owner).
|
||||||
|
pub fn claim(&mut self, base: u16, count: u16, rt_idx: u32) -> u32 {
|
||||||
|
let mut newly_claimed = 0u32;
|
||||||
|
for i in 0..(count as usize) {
|
||||||
|
let t = base as usize + i;
|
||||||
|
if t >= self.tiles.len() {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
let prev = self.tiles[t];
|
||||||
|
let already_ours = matches!(
|
||||||
|
prev,
|
||||||
|
TileOwner::Host(idx) | TileOwner::Shared(idx) if idx == rt_idx
|
||||||
|
);
|
||||||
|
match prev {
|
||||||
|
TileOwner::None => {
|
||||||
|
self.tiles[t] = TileOwner::Host(rt_idx);
|
||||||
|
}
|
||||||
|
TileOwner::Host(idx) if idx == rt_idx => {
|
||||||
|
// re-claim of same RT — no-op
|
||||||
|
}
|
||||||
|
_ => {
|
||||||
|
// Format change / shared range.
|
||||||
|
self.tiles[t] = TileOwner::Shared(rt_idx);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !already_ours {
|
||||||
|
newly_claimed += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
newly_claimed
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Drop `rt_idx` from any tile it owns; tiles revert to `None` unless
|
||||||
|
/// they were `Shared(rt_idx)` (in which case they also revert to
|
||||||
|
/// `None`; the other sharer's ownership is lost — `release` is a
|
||||||
|
/// coarse "this RT is gone" operation).
|
||||||
|
pub fn release(&mut self, rt_idx: u32) {
|
||||||
|
for t in self.tiles.iter_mut() {
|
||||||
|
if matches!(
|
||||||
|
*t,
|
||||||
|
TileOwner::Host(idx) | TileOwner::Shared(idx) if idx == rt_idx
|
||||||
|
) {
|
||||||
|
*t = TileOwner::None;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Count tiles currently assigned to any RT (Host or Shared).
|
||||||
|
pub fn occupied_count(&self) -> u32 {
|
||||||
|
self.tiles
|
||||||
|
.iter()
|
||||||
|
.filter(|o| !matches!(o, TileOwner::None))
|
||||||
|
.count() as u32
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Minimal per-RT descriptor stored alongside the tile book. P5's texture
/// cache will expand this with the actual wgpu texture handle.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct RtDescriptor {
    /// The render-target identity this descriptor was created for.
    pub key: RenderTargetKey,
    /// Number of times this key has been bound since creation. Rough
    /// proxy for activity / hot-RT identification.
    pub bind_count: u32,
    /// Draw index on first bind — handy for debugging divergence.
    pub first_draw_index: u32,
}
|
||||||
|
|
||||||
|
/// Top-level cache: maps packed keys to small descriptors + the tile book.
pub struct RenderTargetCache {
    /// Index handed to the next key seen for the first time.
    next_idx: u32,
    /// Packed `RenderTargetKey` → RT index.
    by_key: HashMap<u32, u32>,
    /// RT index → descriptor.
    descriptors: HashMap<u32, RtDescriptor>,
    /// Per-tile EDRAM ownership ledger.
    pub tiles: EdramTileBook,
}

impl Default for RenderTargetCache {
    /// Same as [`RenderTargetCache::new`].
    fn default() -> Self {
        Self::new()
    }
}
|
||||||
|
|
||||||
|
impl RenderTargetCache {
|
||||||
|
pub fn new() -> Self {
|
||||||
|
Self {
|
||||||
|
next_idx: 0,
|
||||||
|
by_key: HashMap::new(),
|
||||||
|
descriptors: HashMap::new(),
|
||||||
|
tiles: EdramTileBook::new(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Look up or allocate an RT descriptor for `key`. `draw_index` is the
|
||||||
|
/// current monotonic draw counter — recorded on first insert for
|
||||||
|
/// provenance.
|
||||||
|
pub fn bind(&mut self, key: RenderTargetKey, draw_index: u32) -> u32 {
|
||||||
|
let packed = key.pack();
|
||||||
|
if let Some(&idx) = self.by_key.get(&packed) {
|
||||||
|
if let Some(d) = self.descriptors.get_mut(&idx) {
|
||||||
|
d.bind_count += 1;
|
||||||
|
}
|
||||||
|
return idx;
|
||||||
|
}
|
||||||
|
let idx = self.next_idx;
|
||||||
|
self.next_idx += 1;
|
||||||
|
self.by_key.insert(packed, idx);
|
||||||
|
self.descriptors.insert(
|
||||||
|
idx,
|
||||||
|
RtDescriptor {
|
||||||
|
key,
|
||||||
|
bind_count: 1,
|
||||||
|
first_draw_index: draw_index,
|
||||||
|
},
|
||||||
|
);
|
||||||
|
idx
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn descriptor(&self, idx: u32) -> Option<&RtDescriptor> {
|
||||||
|
self.descriptors.get(&idx)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn len(&self) -> usize {
|
||||||
|
self.descriptors.len()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn is_empty(&self) -> bool {
|
||||||
|
self.descriptors.is_empty()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Claim tiles for the descriptor at `rt_idx`. `height_tiles` is
|
||||||
|
/// `ceil(viewport_height_samples / 16)` — callers supply it because
|
||||||
|
/// the key itself doesn't carry height.
|
||||||
|
pub fn claim_tiles(&mut self, rt_idx: u32, height_tiles: u16) -> u32 {
|
||||||
|
if let Some(d) = self.descriptors.get(&rt_idx) {
|
||||||
|
let footprint = d.key.tile_footprint_with_height(height_tiles);
|
||||||
|
self.tiles.claim(d.key.base_tiles, footprint, rt_idx)
|
||||||
|
} else {
|
||||||
|
0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Single-sample colour key at EDRAM base 0 with an 80-tile pitch
    /// (a 32bpp 1280-wide target), shared by the cache tests.
    fn plain_color_key() -> RenderTargetKey {
        RenderTargetKey {
            base_tiles: 0,
            pitch_tiles_at_32bpp: 80,
            msaa_samples: MsaaSamples::X1,
            is_depth: false,
            resource_format: 0,
        }
    }

    #[test]
    fn render_target_key_pack_roundtrip() {
        let original = RenderTargetKey {
            base_tiles: 1600,
            pitch_tiles_at_32bpp: 80,
            msaa_samples: MsaaSamples::X4,
            is_depth: true,
            resource_format: 0b1010,
        };
        let restored = RenderTargetKey::unpack(original.pack());
        assert_eq!(restored, original);
    }

    #[test]
    fn tile_book_claim_marks_owners() {
        let mut book = EdramTileBook::new();
        assert_eq!(book.occupied_count(), 0);

        // Ten fresh tiles starting at 100: all newly claimed by RT 42.
        assert_eq!(book.claim(100, 10, 42), 10);

        assert_eq!(book.who_owns(100), TileOwner::Host(42));
        assert_eq!(book.who_owns(109), TileOwner::Host(42));
        // One past the claimed range stays untouched.
        assert_eq!(book.who_owns(110), TileOwner::None);
    }

    #[test]
    fn tile_book_claim_demotes_to_shared() {
        let mut book = EdramTileBook::new();
        book.claim(100, 10, 1);
        book.claim(105, 10, 2);

        // 100..105 remain Host(1); the overlap 105..110 becomes Shared(2);
        // 110..115 are fresh Host(2).
        assert_eq!(book.who_owns(104), TileOwner::Host(1));
        assert_eq!(book.who_owns(105), TileOwner::Shared(2));
        assert_eq!(book.who_owns(110), TileOwner::Host(2));
    }

    #[test]
    fn tile_book_release_frees_all() {
        let mut book = EdramTileBook::new();
        book.claim(0, 50, 7);
        book.release(7);
        assert_eq!(book.occupied_count(), 0);
    }

    #[test]
    fn rt_cache_bind_is_idempotent_by_key() {
        let mut cache = RenderTargetCache::new();
        let key = plain_color_key();

        let first = cache.bind(key, 0);
        let second = cache.bind(key, 1);
        assert_eq!(first, second);

        let desc = cache.descriptor(first).unwrap();
        assert_eq!(desc.bind_count, 2);
        assert_eq!(desc.first_draw_index, 0);
    }

    #[test]
    fn rt_cache_claim_tiles_tracks_footprint() {
        let mut cache = RenderTargetCache::new();
        let idx = cache.bind(plain_color_key(), 0);

        // 720 samples tall / 16 per tile = 45 rows → 80 * 45 = 3600 tiles,
        // which caps out at 2048. Verify the clamping.
        let newly_claimed = cache.claim_tiles(idx, 45);
        assert_eq!(newly_claimed, 2048);
        assert_eq!(cache.tiles.occupied_count(), 2048);
    }
}
|
||||||
1260
crates/xenia-gpu/src/resolve.rs
Normal file
1260
crates/xenia-gpu/src/resolve.rs
Normal file
File diff suppressed because it is too large
Load Diff
169
crates/xenia-gpu/src/ring_drain.rs
Normal file
169
crates/xenia-gpu/src/ring_drain.rs
Normal file
@@ -0,0 +1,169 @@
|
|||||||
|
//! Ring-buffer drainer.
|
||||||
|
//!
|
||||||
|
//! Walks a guest PM4 ring buffer from `start_offset` forward, classifying each
//! packet via [`crate::pm4`] and stopping when it either exhausts the packet
//! budget it was given, consumes a `PM4_XE_SWAP`, or hits a header whose
//! declared size is larger than the ring itself.
|
||||||
|
//!
|
||||||
|
//! It does **not** execute draws — that's deferred to a later phase. Its job
|
||||||
|
//! is to (a) advance the read pointer far enough that games keep making
|
||||||
|
//! progress, and (b) surface `PM4_XE_SWAP` packets so `VdSwap` can forward
|
||||||
|
//! them to the host UI.
|
||||||
|
|
||||||
|
use xenia_memory::MemoryAccess;
|
||||||
|
|
||||||
|
use crate::pm4::{self, PacketKind};
|
||||||
|
|
||||||
|
/// What a single [`drain`] call did.
///
/// The `swap_*` fields are only meaningful when `swap_seen` is true;
/// otherwise they keep their default of zero.
#[derive(Default, Debug, Clone, Copy)]
pub struct DrainResult {
    /// Dword offset the walk stopped at, relative to the start of the ring.
    pub new_offset: u32,
    /// Number of packets walked during this call.
    pub packets_walked: u32,
    /// Set when a `PM4_XE_SWAP` packet was encountered.
    pub swap_seen: bool,
    /// Guest frontbuffer *physical* address carried by the swap packet
    /// (dword 2 of the 4-payload packet).
    pub swap_frontbuffer_phys: u32,
    /// Width carried by the swap packet (dword 3).
    pub swap_width: u32,
    /// Height carried by the swap packet (dword 4).
    pub swap_height: u32,
}
|
||||||
|
|
||||||
|
/// Walk `max_packets` packets starting at dword offset `start_offset` in the
|
||||||
|
/// ring buffer at guest address `ring_base` of size `ring_size_dwords`.
|
||||||
|
///
|
||||||
|
/// The offset is treated modulo `ring_size_dwords`. Walking stops when:
|
||||||
|
/// - `max_packets` have been walked,
|
||||||
|
/// - a `PM4_XE_SWAP` has been consumed (the swap is reported and we stop so
|
||||||
|
/// the UI sees the frame boundary before further drain),
|
||||||
|
/// - a header's declared total size would exceed the remaining budget,
|
||||||
|
/// - the ring size is zero (drainer is a no-op).
|
||||||
|
pub fn drain<M: MemoryAccess + ?Sized>(
|
||||||
|
mem: &M,
|
||||||
|
ring_base: u32,
|
||||||
|
ring_size_dwords: u32,
|
||||||
|
start_offset: u32,
|
||||||
|
max_packets: u32,
|
||||||
|
) -> DrainResult {
|
||||||
|
if ring_size_dwords == 0 || ring_base == 0 {
|
||||||
|
return DrainResult::default();
|
||||||
|
}
|
||||||
|
let mut result = DrainResult {
|
||||||
|
new_offset: start_offset % ring_size_dwords,
|
||||||
|
..DrainResult::default()
|
||||||
|
};
|
||||||
|
let mut offset = result.new_offset;
|
||||||
|
for _ in 0..max_packets {
|
||||||
|
let header_addr = ring_base.wrapping_add(offset.wrapping_mul(4));
|
||||||
|
let header = mem.read_u32(header_addr);
|
||||||
|
let packet = pm4::decode(header);
|
||||||
|
// Refuse to walk past the ring in a single packet.
|
||||||
|
if packet.total_dwords > ring_size_dwords {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
// Type-3 PM4_XE_SWAP → record payload and stop.
|
||||||
|
if let PacketKind::Type3 { opcode, .. } = packet.kind
|
||||||
|
&& opcode == pm4::PM4_XE_SWAP {
|
||||||
|
// Payload layout (from canary VdSwap_entry):
|
||||||
|
// [0] XE_SWAP header
|
||||||
|
// [1] kSwapSignature ("XNEX" = 0x584E4558)
|
||||||
|
// [2] frontbuffer physical address
|
||||||
|
// [3] width
|
||||||
|
// [4] height
|
||||||
|
let payload = |i: u32| {
|
||||||
|
let addr =
|
||||||
|
ring_base.wrapping_add(((offset + i) % ring_size_dwords).wrapping_mul(4));
|
||||||
|
mem.read_u32(addr)
|
||||||
|
};
|
||||||
|
result.swap_seen = true;
|
||||||
|
result.swap_frontbuffer_phys = payload(2);
|
||||||
|
result.swap_width = payload(3);
|
||||||
|
result.swap_height = payload(4);
|
||||||
|
offset = (offset + packet.total_dwords) % ring_size_dwords;
|
||||||
|
result.new_offset = offset;
|
||||||
|
result.packets_walked += 1;
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
offset = (offset + packet.total_dwords) % ring_size_dwords;
|
||||||
|
result.new_offset = offset;
|
||||||
|
result.packets_walked += 1;
|
||||||
|
}
|
||||||
|
result
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;
    use xenia_memory::GuestMemory;
    use xenia_memory::page_table::MemoryProtect;

    /// Guest address every test places its ring at.
    const RING_BASE: u32 = 0x4000_0000;
    /// Header value the tests use as a one-dword Type-2 NOP.
    const TYPE2_NOP: u32 = 0x8000_0000;

    /// Guest memory with one read/write 4 KiB region at `RING_BASE`.
    fn build_mem() -> GuestMemory {
        let mut mem = GuestMemory::new().unwrap();
        let protect = MemoryProtect::READ | MemoryProtect::WRITE;
        mem.alloc(RING_BASE, 0x1000, protect).unwrap();
        mem
    }

    fn write_dword(mem: &GuestMemory, addr: u32, val: u32) {
        mem.write_u32(addr, val);
    }

    #[test]
    fn walks_nops_until_budget_exhausted() {
        let mem = build_mem();
        // Fill 10 dwords with Type-2 NOPs, then allow only 5 packets.
        for slot in 0..10u32 {
            write_dword(&mem, RING_BASE + slot * 4, TYPE2_NOP);
        }

        let outcome = drain(&mem, RING_BASE, 0x400, 0, 5);

        assert_eq!(outcome.packets_walked, 5);
        assert_eq!(outcome.new_offset, 5);
        assert!(!outcome.swap_seen);
    }

    #[test]
    fn stops_at_swap_and_reports_payload() {
        let mem = build_mem();
        // MakePacketType3(PM4_XE_SWAP, 4) → (3<<30) | (3<<16) | (0x64<<8)
        let swap_hdr = (3u32 << 30) | (3u32 << 16) | ((pm4::PM4_XE_SWAP as u32) << 8);
        // Two NOPs, then a PM4_XE_SWAP packet.
        let ring_words = [
            TYPE2_NOP,
            TYPE2_NOP,
            swap_hdr,
            pm4::SWAP_SIGNATURE,
            0xDEAD_F000, // frontbuffer phys
            1280,
            720,
        ];
        for (slot, word) in ring_words.iter().enumerate() {
            write_dword(&mem, RING_BASE + (slot as u32) * 4, *word);
        }

        let outcome = drain(&mem, RING_BASE, 0x400, 0, 16);

        assert!(outcome.swap_seen);
        assert_eq!(outcome.swap_frontbuffer_phys, 0xDEAD_F000);
        assert_eq!(outcome.swap_width, 1280);
        assert_eq!(outcome.swap_height, 720);
        assert_eq!(outcome.packets_walked, 3);
        // 2 NOPs (1 dword each) + 5-dword swap = 7
        assert_eq!(outcome.new_offset, 7);
    }

    #[test]
    fn wraps_around_ring() {
        let mem = build_mem();
        // Ring size = 4 dwords, starting at offset 3 (the last dword). A NOP
        // sits there and another at offset 0, so the walker must wrap.
        write_dword(&mem, RING_BASE + 0xC, TYPE2_NOP);
        write_dword(&mem, RING_BASE, TYPE2_NOP);

        let outcome = drain(&mem, RING_BASE, 4, 3, 2);

        assert_eq!(outcome.packets_walked, 2);
        assert_eq!(outcome.new_offset, 1);
    }

    #[test]
    fn zero_ring_size_is_noop() {
        let mem = build_mem();

        let outcome = drain(&mem, RING_BASE, 0, 0, 10);

        assert_eq!(outcome.packets_walked, 0);
        assert_eq!(outcome.new_offset, 0);
        assert!(!outcome.swap_seen);
    }
}
|
||||||
123
crates/xenia-gpu/src/ring_view.rs
Normal file
123
crates/xenia-gpu/src/ring_view.rs
Normal file
@@ -0,0 +1,123 @@
|
|||||||
|
//! Primary ring buffer view.
|
||||||
|
//!
|
||||||
|
//! Games allocate a ring buffer in physical memory (via
|
||||||
|
//! `MmAllocatePhysicalMemoryEx` with WRITE_COMBINE), then hand the base
|
||||||
|
//! address + log2(size) to `VdInitializeRingBuffer`. They subsequently push
|
||||||
|
//! PM4 packets into it, advancing the write-pointer by writing to a GPU
|
||||||
|
//! register (`CP_RB_WPTR`) or via kernel-call shims.
|
||||||
|
//!
|
||||||
|
//! The GPU consumes packets from `read_offset_dwords` up to (but not past)
|
||||||
|
//! the write pointer. After consuming enough bytes it writes `read_offset`
|
||||||
|
//! into the guest-memory address registered by `VdEnableRingBufferRPtrWriteBack`
|
||||||
|
//! so the game can know how much of the ring has been consumed.
|
||||||
|
|
||||||
|
/// Tracks the primary ring buffer as set up by the guest.
///
/// All offsets are in dwords relative to `base`. The fields are public and
/// can be written directly, so the methods below do not assume the offsets
/// are already within `size_dwords`.
#[derive(Debug, Clone, Copy, Default)]
pub struct RingBufferView {
    /// Guest physical/virtual base address. `0` means uninitialized.
    pub base: u32,
    /// Size of the ring in dwords. `0` means uninitialized.
    pub size_dwords: u32,
    /// Dword offset the GPU has consumed up to (relative to `base`).
    pub read_offset_dwords: u32,
    /// Dword offset the guest has last written into (relative to `base`).
    /// Updated either by an MMIO write to `CP_RB_WPTR` or by the kernel
    /// (`VdSwap` is a hint — the game reserves a 64-dword slot in the ring
    /// for it).
    pub write_offset_dwords: u32,
    /// Guest address where we mirror `read_offset_dwords` each time we make
    /// progress. `0` if the game never called `VdEnableRingBufferRPtrWriteBack`.
    pub rptr_writeback_addr: u32,
    /// Write-back block granularity in dwords (from the `log2` arg to
    /// `VdEnableRingBufferRPtrWriteBack`). We always write back eagerly, so
    /// we don't actually use this for scheduling — kept for observability.
    pub rptr_writeback_block_dwords: u32,
}

impl RingBufferView {
    /// Create an uninitialized view (every field zero).
    pub fn new() -> Self {
        Self::default()
    }

    /// True if the guest has provided a base + size.
    pub fn is_initialized(&self) -> bool {
        self.base != 0 && self.size_dwords != 0
    }

    /// True if there is pending unread data to consume.
    pub fn has_pending(&self) -> bool {
        self.is_initialized() && self.read_offset_dwords != self.write_offset_dwords
    }

    /// Number of dwords that can be consumed in one contiguous chunk.
    ///
    /// When the write pointer is at or ahead of the read pointer this is the
    /// distance between them. When the write pointer has wrapped, only the
    /// run up to the end of the ring is reported; the remainder shows up on
    /// the next call once the read pointer has wrapped too. Returns 0 for an
    /// uninitialized view.
    pub fn pending_dwords(&self) -> u32 {
        if !self.is_initialized() {
            return 0;
        }
        if self.write_offset_dwords >= self.read_offset_dwords {
            self.write_offset_dwords - self.read_offset_dwords
        } else {
            // Write has wrapped — we can read up to the end of the ring.
            // Saturate so a read offset that was set beyond the ring size
            // yields 0 instead of underflowing.
            self.size_dwords.saturating_sub(self.read_offset_dwords)
        }
    }

    /// Advance the read pointer by `dwords`, wrapping at `size_dwords`.
    ///
    /// Does nothing while the ring size is zero.
    pub fn advance_read(&mut self, dwords: u32) {
        if self.size_dwords == 0 {
            return;
        }
        self.read_offset_dwords = self.wrapped_offset(dwords);
    }

    /// Guest address of the dword `offset_dwords` past the current read
    /// pointer, wrapping at the ring size. `None` if uninitialized.
    pub fn addr_at_offset(&self, offset_dwords: u32) -> Option<u32> {
        if !self.is_initialized() {
            return None;
        }
        let off = self.wrapped_offset(offset_dwords);
        Some(self.base.wrapping_add(off.wrapping_mul(4)))
    }

    /// `(read_offset_dwords + delta) % size_dwords`, computed in u64 so the
    /// sum cannot overflow for any pair of u32 inputs.
    ///
    /// Callers must have checked that `size_dwords` is non-zero. The result
    /// is always below `size_dwords`, so narrowing back to u32 is lossless.
    fn wrapped_offset(&self, delta: u32) -> u32 {
        let sum = u64::from(self.read_offset_dwords) + u64::from(delta);
        (sum % u64::from(self.size_dwords)) as u32
    }
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Initialized view at the usual test base with the given size and read
    /// offset; every other field stays at its default.
    fn view_at(size_dwords: u32, read_offset_dwords: u32) -> RingBufferView {
        RingBufferView {
            base: 0x4000_0000,
            size_dwords,
            read_offset_dwords,
            ..RingBufferView::new()
        }
    }

    #[test]
    fn uninitialized_view_reports_empty() {
        let view = RingBufferView::new();
        assert!(!view.is_initialized());
        assert!(!view.has_pending());
        assert_eq!(view.pending_dwords(), 0);
    }

    #[test]
    fn wrap_around_arithmetic() {
        let mut view = view_at(16, 14);
        view.write_offset_dwords = 2; // wrapped

        // Only the run up to end-of-ring is readable in one chunk.
        assert_eq!(view.pending_dwords(), 2);
        view.advance_read(2);
        assert_eq!(view.read_offset_dwords, 0);
        // Now unwrapped, 2 more to go.
        assert_eq!(view.pending_dwords(), 2);
    }

    #[test]
    fn addr_at_offset_wraps() {
        let view = view_at(4, 3);
        let expected = [0x4000_000C, 0x4000_0000, 0x4000_0004];
        for (offset, addr) in expected.iter().enumerate() {
            assert_eq!(view.addr_at_offset(offset as u32), Some(*addr));
        }
    }
}
|
||||||
350
crates/xenia-gpu/src/shader_metrics.rs
Normal file
350
crates/xenia-gpu/src/shader_metrics.rs
Normal file
@@ -0,0 +1,350 @@
|
|||||||
|
//! Host-side static analysis over a [`ParsedShader`], emitted once per unique
|
||||||
|
//! shader blob. Produces the observability the plan's P3b/P3c sections call
|
||||||
|
//! for (`gpu.shader.interpret{stage,kind}` + `gpu.shader.reject{reason}`), so
|
||||||
|
//! the HUD can show when a game is reaching ops the WGSL interpreter falls
|
||||||
|
//! back on.
|
||||||
|
//!
|
||||||
|
//! Analysis is intentionally cheap: it scans each exec clause's instruction
|
||||||
|
//! triples, classifies them as ALU / vertex-fetch / texture-fetch using the
|
||||||
|
//! owning clause's sequence bitmap, and bumps counters accordingly. No GPU
|
||||||
|
//! readback is required — `reject` reasons are inferred from opcode values
|
||||||
|
//! alone.
|
||||||
|
|
||||||
|
use metrics::counter;
|
||||||
|
|
||||||
|
use crate::ucode::alu::{decode_alu, sop, vop};
|
||||||
|
use crate::ucode::control_flow::ControlFlowInstruction;
|
||||||
|
use crate::ucode::fetch::{FetchInstruction, decode_fetch};
|
||||||
|
use crate::ucode::ParsedShader;
|
||||||
|
|
||||||
|
/// Walk `parsed` once and emit `gpu.shader.interpret` + `gpu.shader.reject`
|
||||||
|
/// counters. `stage` should be `"vs"` or `"ps"`.
|
||||||
|
pub fn emit_for(parsed: &ParsedShader, stage: &'static str) {
|
||||||
|
let mut alu_count: u64 = 0;
|
||||||
|
let mut vfetch_count: u64 = 0;
|
||||||
|
let mut tfetch_count: u64 = 0;
|
||||||
|
let mut rejects: Vec<(&'static str, u64)> = Vec::new();
|
||||||
|
|
||||||
|
let mut features: Vec<&'static str> = Vec::new();
|
||||||
|
for clause in &parsed.cf {
|
||||||
|
match clause {
|
||||||
|
ControlFlowInstruction::Exec {
|
||||||
|
address,
|
||||||
|
count,
|
||||||
|
sequence,
|
||||||
|
..
|
||||||
|
} => {
|
||||||
|
for i in 0..(*count as usize) {
|
||||||
|
let triple_idx = *address as usize + i;
|
||||||
|
let base = triple_idx * 3;
|
||||||
|
if base + 2 >= parsed.instructions.len() {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
let words = [
|
||||||
|
parsed.instructions[base],
|
||||||
|
parsed.instructions[base + 1],
|
||||||
|
parsed.instructions[base + 2],
|
||||||
|
];
|
||||||
|
// sequence bit layout: 2 bits per triple, hi bit = is-fetch.
|
||||||
|
let is_fetch = ((sequence >> (i * 2 + 1)) & 1) != 0;
|
||||||
|
if is_fetch {
|
||||||
|
match decode_fetch(words) {
|
||||||
|
FetchInstruction::Vertex(_) => vfetch_count += 1,
|
||||||
|
FetchInstruction::Texture(tf) => {
|
||||||
|
tfetch_count += 1;
|
||||||
|
match tf.dimension {
|
||||||
|
0 => mark_feature(&mut features, "tfetch_1d"),
|
||||||
|
2 => mark_feature(&mut features, "tfetch_3d"),
|
||||||
|
3 => mark_feature(&mut features, "tfetch_cube"),
|
||||||
|
_ => {}
|
||||||
|
}
|
||||||
|
if tf.dimension != 1 {
|
||||||
|
bump(&mut rejects, "texfetch_dimension");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
FetchInstruction::Unknown { .. } => {
|
||||||
|
bump(&mut rejects, "fetch_unknown");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
alu_count += 1;
|
||||||
|
let alu = decode_alu(words);
|
||||||
|
if !vec_op_supported(alu.vector_opcode) {
|
||||||
|
bump(&mut rejects, "alu_vec_unsupported");
|
||||||
|
}
|
||||||
|
if !scl_op_supported(alu.scalar_opcode) {
|
||||||
|
bump(&mut rejects, "alu_scl_unsupported");
|
||||||
|
}
|
||||||
|
// Feature-of-interest detection for future phases.
|
||||||
|
// Transcendentals + kill + setp + cube/max4 are the
|
||||||
|
// high-value signals: they tell us which of the
|
||||||
|
// deferred capabilities Sylpheed actually exercises.
|
||||||
|
match alu.vector_opcode {
|
||||||
|
v if v == vop::CUBE => mark_feature(&mut features, "vec_cube"),
|
||||||
|
v if v == vop::MAX4 => mark_feature(&mut features, "vec_max4"),
|
||||||
|
v if v == vop::KILL_EQ
|
||||||
|
|| v == vop::KILL_GT
|
||||||
|
|| v == vop::KILL_GE
|
||||||
|
|| v == vop::KILL_NE =>
|
||||||
|
{
|
||||||
|
mark_feature(&mut features, "vec_kill");
|
||||||
|
}
|
||||||
|
v if v == vop::CND_EQ
|
||||||
|
|| v == vop::CND_GE
|
||||||
|
|| v == vop::CND_GT =>
|
||||||
|
{
|
||||||
|
mark_feature(&mut features, "vec_cnd");
|
||||||
|
}
|
||||||
|
_ => {}
|
||||||
|
}
|
||||||
|
match alu.scalar_opcode {
|
||||||
|
s if s == sop::EXP
|
||||||
|
|| s == sop::LOG
|
||||||
|
|| s == sop::LOGC
|
||||||
|
|| s == sop::SIN
|
||||||
|
|| s == sop::COS =>
|
||||||
|
{
|
||||||
|
mark_feature(&mut features, "scl_transcendental");
|
||||||
|
}
|
||||||
|
s if s == sop::RSQ
|
||||||
|
|| s == sop::RSQC
|
||||||
|
|| s == sop::RSQF
|
||||||
|
|| s == sop::SQRT =>
|
||||||
|
{
|
||||||
|
mark_feature(&mut features, "scl_sqrt_family");
|
||||||
|
}
|
||||||
|
s if s == sop::SETP_EQ
|
||||||
|
|| s == sop::SETP_NE
|
||||||
|
|| s == sop::SETP_GT
|
||||||
|
|| s == sop::SETP_GE
|
||||||
|
|| s == sop::SETP_INV
|
||||||
|
|| s == sop::SETP_POP
|
||||||
|
|| s == sop::SETP_CLR
|
||||||
|
|| s == sop::SETP_RSTR =>
|
||||||
|
{
|
||||||
|
mark_feature(&mut features, "scl_setp");
|
||||||
|
}
|
||||||
|
s if s == sop::KILLS_EQ
|
||||||
|
|| s == sop::KILLS_GT
|
||||||
|
|| s == sop::KILLS_GE
|
||||||
|
|| s == sop::KILLS_NE
|
||||||
|
|| s == sop::KILLS_ONE =>
|
||||||
|
{
|
||||||
|
mark_feature(&mut features, "scl_kills");
|
||||||
|
}
|
||||||
|
_ => {}
|
||||||
|
}
|
||||||
|
if alu.predicated {
|
||||||
|
mark_feature(&mut features, "alu_predicated");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
ControlFlowInstruction::LoopStart { .. }
|
||||||
|
| ControlFlowInstruction::LoopEnd { .. } => {
|
||||||
|
mark_feature(&mut features, "cf_loop");
|
||||||
|
bump(&mut rejects, "cf_loop");
|
||||||
|
}
|
||||||
|
ControlFlowInstruction::CondJmp { .. } => {
|
||||||
|
mark_feature(&mut features, "cf_cond_jmp");
|
||||||
|
bump(&mut rejects, "cf_cond_jmp");
|
||||||
|
}
|
||||||
|
ControlFlowInstruction::CondCall { .. } | ControlFlowInstruction::Return => {
|
||||||
|
mark_feature(&mut features, "cf_call_return");
|
||||||
|
bump(&mut rejects, "cf_call_return");
|
||||||
|
}
|
||||||
|
ControlFlowInstruction::Unknown { .. } => {
|
||||||
|
bump(&mut rejects, "cf_unknown");
|
||||||
|
}
|
||||||
|
_ => {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
counter!("gpu.shader.interpret", "stage" => stage, "kind" => "alu")
|
||||||
|
.increment(alu_count);
|
||||||
|
counter!("gpu.shader.interpret", "stage" => stage, "kind" => "vfetch")
|
||||||
|
.increment(vfetch_count);
|
||||||
|
counter!("gpu.shader.interpret", "stage" => stage, "kind" => "tfetch")
|
||||||
|
.increment(tfetch_count);
|
||||||
|
for (reason, n) in rejects {
|
||||||
|
counter!("gpu.shader.reject", "stage" => stage, "reason" => reason).increment(n);
|
||||||
|
}
|
||||||
|
for name in features {
|
||||||
|
counter!("gpu.feature.used", "stage" => stage, "name" => name).increment(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Record `name` in `buf` unless it is already there, so each feature
/// appears at most once and in first-seen order.
fn mark_feature(buf: &mut Vec<&'static str>, name: &'static str) {
    let already_recorded = buf.iter().any(|seen| *seen == name);
    if already_recorded {
        return;
    }
    buf.push(name);
}
|
||||||
|
|
||||||
|
/// Add one to the tally for `reason`, appending a new `(reason, 1)` entry
/// the first time that reason is seen.
fn bump(buf: &mut Vec<(&'static str, u64)>, reason: &'static str) {
    match buf.iter_mut().find(|(label, _)| *label == reason) {
        Some((_, tally)) => *tally += 1,
        None => buf.push((reason, 1)),
    }
}
|
||||||
|
|
||||||
|
fn vec_op_supported(op: u8) -> bool {
|
||||||
|
matches!(
|
||||||
|
op,
|
||||||
|
vop::ADD
|
||||||
|
| vop::MUL
|
||||||
|
| vop::MAX
|
||||||
|
| vop::MIN
|
||||||
|
| vop::SEQ
|
||||||
|
| vop::SGT
|
||||||
|
| vop::SGE
|
||||||
|
| vop::SNE
|
||||||
|
| vop::FRC
|
||||||
|
| vop::TRUNC
|
||||||
|
| vop::FLOOR
|
||||||
|
| vop::MAD
|
||||||
|
| vop::CND_EQ
|
||||||
|
| vop::CND_GE
|
||||||
|
| vop::CND_GT
|
||||||
|
| vop::DOT4
|
||||||
|
| vop::DOT3
|
||||||
|
| vop::DOT2_ADD
|
||||||
|
| vop::MAX4
|
||||||
|
| vop::KILL_EQ
|
||||||
|
| vop::KILL_GT
|
||||||
|
| vop::KILL_GE
|
||||||
|
| vop::KILL_NE
|
||||||
|
| vop::DST
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn scl_op_supported(op: u8) -> bool {
|
||||||
|
matches!(
|
||||||
|
op,
|
||||||
|
sop::ADDS
|
||||||
|
| sop::ADDS_PREV
|
||||||
|
| sop::MULS
|
||||||
|
| sop::MULS_PREV
|
||||||
|
| sop::MAXS
|
||||||
|
| sop::MINS
|
||||||
|
| sop::SEQS
|
||||||
|
| sop::SGTS
|
||||||
|
| sop::SGES
|
||||||
|
| sop::SNES
|
||||||
|
| sop::FRCS
|
||||||
|
| sop::TRUNCS
|
||||||
|
| sop::FLOORS
|
||||||
|
| sop::EXP
|
||||||
|
| sop::LOG
|
||||||
|
| sop::LOGC
|
||||||
|
| sop::RCP
|
||||||
|
| sop::RCPC
|
||||||
|
| sop::RCPF
|
||||||
|
| sop::RSQ
|
||||||
|
| sop::RSQC
|
||||||
|
| sop::RSQF
|
||||||
|
| sop::SQRT
|
||||||
|
| sop::SUBS
|
||||||
|
| sop::SUBS_PREV
|
||||||
|
| sop::SETP_EQ
|
||||||
|
| sop::SETP_NE
|
||||||
|
| sop::SETP_GT
|
||||||
|
| sop::SETP_GE
|
||||||
|
| sop::SETP_INV
|
||||||
|
| sop::SETP_POP
|
||||||
|
| sop::SETP_CLR
|
||||||
|
| sop::SETP_RSTR
|
||||||
|
| sop::KILLS_EQ
|
||||||
|
| sop::KILLS_GT
|
||||||
|
| sop::KILLS_GE
|
||||||
|
| sop::KILLS_NE
|
||||||
|
| sop::KILLS_ONE
|
||||||
|
| sop::SIN
|
||||||
|
| sop::COS
|
||||||
|
| sop::RETAIN_PREV
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;
    use crate::ucode::alu::{sop, vop};
    use crate::ucode::control_flow::ControlFlowInstruction;

    /// Third word of an ALU triple as these tests build it: vector opcode
    /// in the low bits, scalar opcode shifted left by 6, and `0xF` at bit 12.
    fn alu_word(vec_op: u8, scl_op: u8) -> u32 {
        (vec_op as u32) | ((scl_op as u32) << 6) | (0xF << 12)
    }

    /// Build a minimal `ParsedShader` with one `Exec` clause containing two
    /// ALU triples and check the `alu` counter path runs.
    #[test]
    fn emit_for_runs_on_synthetic_shader() {
        let add_word = alu_word(vop::ADD, sop::ADDS);
        let shader = ParsedShader {
            cf: vec![
                ControlFlowInstruction::Exec {
                    address: 0,
                    count: 2,
                    sequence: 0, // all ALU (no is-fetch bits)
                    is_end: false,
                    predicated: false,
                    predicate_condition: false,
                },
                ControlFlowInstruction::Exit,
            ],
            instructions: vec![0, 0, add_word, 0, 0, add_word],
        };
        // Smoke test only: counters are validated via metrics exporters
        // elsewhere; here we just require that a well-formed ParsedShader
        // does not panic.
        emit_for(&shader, "vs");
    }

    /// P8: a shader containing `LoopStart` should mark `cf_loop` as used
    /// so the HUD can surface which deferred feature a game triggers.
    #[test]
    fn feature_detection_flags_loops_and_kills() {
        let kill_word = alu_word(vop::KILL_EQ, sop::RETAIN_PREV);
        let shader = ParsedShader {
            cf: vec![
                ControlFlowInstruction::LoopStart {
                    address: 0,
                    loop_id: 0,
                },
                ControlFlowInstruction::Exec {
                    address: 0,
                    count: 1,
                    sequence: 0,
                    is_end: true,
                    predicated: false,
                    predicate_condition: false,
                },
            ],
            instructions: vec![0, 0, kill_word],
        };
        // Smoke: emits cleanly.
        emit_for(&shader, "ps");
    }

    #[test]
    fn unsupported_ops_classified_as_rejects() {
        // Opcode 63 is outside our supported sets for both pipes.
        let bogus_word = alu_word(63, 63);
        let shader = ParsedShader {
            cf: vec![ControlFlowInstruction::Exec {
                address: 0,
                count: 1,
                sequence: 0,
                is_end: true,
                predicated: false,
                predicate_condition: false,
            }],
            instructions: vec![0, 0, bogus_word],
        };
        // Smoke again — but first confirm the static tables reject op 63.
        assert!(!vec_op_supported(63));
        assert!(!scl_op_supported(63));
        emit_for(&shader, "ps");
    }
}
|
||||||
36
crates/xenia-gpu/src/shaders/mod.rs
Normal file
36
crates/xenia-gpu/src/shaders/mod.rs
Normal file
@@ -0,0 +1,36 @@
|
|||||||
|
//! Embedded WGSL shader sources used by the host pipeline.
|
||||||
|
|
||||||
|
/// Xenos uber-shader scaffold (P3). See the comment at the top of
|
||||||
|
/// [`XENOS_INTERP_WGSL`] for scope/limits and the plan's P3b target state.
|
||||||
|
pub const XENOS_INTERP_WGSL: &str = include_str!("xenos_interp.wgsl");
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Running the embedded source through naga's WGSL front end catches
    /// typos and layout mistakes without building a full pipeline, so
    /// regressions surface quickly at `cargo test` time.
    #[test]
    fn xenos_interp_wgsl_parses() {
        let module = naga::front::wgsl::parse_str(XENOS_INTERP_WGSL)
            .expect("xenos_interp.wgsl must parse cleanly");

        // Sanity: there is at least one entry point, and both the vertex
        // and the fragment entry points are present under their names.
        assert!(!module.entry_points.is_empty());
        let has_entry = |name: &str, stage: naga::ShaderStage| {
            module
                .entry_points
                .iter()
                .any(|entry| entry.name == name && entry.stage == stage)
        };
        assert!(
            has_entry("vs_main", naga::ShaderStage::Vertex),
            "missing vs_main entry"
        );
        assert!(
            has_entry("fs_main", naga::ShaderStage::Fragment),
            "missing fs_main entry"
        );
    }
}
|
||||||
974
crates/xenia-gpu/src/shaders/xenos_interp.wgsl
Normal file
974
crates/xenia-gpu/src/shaders/xenos_interp.wgsl
Normal file
@@ -0,0 +1,974 @@
|
|||||||
|
// xenia-rs Xenos runtime microcode interpreter — P3b WGSL.
|
||||||
|
//
|
||||||
|
// Bindings (stable across P3b milestones):
|
||||||
|
// @group(0) @binding(0) draw_ctx (uniform, 16 B — XenosDrawConstants)
|
||||||
|
//   @group(0) @binding(1)  xenos_consts  (storage<read>, ~9.2 KB — XenosConstants)
|
||||||
|
// @group(0) @binding(2) vs_ucode (storage<read>, packed VS shader)
|
||||||
|
// @group(0) @binding(3) ps_ucode (storage<read>, packed PS shader)
|
||||||
|
// @group(0) @binding(4) vertex_buffer (storage<read>, raw guest VB dwords)
|
||||||
|
//
|
||||||
|
// Packed shader layout (both vs_ucode & ps_ucode):
|
||||||
|
// [0] = cf_count
|
||||||
|
// [1 .. 1 + cf_count*3] = CF table: (kind, primary, aux) × cf_count
|
||||||
|
// [1 + cf_count*3 ..] = instruction triples (3 dwords each)
|
||||||
|
//
|
||||||
|
// State of this file: the CF walker, operand-decode helpers and register
// file are in place, along with an MVP ALU interpreter, vertex fetch for
// the common formats and a placeholder texture fetch. Draws whose shader
// never writes the export slots fall back to the procedural-circle
// visualisation.
|
||||||
|
|
||||||
|
struct XenosDrawConstants {
|
||||||
|
draw_index: u32,
|
||||||
|
vertex_count: u32,
|
||||||
|
prim_kind: u32,
|
||||||
|
_pad: u32,
|
||||||
|
};
|
||||||
|
|
||||||
|
struct XenosConstants {
|
||||||
|
alu: array<vec4<f32>, 512>,
|
||||||
|
fetch: array<u32, 256>,
|
||||||
|
bool_consts: array<u32, 8>,
|
||||||
|
loop_consts: array<u32, 32>,
|
||||||
|
};
|
||||||
|
|
||||||
|
@group(0) @binding(0) var<uniform> draw_ctx : XenosDrawConstants;
|
||||||
|
// `xenos_consts` is a read-only storage buffer (not uniform) because the
|
||||||
|
// block contains tightly-packed `array<u32, N>` fields — WGSL's uniform
|
||||||
|
// address space requires 16-byte element stride, which would triple the
|
||||||
|
// allocation; storage accepts the natural 4-byte stride.
|
||||||
|
@group(0) @binding(1) var<storage, read> xenos_consts : XenosConstants;
|
||||||
|
@group(0) @binding(2) var<storage, read> vs_ucode : array<u32>;
|
||||||
|
@group(0) @binding(3) var<storage, read> ps_ucode : array<u32>;
|
||||||
|
@group(0) @binding(4) var<storage, read> vertex_buffer : array<u32>;
|
||||||
|
|
||||||
|
// M6 — texture fetch stub. Group 1 is a single 1×1 magenta placeholder for
|
||||||
|
// all texture slots; the P5 texture cache will replace this with per-slot
|
||||||
|
// bindings.
|
||||||
|
@group(1) @binding(0) var xenos_tex : texture_2d<f32>;
|
||||||
|
@group(1) @binding(1) var xenos_samp : sampler;
|
||||||
|
|
||||||
|
// ── CF kind codes; must match `xenia_gpu::ucode::cf_kind`. ─────────────
|
||||||
|
const CF_KIND_EXEC: u32 = 0u;
|
||||||
|
const CF_KIND_EXEC_END: u32 = 1u;
|
||||||
|
const CF_KIND_ALLOC: u32 = 2u;
|
||||||
|
const CF_KIND_EXIT: u32 = 3u;
|
||||||
|
const CF_KIND_LOOP_START: u32 = 4u;
|
||||||
|
const CF_KIND_LOOP_END: u32 = 5u;
|
||||||
|
const CF_KIND_COND_JMP: u32 = 6u;
|
||||||
|
const CF_KIND_COND_CALL: u32 = 7u;
|
||||||
|
const CF_KIND_RETURN: u32 = 8u;
|
||||||
|
const CF_KIND_UNKNOWN: u32 = 15u;
|
||||||
|
|
||||||
|
// ── Alloc-kind codes (mirrors `xenia_gpu::ucode::cf_alloc_kind`). ──────
|
||||||
|
const ALLOC_KIND_POSITION: u32 = 0u;
|
||||||
|
const ALLOC_KIND_INTERPOLATORS: u32 = 1u;
|
||||||
|
const ALLOC_KIND_COLORS: u32 = 2u;
|
||||||
|
|
||||||
|
// Per-invocation Xenos register file + scalar `ps` + predicate.
|
||||||
|
var<private> registers: array<vec4<f32>, 128>;
|
||||||
|
var<private> ps: f32;
|
||||||
|
var<private> predicate: bool;
|
||||||
|
|
||||||
|
// Currently-active export alloc kind; set by Alloc clauses.
|
||||||
|
var<private> current_alloc: u32;
|
||||||
|
|
||||||
|
// P3c additions:
|
||||||
|
// `kill_flag` — set by vector/scalar kill ops; PS calls `discard` after the
|
||||||
|
// interpreter exits. (`discard` inside a helper function is
|
||||||
|
// allowed in WGSL, but keeping it at the entry level makes
|
||||||
|
// control flow easier to read.)
|
||||||
|
// `loop_depth`/`loop_counters` — bookkeeping for LoopStart/LoopEnd CF
|
||||||
|
// clauses. Xenos supports up to 4 nested loops.
|
||||||
|
// `reject_mask` — bitfield of op categories we failed to interpret, so the
|
||||||
|
// PS fallback color + host-side diagnostics can surface it.
|
||||||
|
var<private> kill_flag: bool;
|
||||||
|
var<private> loop_depth: u32;
|
||||||
|
var<private> loop_counters: array<u32, 4>;
|
||||||
|
var<private> loop_starts: array<u32, 4>;
|
||||||
|
var<private> reject_mask: u32;
|
||||||
|
|
||||||
|
const REJECT_ALU_VEC: u32 = 1u;
|
||||||
|
const REJECT_ALU_SCL: u32 = 2u;
|
||||||
|
const REJECT_TEX_NON2D: u32 = 4u;
|
||||||
|
const REJECT_VFETCH_FMT: u32 = 8u;
|
||||||
|
const REJECT_CF_JUMP: u32 = 16u;
|
||||||
|
const REJECT_CF_CALL: u32 = 32u;
|
||||||
|
const REJECT_LOOP_OVERFLOW:u32 = 64u;
|
||||||
|
|
||||||
|
struct VsOut {
|
||||||
|
@builtin(position) position: vec4<f32>,
|
||||||
|
@location(0) color: vec4<f32>,
|
||||||
|
};
|
||||||
|
|
||||||
|
struct FsOut {
|
||||||
|
@location(0) color0: vec4<f32>,
|
||||||
|
};
|
||||||
|
|
||||||
|
// ── CF helpers: read (kind,primary,aux) from a packed ucode storage buffer.
|
||||||
|
|
||||||
|
// Number of CF entries; stored in dword 0 of each packed shader.
fn vs_cf_count() -> u32 {
    return vs_ucode[0];
}
fn ps_cf_count() -> u32 {
    return ps_ucode[0];
}

// CF entry `i` occupies three consecutive dwords starting at `1 + i * 3`:
// kind (low 8 bits are significant), primary, aux.
fn vs_cf_kind(i: u32) -> u32 {
    let entry = 1u + i * 3u;
    return vs_ucode[entry] & 0xFFu;
}
fn vs_cf_primary(i: u32) -> u32 {
    let entry = 1u + i * 3u;
    return vs_ucode[entry + 1u];
}
fn vs_cf_aux(i: u32) -> u32 {
    let entry = 1u + i * 3u;
    return vs_ucode[entry + 2u];
}
fn ps_cf_kind(i: u32) -> u32 {
    let entry = 1u + i * 3u;
    return ps_ucode[entry] & 0xFFu;
}
fn ps_cf_primary(i: u32) -> u32 {
    let entry = 1u + i * 3u;
    return ps_ucode[entry + 1u];
}
fn ps_cf_aux(i: u32) -> u32 {
    let entry = 1u + i * 3u;
    return ps_ucode[entry + 2u];
}

// Dword index of the first instruction triple: just past the count dword
// and the CF table.
fn vs_instr_base() -> u32 {
    let cf_table_dwords = vs_cf_count() * 3u;
    return 1u + cf_table_dwords;
}
fn ps_instr_base() -> u32 {
    let cf_table_dwords = ps_cf_count() * 3u;
    return 1u + cf_table_dwords;
}
|
||||||
|
|
||||||
|
// Return dword `which` (0, 1 or 2) of the instruction triple at triple
// index `t`, counted from the start of the instruction area.
fn vs_instr_dword(t: u32, which: u32) -> u32 {
    let triple_start = vs_instr_base() + t * 3u;
    return vs_ucode[triple_start + which];
}
fn ps_instr_dword(t: u32, which: u32) -> u32 {
    let triple_start = ps_instr_base() + t * 3u;
    return ps_ucode[triple_start + which];
}
|
||||||
|
|
||||||
|
// ── Operand helpers: M3 scaffold; M4 wires them into interpret_alu.
|
||||||
|
|
||||||
|
// Rearrange the lanes of `v`. `swizzle` packs four 2-bit source-lane
// selectors, lowest bits first, for output lanes x, y, z, w.
fn apply_swizzle(v: vec4<f32>, swizzle: u32) -> vec4<f32> {
    return vec4<f32>(
        v[swizzle & 3u],
        v[(swizzle >> 2u) & 3u],
        v[(swizzle >> 4u) & 3u],
        v[(swizzle >> 6u) & 3u],
    );
}
|
||||||
|
|
||||||
|
// Apply the optional source modifiers to `v`: absolute value first, then
// negation, so both flags together yield -|v|.
fn apply_modifiers(v: vec4<f32>, negate: bool, take_abs: bool) -> vec4<f32> {
    let magnitude = select(v, abs(v), take_abs);
    return select(magnitude, -magnitude, negate);
}
|
||||||
|
|
||||||
|
// Store `value` into register `idx`, touching only the lanes enabled in
// `mask` (bit 0 = x, bit 1 = y, bit 2 = z, bit 3 = w). Indices outside the
// 128-entry register file are ignored.
fn write_reg_masked(idx: u32, mask: u32, value: vec4<f32>) {
    if idx < 128u {
        let lanes = vec4<bool>(
            (mask & 1u) != 0u,
            (mask & 2u) != 0u,
            (mask & 4u) != 0u,
            (mask & 8u) != 0u,
        );
        // Per-lane pick: new value where the mask bit is set, old otherwise.
        registers[idx] = select(registers[idx], value, lanes);
    }
}
|
||||||
|
|
||||||
|
// ── Xenos ALU opcodes. Values match canary's `AluVectorOpcode` and
|
||||||
|
// `AluScalarOpcode` enums in `ucode.h:1001,1354` (also mirrored in
|
||||||
|
// `xenia_gpu::ucode::alu::{vop,sop}`).
|
||||||
|
const VOP_ADD: u32 = 0u;
|
||||||
|
const VOP_MUL: u32 = 1u;
|
||||||
|
const VOP_MAX: u32 = 2u;
|
||||||
|
const VOP_MIN: u32 = 3u;
|
||||||
|
const VOP_SEQ: u32 = 4u;
|
||||||
|
const VOP_SGT: u32 = 5u;
|
||||||
|
const VOP_SGE: u32 = 6u;
|
||||||
|
const VOP_SNE: u32 = 7u;
|
||||||
|
const VOP_FRC: u32 = 8u;
|
||||||
|
const VOP_TRUNC: u32 = 9u;
|
||||||
|
const VOP_FLOOR: u32 = 10u;
|
||||||
|
const VOP_MAD: u32 = 11u;
|
||||||
|
const VOP_CND_EQ: u32 = 12u;
|
||||||
|
const VOP_CND_GE: u32 = 13u;
|
||||||
|
const VOP_CND_GT: u32 = 14u;
|
||||||
|
const VOP_DP4: u32 = 15u;
|
||||||
|
const VOP_DP3: u32 = 16u;
|
||||||
|
const VOP_DP2_ADD: u32 = 17u;
|
||||||
|
const VOP_CUBE: u32 = 18u;
|
||||||
|
const VOP_MAX4: u32 = 19u;
|
||||||
|
const VOP_KILL_EQ: u32 = 24u;
|
||||||
|
const VOP_KILL_GT: u32 = 25u;
|
||||||
|
const VOP_KILL_GE: u32 = 26u;
|
||||||
|
const VOP_KILL_NE: u32 = 27u;
|
||||||
|
const VOP_DST: u32 = 28u;
|
||||||
|
const VOP_MAX_A: u32 = 29u;
|
||||||
|
|
||||||
|
const SOP_ADDS: u32 = 0u;
|
||||||
|
const SOP_ADDS_PREV: u32 = 1u;
|
||||||
|
const SOP_MULS: u32 = 2u;
|
||||||
|
const SOP_MULS_PREV: u32 = 3u;
|
||||||
|
const SOP_MAXS: u32 = 5u;
|
||||||
|
const SOP_MINS: u32 = 6u;
|
||||||
|
const SOP_SEQS: u32 = 7u;
|
||||||
|
const SOP_SGTS: u32 = 8u;
|
||||||
|
const SOP_SGES: u32 = 9u;
|
||||||
|
const SOP_SNES: u32 = 10u;
|
||||||
|
const SOP_FRCS: u32 = 11u;
|
||||||
|
const SOP_TRUNCS: u32 = 12u;
|
||||||
|
const SOP_FLOORS: u32 = 13u;
|
||||||
|
const SOP_EXP: u32 = 14u;
|
||||||
|
const SOP_LOGC: u32 = 15u;
|
||||||
|
const SOP_LOG: u32 = 16u;
|
||||||
|
const SOP_RCPC: u32 = 17u;
|
||||||
|
const SOP_RCPF: u32 = 18u;
|
||||||
|
const SOP_RCP: u32 = 19u;
|
||||||
|
const SOP_RSQC: u32 = 20u;
|
||||||
|
const SOP_RSQF: u32 = 21u;
|
||||||
|
const SOP_RSQ: u32 = 22u;
|
||||||
|
const SOP_SUBS: u32 = 25u;
|
||||||
|
const SOP_SUBS_PREV: u32 = 26u;
|
||||||
|
const SOP_SETP_EQ: u32 = 27u;
|
||||||
|
const SOP_SETP_NE: u32 = 28u;
|
||||||
|
const SOP_SETP_GT: u32 = 29u;
|
||||||
|
const SOP_SETP_GE: u32 = 30u;
|
||||||
|
const SOP_SETP_INV: u32 = 31u;
|
||||||
|
const SOP_SETP_POP: u32 = 32u;
|
||||||
|
const SOP_SETP_CLR: u32 = 33u;
|
||||||
|
const SOP_SETP_RSTR: u32 = 34u;
|
||||||
|
const SOP_KILLS_EQ: u32 = 35u;
|
||||||
|
const SOP_KILLS_GT: u32 = 36u;
|
||||||
|
const SOP_KILLS_GE: u32 = 37u;
|
||||||
|
const SOP_KILLS_NE: u32 = 38u;
|
||||||
|
const SOP_KILLS_ONE: u32 = 39u;
|
||||||
|
const SOP_SQRT: u32 = 40u;
|
||||||
|
const SOP_SIN: u32 = 48u;
|
||||||
|
const SOP_COS: u32 = 49u;
|
||||||
|
const SOP_RETAIN_PREV: u32 = 50u;
|
||||||
|
|
||||||
|
// Fetch a vec4 source operand. For the MVP the operand index is treated as
// a plain r# reference wrapped into the 128-entry register file; the
// constant selector, swizzle and modifiers are not decoded here yet.
fn read_src(idx: u32) -> vec4<f32> {
    let slot = idx % 128u;
    return registers[slot];
}
|
||||||
|
|
||||||
|
// Evaluate one vector-pipe opcode on operands `a`, `b`, `c` and return the
// vec4 result.
//
// Side effects: the KILL_* opcodes set `kill_flag` when any lane satisfies
// the comparison; unsupported or approximated opcodes OR `REJECT_ALU_VEC`
// into `reject_mask`.
fn exec_vector_op(op: u32, a: vec4<f32>, b: vec4<f32>, c: vec4<f32>) -> vec4<f32> {
    let zeros = vec4<f32>(0.0);
    let ones = vec4<f32>(1.0);
    // Cases that do not assign leave the all-zero default in place.
    var result = zeros;
    switch op {
        case VOP_ADD: { result = a + b; }
        case VOP_MUL: { result = a * b; }
        case VOP_MAX: { result = max(a, b); }
        case VOP_MIN: { result = min(a, b); }
        // Set-on-compare: 1.0 in each lane where the comparison holds.
        case VOP_SEQ: { result = select(zeros, ones, a == b); }
        case VOP_SGT: { result = select(zeros, ones, a > b); }
        case VOP_SGE: { result = select(zeros, ones, a >= b); }
        case VOP_SNE: { result = select(zeros, ones, a != b); }
        case VOP_FRC: { result = fract(a); }
        case VOP_TRUNC: { result = trunc(a); }
        case VOP_FLOOR: { result = floor(a); }
        case VOP_MAD: { result = a * b + c; }
        // Conditional select per lane: src0 tested against 0 picks src1,
        // otherwise src2.
        case VOP_CND_EQ: { result = select(c, b, a == zeros); }
        case VOP_CND_GE: { result = select(c, b, a >= zeros); }
        case VOP_CND_GT: { result = select(c, b, a > zeros); }
        // Dot products broadcast the scalar to all four lanes.
        case VOP_DP4: { result = vec4<f32>(dot(a, b)); }
        case VOP_DP3: { result = vec4<f32>(dot(a.xyz, b.xyz)); }
        case VOP_DP2_ADD: { result = vec4<f32>(a.x * b.x + a.y * b.y + c.x); }
        case VOP_MAX4: { result = vec4<f32>(max(max(a.x, a.y), max(a.z, a.w))); }
        // Kill ops: flag the pixel and return all-ones if any lane matches.
        case VOP_KILL_EQ: {
            if any(a == b) {
                kill_flag = true;
                result = ones;
            }
        }
        case VOP_KILL_GT: {
            if any(a > b) {
                kill_flag = true;
                result = ones;
            }
        }
        case VOP_KILL_GE: {
            if any(a >= b) {
                kill_flag = true;
                result = ones;
            }
        }
        case VOP_KILL_NE: {
            if any(a != b) {
                kill_flag = true;
                result = ones;
            }
        }
        case VOP_DST: {
            // dest = (1, src0.y * src1.y, src0.z, src1.w)
            result = vec4<f32>(1.0, a.y * b.y, a.z, b.w);
        }
        case VOP_CUBE, VOP_MAX_A: {
            // Not implemented: both are approximated with a plain max()
            // and reported through the reject mask.
            reject_mask |= REJECT_ALU_VEC;
            result = max(a, b);
        }
        default: {
            // Unsupported opcode: zero result plus diagnostic flag.
            reject_mask |= REJECT_ALU_VEC;
        }
    }
    return result;
}
|
||||||
|
|
||||||
|
// Evaluate one scalar-pipe opcode and return the new scalar result.
//   `src_a` / `src_b` — scalar sources; only the binary ops
//                        (ADDS/MULS/MAXS/MINS/SUBS) read `src_b`.
//   `prev`            — current value of the `ps` chain.
// Side effects: SETP_* ops write `predicate`, KILLS_* ops may set
// `kill_flag`, and unknown opcodes OR `REJECT_ALU_SCL` into `reject_mask`.
fn exec_scalar_op(op: u32, src_a: f32, src_b: f32, prev: f32) -> f32 {
    // Cases that do not assign leave the 0.0 default in place.
    var result: f32 = 0.0;
    switch op {
        case SOP_ADDS: { result = src_a + src_b; }
        case SOP_ADDS_PREV: { result = src_a + prev; }
        case SOP_MULS: { result = src_a * src_b; }
        case SOP_MULS_PREV: { result = src_a * prev; }
        case SOP_SUBS: { result = src_a - src_b; }
        case SOP_SUBS_PREV: { result = src_a - prev; }
        case SOP_MAXS: { result = max(src_a, src_b); }
        case SOP_MINS: { result = min(src_a, src_b); }
        // Set-on-compare against zero.
        case SOP_SEQS: { result = select(0.0, 1.0, src_a == 0.0); }
        case SOP_SGTS: { result = select(0.0, 1.0, src_a > 0.0); }
        case SOP_SGES: { result = select(0.0, 1.0, src_a >= 0.0); }
        case SOP_SNES: { result = select(0.0, 1.0, src_a != 0.0); }
        case SOP_FRCS: { result = fract(src_a); }
        case SOP_TRUNCS: { result = trunc(src_a); }
        case SOP_FLOORS: { result = floor(src_a); }
        case SOP_EXP: {
            // Base-2 exponential.
            result = exp2(src_a);
        }
        case SOP_LOG, SOP_LOGC: {
            // Both variants compute log2 here, with an input of exactly 1.0
            // pinned to 0.0; no clamping of the result is applied.
            result = select(log2(src_a), 0.0, src_a == 1.0);
        }
        case SOP_RCP, SOP_RCPC, SOP_RCPF: {
            // Reciprocal; a zero input yields 0.0 instead of dividing.
            result = select(0.0, 1.0 / src_a, src_a != 0.0);
        }
        case SOP_RSQ, SOP_RSQC, SOP_RSQF: {
            // Reciprocal square root; non-positive inputs yield 0.0.
            result = select(0.0, inverseSqrt(src_a), src_a > 0.0);
        }
        case SOP_SQRT: { result = select(0.0, sqrt(src_a), src_a >= 0.0); }
        case SOP_SIN: { result = sin(src_a); }
        case SOP_COS: { result = cos(src_a); }
        // Predicate writes: set `predicate` from the test; the returned
        // value is 0.0 when the test passes and 1.0 otherwise.
        case SOP_SETP_EQ: {
            predicate = src_a == 0.0;
            result = select(1.0, 0.0, predicate);
        }
        case SOP_SETP_NE: {
            predicate = src_a != 0.0;
            result = select(1.0, 0.0, predicate);
        }
        case SOP_SETP_GT: {
            predicate = src_a > 0.0;
            result = select(1.0, 0.0, predicate);
        }
        case SOP_SETP_GE: {
            predicate = src_a >= 0.0;
            result = select(1.0, 0.0, predicate);
        }
        case SOP_SETP_INV: {
            predicate = src_a == 1.0;
            if !predicate {
                // A zero source becomes 1.0; anything else passes through.
                result = select(src_a, 1.0, src_a == 0.0);
            }
        }
        case SOP_SETP_POP: {
            let popped = src_a - 1.0;
            predicate = popped <= 0.0;
            if !predicate {
                result = popped;
            }
        }
        case SOP_SETP_CLR: {
            predicate = false;
            // Bit pattern of the largest finite f32 (FLT_MAX).
            result = bitcast<f32>(0x7F7FFFFFu);
        }
        case SOP_SETP_RSTR: {
            predicate = src_a == 0.0;
            if !predicate {
                result = src_a;
            }
        }
        // Pixel kills: raise `kill_flag` and return 1.0 when the test
        // passes; the fragment entry point performs the actual discard.
        case SOP_KILLS_EQ: {
            if src_a == 0.0 {
                kill_flag = true;
                result = 1.0;
            }
        }
        case SOP_KILLS_GT: {
            if src_a > 0.0 {
                kill_flag = true;
                result = 1.0;
            }
        }
        case SOP_KILLS_GE: {
            if src_a >= 0.0 {
                kill_flag = true;
                result = 1.0;
            }
        }
        case SOP_KILLS_NE: {
            if src_a != 0.0 {
                kill_flag = true;
                result = 1.0;
            }
        }
        case SOP_KILLS_ONE: {
            if src_a == 1.0 {
                kill_flag = true;
                result = 1.0;
            }
        }
        case SOP_RETAIN_PREV: { result = prev; }
        default: {
            // Unsupported opcode: zero result plus diagnostic flag.
            reject_mask |= REJECT_ALU_SCL;
        }
    }
    return result;
}
|
||||||
|
|
||||||
|
// Execute the ALU instruction triple at index `t` from the VS or PS
// microcode buffer: run the vector op, then the scalar op, honouring the
// per-instruction predicate and the destination write masks.
//
// MVP limitations: every source is read as a plain register via `read_src`;
// the second dword (per-operand swizzle / negate / abs / constant flags) is
// fetched but not decoded.
fn interpret_alu(t: u32, is_vertex: bool) {
    // Pull all three dwords of the triple from the stage's own buffer.
    var words: vec3<u32>;
    if is_vertex {
        words = vec3<u32>(
            vs_instr_dword(t, 0u),
            vs_instr_dword(t, 1u),
            vs_instr_dword(t, 2u),
        );
    } else {
        words = vec3<u32>(
            ps_instr_dword(t, 0u),
            ps_instr_dword(t, 1u),
            ps_instr_dword(t, 2u),
        );
    }
    let w0 = words.x;
    let w2 = words.z;

    // Field extraction is meant to mirror `xenia_gpu::ucode::alu::decode_alu`.
    // NOTE(review): as written, `scl_op` (w2 bits 11:6) and `scl_wm`
    // (w2 bits 11:8) overlap — confirm against the host-side decoder.
    let vec_op = w2 & 0x3Fu;
    let scl_op = (w2 >> 6u) & 0x3Fu;
    let scl_wm = (w2 >> 8u) & 0xFu;
    let vec_wm = (w2 >> 12u) & 0xFu;
    let vec_dst = (w2 >> 16u) & 0x7Fu;
    let scl_dst = (w2 >> 24u) & 0x7Fu;
    let src_a = w0 & 0xFFu;
    let src_b = (w0 >> 8u) & 0xFFu;
    let src_c = (w0 >> 16u) & 0xFFu;
    let scalar_src_is_ps = ((w0 >> 26u) & 1u) != 0u;
    let predicated = ((w0 >> 27u) & 1u) != 0u;
    let predicate_condition = ((w0 >> 28u) & 1u) != 0u;

    // A predicated instruction only runs when the predicate register
    // equals the condition bit encoded in the instruction.
    let should_run = !predicated || (predicate == predicate_condition);
    if !should_run {
        return;
    }

    // Sources are read once, before either pipe writes anything back.
    let a = read_src(src_a);
    let b = read_src(src_b);
    let c = read_src(src_c);

    // Vector pipe. The op always executes (it may have side effects); the
    // register write is skipped when the write mask is empty.
    let vec_result = exec_vector_op(vec_op, a, b, c);
    if vec_wm != 0u {
        write_reg_masked(vec_dst, vec_wm, vec_result);
    }

    // Scalar pipe. The primary operand is `a.x`, or the running `ps` value
    // when the instruction selects it; the secondary operand is `b.x`.
    // The result always becomes the new `ps`.
    let scl_src_a = select(a.x, ps, scalar_src_is_ps);
    let scalar_result = exec_scalar_op(scl_op, scl_src_a, b.x, ps);
    ps = scalar_result;
    if scl_wm != 0u {
        write_reg_masked(scl_dst, scl_wm, vec4<f32>(scalar_result));
    }
}
|
||||||
|
// Xenos VertexFormat values from `xenos.h:641`.
|
||||||
|
const VFMT_8_8_8_8: u32 = 6u;
|
||||||
|
const VFMT_2_10_10_10: u32 = 7u;
|
||||||
|
const VFMT_10_11_11: u32 = 16u;
|
||||||
|
const VFMT_11_11_10: u32 = 17u;
|
||||||
|
const VFMT_16_16: u32 = 25u;
|
||||||
|
const VFMT_16_16_16_16: u32 = 26u;
|
||||||
|
const VFMT_16_16_FLOAT: u32 = 31u;
|
||||||
|
const VFMT_16_16_16_16_FLOAT:u32 = 32u;
|
||||||
|
const VFMT_32: u32 = 33u;
|
||||||
|
const VFMT_32_32: u32 = 34u;
|
||||||
|
const VFMT_32_32_32_32: u32 = 35u;
|
||||||
|
const VFMT_32_FLOAT: u32 = 36u;
|
||||||
|
const VFMT_32_32_FLOAT: u32 = 37u;
|
||||||
|
const VFMT_32_32_32_32_FLOAT:u32 = 38u;
|
||||||
|
const VFMT_32_32_32_FLOAT: u32 = 57u;
|
||||||
|
|
||||||
|
// Execute the vertex-fetch instruction triple at index `t` of the VS
// microcode and write the fetched vec4 into the destination register.
//
// Fields this function extracts:
//   w0 [9:5]    fetch-constant index
//   w0 [16:10]  destination register
//   w0 [23:17]  source register; its .x lane holds the vertex index
//   w1 [21:16]  vertex format (VFMT_*)
//   w2 [7:0]    stride, in dwords
// NOTE(review): canary's `ucode.h` VertexFetchInstruction was documented
// here as placing src_reg at w0[10:5] with dst_reg and const_index above
// it; the shifts below do not match that layout — confirm against the
// host-side packer which one is intended.
//
// Pragmatic subset: only format and stride are honoured. The per-fetch
// offset, exponent adjust, destination swizzle and sign/normalisation
// flags are not decoded; each format uses one fixed interpretation (see
// the cases below). Missing components default to (0, 0, 0, 1), which is
// also the result of any out-of-range read. Unknown formats set
// `REJECT_VFETCH_FMT`.
fn interpret_vertex_fetch(t: u32) {
    let w0 = vs_instr_dword(t, 0u);
    let w1 = vs_instr_dword(t, 1u);
    let w2 = vs_instr_dword(t, 2u);
    let fetch_const = (w0 >> 5u) & 0x1Fu;
    let dst_reg = (w0 >> 10u) & 0x7Fu;
    let src_reg = (w0 >> 17u) & 0x7Fu;
    let format = (w1 >> 16u) & 0x3Fu;
    let stride = w2 & 0xFFu;

    // Vertex-fetch constant: dword 0 carries (type[1:0], address[31:2]);
    // dword 1 carries (endian[1:0], size[25:2]). Only the address is used,
    // converted to a dword index into `vertex_buffer`.
    let fc0 = xenos_consts.fetch[fetch_const * 2u + 0u];
    let base_dwords = (fc0 & 0xFFFFFFFCu) >> 2u;

    let vidx = u32(registers[src_reg].x);
    // A zero stride is replaced by 4 dwords (one vec4 per vertex).
    let effective_stride = select(stride, 4u, stride == 0u);
    let addr = base_dwords + vidx * effective_stride;

    // Number of dwords readable starting at `addr` (0 when `addr` is out of
    // range). Comparing against this instead of `addr + k < n` keeps the
    // bounds checks correct when `addr` is close to the u32 maximum, where
    // `addr + k` would wrap around and pass the test.
    let n = arrayLength(&vertex_buffer);
    let avail = select(0u, n - addr, addr < n);

    var result: vec4<f32> = vec4<f32>(0.0, 0.0, 0.0, 1.0);

    switch format {
        case VFMT_32_32_32_32_FLOAT: {
            if avail >= 4u {
                result = vec4<f32>(
                    bitcast<f32>(vertex_buffer[addr + 0u]),
                    bitcast<f32>(vertex_buffer[addr + 1u]),
                    bitcast<f32>(vertex_buffer[addr + 2u]),
                    bitcast<f32>(vertex_buffer[addr + 3u]),
                );
            }
        }
        case VFMT_32_32_32_FLOAT: {
            if avail >= 3u {
                result = vec4<f32>(
                    bitcast<f32>(vertex_buffer[addr + 0u]),
                    bitcast<f32>(vertex_buffer[addr + 1u]),
                    bitcast<f32>(vertex_buffer[addr + 2u]),
                    1.0,
                );
            }
        }
        case VFMT_32_32_FLOAT: {
            if avail >= 2u {
                result = vec4<f32>(
                    bitcast<f32>(vertex_buffer[addr + 0u]),
                    bitcast<f32>(vertex_buffer[addr + 1u]),
                    0.0,
                    1.0,
                );
            }
        }
        case VFMT_32_FLOAT: {
            if avail >= 1u {
                result = vec4<f32>(bitcast<f32>(vertex_buffer[addr]), 0.0, 0.0, 1.0);
            }
        }
        case VFMT_8_8_8_8: {
            // Treated as unsigned normalized.
            if avail >= 1u {
                result = unpack4x8unorm(vertex_buffer[addr]);
            }
        }
        case VFMT_16_16_FLOAT: {
            if avail >= 1u {
                let h = unpack2x16float(vertex_buffer[addr]);
                result = vec4<f32>(h.x, h.y, 0.0, 1.0);
            }
        }
        case VFMT_16_16_16_16_FLOAT: {
            if avail >= 2u {
                let h0 = unpack2x16float(vertex_buffer[addr]);
                let h1 = unpack2x16float(vertex_buffer[addr + 1u]);
                result = vec4<f32>(h0.x, h0.y, h1.x, h1.y);
            }
        }
        case VFMT_16_16: {
            // Treated as signed normalized; the unsigned variant is not
            // distinguished here.
            if avail >= 1u {
                let h = unpack2x16snorm(vertex_buffer[addr]);
                result = vec4<f32>(h.x, h.y, 0.0, 1.0);
            }
        }
        case VFMT_16_16_16_16: {
            // Treated as signed normalized, as for VFMT_16_16.
            if avail >= 2u {
                let h0 = unpack2x16snorm(vertex_buffer[addr]);
                let h1 = unpack2x16snorm(vertex_buffer[addr + 1u]);
                result = vec4<f32>(h0.x, h0.y, h1.x, h1.y);
            }
        }
        case VFMT_2_10_10_10: {
            // Treated as unsigned normalized: three 10-bit channels in the
            // low bits plus a 2-bit alpha in the top bits.
            if avail >= 1u {
                let packed = vertex_buffer[addr];
                let r = f32(packed & 0x3FFu) / 1023.0;
                let g = f32((packed >> 10u) & 0x3FFu) / 1023.0;
                let b = f32((packed >> 20u) & 0x3FFu) / 1023.0;
                let a = f32((packed >> 30u) & 0x3u) / 3.0;
                result = vec4<f32>(r, g, b, a);
            }
        }
        default: {
            reject_mask |= REJECT_VFETCH_FMT;
            // Unsupported format: the buffer is still read, but the value
            // is multiplied by zero, so the result stays (0, 0, 0, 1).
            if avail >= 1u {
                result = vec4<f32>(f32(vertex_buffer[addr]) * 0.0, 0.0, 0.0, 1.0);
            }
        }
    }

    registers[dst_reg] = result;
}
|
||||||
|
|
||||||
|
// M6 — minimum-viable texture fetch. Whatever the instruction asks for,
// the sample comes from the single texture/sampler pair bound at group(1)
// (a 1×1 magenta placeholder until the P5 per-slot texture cache exists).
// The coordinate is the .xy of the source register and the sampled texel
// is written to the destination register. `textureSampleLevel` needs no
// implicit derivatives, so the same code serves both VS and PS.
fn interpret_texture_fetch(t: u32, is_vertex: bool) {
    // Only the first dword of the triple is needed; read it from the
    // microcode buffer of the calling stage.
    var header: u32 = 0u;
    if is_vertex {
        header = vs_instr_dword(t, 0u);
    } else {
        header = ps_instr_dword(t, 0u);
    }
    // Both fields are 7 bits wide, so they always index inside the
    // 128-entry register file.
    let dst_reg = (header >> 10u) & 0x7Fu;
    let src_reg = (header >> 17u) & 0x7Fu;
    let coords = registers[src_reg].xy;
    registers[dst_reg] = textureSampleLevel(xenos_tex, xenos_samp, coords, 0.0);
}
|
||||||
|
|
||||||
|
// Run the `count` instruction triples of a vertex-shader Exec clause,
// starting at triple index `address`.
//   sequence: 2 bits per triple. The low bit of each pair is the serialize
//             flag (ignored in the MVP); the high bit marks a fetch.
// NOTE(review): a 32-bit `sequence` only describes 16 triples; callers are
// presumably expected to pass `count` <= 16 — confirm in the CF walker.
fn exec_vs(address: u32, count: u32, sequence: u32) {
    for (var slot: u32 = 0u; slot < count; slot += 1u) {
        let triple = address + slot;
        let fetch_bit = (sequence >> (slot * 2u + 1u)) & 1u;
        if fetch_bit == 0u {
            interpret_alu(triple, true);
            continue;
        }
        // Fetch opcode lives in the low 5 bits of the first dword:
        // 0x00 = vertex fetch, 0x01 = texture fetch. Anything else is
        // skipped without effect.
        let opcode = vs_instr_dword(triple, 0u) & 0x1Fu;
        if opcode == 0u {
            interpret_vertex_fetch(triple);
        } else if opcode == 1u {
            interpret_texture_fetch(triple, true);
        }
    }
}
|
||||||
|
// Run the `count` instruction triples of a pixel-shader Exec clause,
// starting at triple index `address`. `sequence` carries 2 bits per triple;
// the high bit of each pair marks a fetch. Unlike the vertex path, the
// fetch opcode is not inspected: every fetch is handled as a texture fetch.
fn exec_ps(address: u32, count: u32, sequence: u32) {
    for (var slot: u32 = 0u; slot < count; slot += 1u) {
        let triple = address + slot;
        let fetch_bit = (sequence >> (slot * 2u + 1u)) & 1u;
        if fetch_bit == 0u {
            interpret_alu(triple, false);
        } else {
            interpret_texture_fetch(triple, false);
        }
    }
}
|
||||||
|
|
||||||
|
// Put all per-invocation interpreter state back to its baseline: zeroed
// register file, ps = 0, predicate and kill flag cleared, no active alloc,
// empty loop stack and an empty reject mask.
fn reset_state() {
    // Scalar and flag state.
    ps = 0.0;
    predicate = false;
    kill_flag = false;
    current_alloc = 0u;
    reject_mask = 0u;

    // Loop bookkeeping.
    loop_depth = 0u;
    loop_counters = array<u32, 4>(0u, 0u, 0u, 0u);
    loop_starts = array<u32, 4>(0u, 0u, 0u, 0u);

    // General-purpose register file.
    for (var reg: u32 = 0u; reg < 128u; reg += 1u) {
        registers[reg] = vec4<f32>(0.0);
    }
}
|
||||||
|
|
||||||
|
// ── Stage entry points.
|
||||||
|
|
||||||
|
// M7 register slots for exports. VS writes position at oPos (convention:
|
||||||
|
// Xenos export 0 within an `Alloc(Position)` range lands in registers[32])
|
||||||
|
// and a set of interpolators. We track both via `current_alloc`: writes
|
||||||
|
// inside each alloc range are tagged and copied out at Exit.
|
||||||
|
const OPOS_REG: u32 = 32u; // synthetic slot used by the interpreter
|
||||||
|
const OCOLOR_REG: u32 = 33u; // color0 scratch slot
|
||||||
|
|
||||||
|
// Vertex entry point: seeds the register file, interprets the guest vertex
// shader by walking its CF table, then exports position and colour from
// the synthetic OPOS_REG / OCOLOR_REG slots.
@vertex
fn vs_main(@builtin(vertex_index) vidx: u32) -> VsOut {
    reset_state();

    // r0.x carries the vertex index so shaders (and the fallback below)
    // can use it without a real vertex fetch.
    registers[0] = vec4<f32>(f32(vidx), 0.0, 0.0, 1.0);

    // Procedural fallback for the export slots: vertices are spread around
    // a circle and coloured by draw index, so a shader that never writes
    // oPos / oColor still produces something visible.
    let vertex_total = max(draw_ctx.vertex_count, 1u);
    let t_param = f32(vidx) / f32(vertex_total);
    let angle = t_param * 6.2831853;
    let radius = 0.35;
    registers[OPOS_REG] = vec4<f32>(cos(angle) * radius, sin(angle) * radius, 0.0, 1.0);

    let d = f32(draw_ctx.draw_index);
    let fallback_red = 0.5 + 0.5 * sin(d * 0.37);
    let fallback_green = 0.5 + 0.5 * sin(d * 0.51 + 2.0);
    let fallback_blue = 0.5 + 0.5 * sin(d * 0.73 + 4.0);
    registers[OCOLOR_REG] = vec4<f32>(fallback_red, fallback_green, fallback_blue, 1.0);

    // Always-zero term derived from a real `vertex_buffer` read; it is
    // added to the colour below so this stage keeps a use of the binding.
    let vb_live = f32(vertex_buffer[0]) * 0.0;

    // Interpret the guest vertex shader.
    walk_cf_vs();

    // Export whatever ended up in the synthetic slots (the fallback values
    // if the shader never wrote them).
    let exported_pos = registers[OPOS_REG];
    let exported_color = registers[OCOLOR_REG];
    return VsOut(
        exported_pos,
        vec4<f32>(exported_color.rgb + vec3<f32>(vb_live), exported_color.a),
    );
}
|
||||||
|
|
||||||
|
// Fragment-stage entry point.
//
// Resets interpreter state and walks the pixel CF table. If any
// instruction raised `kill_flag` the fragment is discarded here, at entry
// level. Otherwise the interpolated vertex colour is written to `color0`;
// apart from `kill_flag`, nothing the walk produced is read back.
@fragment
fn fs_main(in: VsOut) -> FsOut {
    reset_state();
    walk_cf_ps();

    // `kill*` opcodes only set a flag; the actual discard happens here.
    if kill_flag {
        discard;
    }

    var result: FsOut;
    result.color0 = in.color;
    return result;
}
|
||||||
|
|
||||||
|
// ── CF walker per stage.
//
// Both walkers share the same logic:
//   * LOOP_START reads its trip count from
//     `xenos_consts.loop_consts[loop_id]`, clamps it to 16 and pushes a
//     frame (at most 4 deep). When nothing is pushed (zero trip count or a
//     full stack) the body still runs once, and the matching LOOP_END pops
//     whatever frame is on top, if any.
//   * LOOP_END jumps back to the instruction after LOOP_START until the
//     frame's counter is exhausted.
//   * COND_JMP honours the instruction's predicate flags.
//   * COND_CALL / RETURN only set a bit in the reject mask, because there
//     is no call stack.
// The hard cap below bounds the number of CF instructions a single
// invocation may step through, so a malformed or extreme shader cannot
// hang the GPU.
const CF_WALKER_MAX_ITER: u32 = 4096u;
|
||||||
|
|
||||||
|
// Interpret the vertex shader's control-flow table.
//
// Steps through CF entries from index 0 until the table ends, an
// EXEC_END / EXIT entry is processed, or CF_WALKER_MAX_ITER entries have
// been visited. Unsupported entry kinds are recorded in `reject_mask` and
// otherwise skipped.
fn walk_cf_vs() {
    let cf_total = vs_cf_count();
    var pc: u32 = 0u;
    var steps: u32 = 0u;
    while pc < cf_total && steps < CF_WALKER_MAX_ITER {
        steps += 1u;
        let kind = vs_cf_kind(pc);
        let primary = vs_cf_primary(pc);
        let aux = vs_cf_aux(pc);
        // Default successor; loop back-edges and taken jumps overwrite it.
        var next_pc: u32 = pc + 1u;
        var finished: bool = false;
        switch kind {
            case CF_KIND_EXEC, CF_KIND_EXEC_END: {
                // aux = instruction count (low 8 bits) | sequence bits << 8.
                exec_vs(primary, aux & 0xFFu, aux >> 8u);
                finished = kind == CF_KIND_EXEC_END;
            }
            case CF_KIND_ALLOC: {
                current_alloc = primary;
            }
            case CF_KIND_EXIT: {
                finished = true;
            }
            case CF_KIND_LOOP_START: {
                let requested = xenos_consts.loop_consts[aux & 0x1Fu] & 0xFFu;
                if requested > 16u {
                    reject_mask |= REJECT_LOOP_OVERFLOW;
                }
                let trips = min(requested, 16u);
                // Push a frame only for a non-empty loop with stack room.
                if trips > 0u && loop_depth < 4u {
                    loop_starts[loop_depth] = pc;
                    loop_counters[loop_depth] = trips;
                    loop_depth += 1u;
                }
            }
            case CF_KIND_LOOP_END: {
                if loop_depth > 0u {
                    let top = loop_depth - 1u;
                    if loop_counters[top] > 1u {
                        // More trips left: jump back to just after LOOP_START.
                        loop_counters[top] -= 1u;
                        next_pc = loop_starts[top] + 1u;
                    } else {
                        // Last trip done: pop the frame and fall through.
                        loop_counters[top] = 0u;
                        loop_depth = top;
                    }
                }
            }
            case CF_KIND_COND_JMP: {
                // aux bit 0: jump is predicated; bit 1: required predicate value.
                let predicated = (aux & 1u) != 0u;
                let wanted = (aux & 2u) != 0u;
                if !predicated || predicate == wanted {
                    next_pc = primary;
                }
            }
            case CF_KIND_COND_CALL, CF_KIND_RETURN: {
                // No call stack: flag it and keep walking.
                reject_mask |= REJECT_CF_CALL;
            }
            default: {
                reject_mask |= REJECT_CF_JUMP;
            }
        }
        if finished {
            break;
        }
        pc = next_pc;
    }
}
|
||||||
|
|
||||||
|
// Interpret the pixel shader's control-flow table.
//
// Steps through CF entries from index 0 until the table ends, an
// EXEC_END / EXIT entry is processed, or CF_WALKER_MAX_ITER entries have
// been visited. Unsupported entry kinds are recorded in `reject_mask` and
// otherwise skipped.
fn walk_cf_ps() {
    let cf_total = ps_cf_count();
    var pc: u32 = 0u;
    var steps: u32 = 0u;
    while pc < cf_total && steps < CF_WALKER_MAX_ITER {
        steps += 1u;
        let kind = ps_cf_kind(pc);
        let primary = ps_cf_primary(pc);
        let aux = ps_cf_aux(pc);
        // Default successor; loop back-edges and taken jumps overwrite it.
        var next_pc: u32 = pc + 1u;
        var finished: bool = false;
        switch kind {
            case CF_KIND_EXEC, CF_KIND_EXEC_END: {
                // aux = instruction count (low 8 bits) | sequence bits << 8.
                exec_ps(primary, aux & 0xFFu, aux >> 8u);
                finished = kind == CF_KIND_EXEC_END;
            }
            case CF_KIND_ALLOC: {
                current_alloc = primary;
            }
            case CF_KIND_EXIT: {
                finished = true;
            }
            case CF_KIND_LOOP_START: {
                let requested = xenos_consts.loop_consts[aux & 0x1Fu] & 0xFFu;
                if requested > 16u {
                    reject_mask |= REJECT_LOOP_OVERFLOW;
                }
                let trips = min(requested, 16u);
                // Push a frame only for a non-empty loop with stack room.
                if trips > 0u && loop_depth < 4u {
                    loop_starts[loop_depth] = pc;
                    loop_counters[loop_depth] = trips;
                    loop_depth += 1u;
                }
            }
            case CF_KIND_LOOP_END: {
                if loop_depth > 0u {
                    let top = loop_depth - 1u;
                    if loop_counters[top] > 1u {
                        // More trips left: jump back to just after LOOP_START.
                        loop_counters[top] -= 1u;
                        next_pc = loop_starts[top] + 1u;
                    } else {
                        // Last trip done: pop the frame and fall through.
                        loop_counters[top] = 0u;
                        loop_depth = top;
                    }
                }
            }
            case CF_KIND_COND_JMP: {
                // aux bit 0: jump is predicated; bit 1: required predicate value.
                let predicated = (aux & 1u) != 0u;
                let wanted = (aux & 2u) != 0u;
                if !predicated || predicate == wanted {
                    next_pc = primary;
                }
            }
            case CF_KIND_COND_CALL, CF_KIND_RETURN: {
                // No call stack: flag it and keep walking.
                reject_mask |= REJECT_CF_CALL;
            }
            default: {
                reject_mask |= REJECT_CF_JUMP;
            }
        }
        if finished {
            break;
        }
        pc = next_pc;
    }
}
|
||||||
970
crates/xenia-gpu/src/texture_cache.rs
Normal file
970
crates/xenia-gpu/src/texture_cache.rs
Normal file
@@ -0,0 +1,970 @@
|
|||||||
|
//! Texture cache — P5.
|
||||||
|
//!
|
||||||
|
//! Two-layer design mirroring canary's `TextureCache`:
|
||||||
|
//!
|
||||||
|
//! * **CPU layer** (this module): owns decoded, linear, host-endian texel
|
||||||
|
//! byte buffers keyed by [`TextureKey`]. `ensure_cached` consults the
|
||||||
|
//! guest memory's page-version counter to decide whether the cached
|
||||||
|
//! bytes are still fresh and re-decodes on miss or staleness.
|
||||||
|
//! * **GPU layer** (xenia-ui `texture_cache_host`): owns the
|
||||||
|
//! `wgpu::Texture` + `TextureView` for each cached key; pulls decoded
|
||||||
|
//! bytes from this CPU layer on upload.
|
||||||
|
//!
|
||||||
|
//! Canary references: `texture_cache.h/.cc`, `texture_info.cc`, and
|
||||||
|
//! `texture_info_formats.inl` for the format table.
|
||||||
|
|
||||||
|
use std::collections::HashMap;
|
||||||
|
|
||||||
|
use crate::tiled_address;
|
||||||
|
|
||||||
|
/// Xenos texture formats — `xenos::TextureFormat` at `xenos.h:489-579`.
|
||||||
|
/// Values are the raw enum numbers the guest writes into
|
||||||
|
/// `xe_gpu_texture_fetch_t.format`.
|
||||||
|
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
||||||
|
#[repr(u8)]
|
||||||
|
pub enum TextureFormat {
|
||||||
|
K1Reverse = 0,
|
||||||
|
K1 = 1,
|
||||||
|
K8 = 2,
|
||||||
|
K1555 = 3,
|
||||||
|
K565 = 4,
|
||||||
|
K6_5_5 = 5,
|
||||||
|
K8888 = 6,
|
||||||
|
K1010102 = 7,
|
||||||
|
K8_8 = 10,
|
||||||
|
K4_4_4_4 = 15,
|
||||||
|
K10_11_11 = 16,
|
||||||
|
K11_11_10 = 17,
|
||||||
|
Dxt1 = 18,
|
||||||
|
Dxt2_3 = 19,
|
||||||
|
Dxt4_5 = 20,
|
||||||
|
K24_8 = 22,
|
||||||
|
K24_8Float = 23,
|
||||||
|
K16 = 24,
|
||||||
|
K16_16 = 25,
|
||||||
|
K16_16_16_16 = 26,
|
||||||
|
K16Float = 30,
|
||||||
|
K16_16Float = 31,
|
||||||
|
K16_16_16_16Float = 32,
|
||||||
|
K32 = 33,
|
||||||
|
K32_32 = 34,
|
||||||
|
K32_32_32_32 = 35,
|
||||||
|
K32Float = 36,
|
||||||
|
K32_32Float = 37,
|
||||||
|
K32_32_32_32Float = 38,
|
||||||
|
Unknown(u8),
|
||||||
|
}
|
||||||
|
|
||||||
|
impl TextureFormat {
|
||||||
|
pub fn from_raw(v: u8) -> Self {
|
||||||
|
use TextureFormat::*;
|
||||||
|
match v & 0x3F {
|
||||||
|
0 => K1Reverse,
|
||||||
|
1 => K1,
|
||||||
|
2 => K8,
|
||||||
|
3 => K1555,
|
||||||
|
4 => K565,
|
||||||
|
5 => K6_5_5,
|
||||||
|
6 => K8888,
|
||||||
|
7 => K1010102,
|
||||||
|
10 => K8_8,
|
||||||
|
15 => K4_4_4_4,
|
||||||
|
16 => K10_11_11,
|
||||||
|
17 => K11_11_10,
|
||||||
|
18 => Dxt1,
|
||||||
|
19 => Dxt2_3,
|
||||||
|
20 => Dxt4_5,
|
||||||
|
22 => K24_8,
|
||||||
|
23 => K24_8Float,
|
||||||
|
24 => K16,
|
||||||
|
25 => K16_16,
|
||||||
|
26 => K16_16_16_16,
|
||||||
|
30 => K16Float,
|
||||||
|
31 => K16_16Float,
|
||||||
|
32 => K16_16_16_16Float,
|
||||||
|
33 => K32,
|
||||||
|
34 => K32_32,
|
||||||
|
35 => K32_32_32_32,
|
||||||
|
36 => K32Float,
|
||||||
|
37 => K32_32Float,
|
||||||
|
38 => K32_32_32_32Float,
|
||||||
|
other => Unknown(other),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Block width/height in texels + bytes-per-block. For uncompressed
|
||||||
|
/// formats block_w = block_h = 1. For DXT formats block_w = block_h =
|
||||||
|
/// 4 (one 4×4 compressed block).
|
||||||
|
pub fn block_info(self) -> BlockInfo {
|
||||||
|
use TextureFormat::*;
|
||||||
|
match self {
|
||||||
|
K1Reverse | K1 => BlockInfo::new(1, 1, 1), // round up to 1 byte
|
||||||
|
K8 => BlockInfo::new(1, 1, 1),
|
||||||
|
K1555 | K565 | K6_5_5 | K4_4_4_4 | K16 | K16Float | K8_8 => BlockInfo::new(1, 1, 2),
|
||||||
|
K8888 | K1010102 | K10_11_11 | K11_11_10 | K24_8 | K24_8Float | K16_16
|
||||||
|
| K16_16Float | K32 | K32Float => BlockInfo::new(1, 1, 4),
|
||||||
|
K16_16_16_16 | K16_16_16_16Float | K32_32 | K32_32Float => BlockInfo::new(1, 1, 8),
|
||||||
|
K32_32_32_32 | K32_32_32_32Float => BlockInfo::new(1, 1, 16),
|
||||||
|
Dxt1 => BlockInfo::new(4, 4, 8),
|
||||||
|
Dxt2_3 | Dxt4_5 => BlockInfo::new(4, 4, 16),
|
||||||
|
Unknown(_) => BlockInfo::new(1, 1, 4), // safe-ish fallback
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// True iff this format lands on a wgpu texture format we can
|
||||||
|
/// natively bind — no CPU-side conversion per frame required. M5
|
||||||
|
/// adds `k_5_6_5` (CPU-expanded to `Rgba8Unorm` on decode; still
|
||||||
|
/// counts as supported for the host-cache wiring), `k_DXT2_3`
|
||||||
|
/// (BC2), and `k_DXT4_5` (BC3).
|
||||||
|
pub fn is_host_supported(self) -> bool {
|
||||||
|
matches!(
|
||||||
|
self,
|
||||||
|
TextureFormat::K8888
|
||||||
|
| TextureFormat::K565
|
||||||
|
| TextureFormat::Dxt1
|
||||||
|
| TextureFormat::Dxt2_3
|
||||||
|
| TextureFormat::Dxt4_5
|
||||||
|
)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Footprint of one storage block of a texture format: its width and
/// height in texels and its size in bytes.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct BlockInfo {
    pub block_w: u8,
    pub block_h: u8,
    pub bytes_per_block: u8,
}

impl BlockInfo {
    /// Build a `BlockInfo` from its three components.
    pub const fn new(block_w: u8, block_h: u8, bytes_per_block: u8) -> Self {
        Self {
            block_w,
            block_h,
            bytes_per_block,
        }
    }

    /// Base-2 logarithm of `bytes_per_block` for the sizes 1, 2, 4, 8 and
    /// 16. Any other size yields 0.
    pub fn log2_bpb(self) -> u32 {
        let bpb = self.bytes_per_block;
        if matches!(bpb, 1 | 2 | 4 | 8 | 16) {
            // Power of two: the log is the position of the single set bit.
            bpb.trailing_zeros()
        } else {
            0
        }
    }
}
|
||||||
|
|
||||||
|
/// Byte-swap mode from the 2-bit endian field in fetch dword 1 (canary:
/// Xenos `Endian`, `xenos.h:198-204`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Endian {
    None = 0,
    Swap8In16 = 1,
    Swap8In32 = 2,
    Swap16In32 = 3,
}

impl Endian {
    /// Decode the swap mode from the low two bits of `v`; higher bits are
    /// ignored.
    pub fn from_raw(v: u8) -> Self {
        match v & 0x3 {
            0 => Self::None,
            1 => Self::Swap8In16,
            2 => Self::Swap8In32,
            _ => Self::Swap16In32,
        }
    }

    /// Apply this swap mode to a single 32-bit value.
    ///
    /// * `None` returns `v` unchanged.
    /// * `Swap8In16` swaps the two bytes inside each 16-bit half.
    /// * `Swap8In32` reverses all four bytes.
    /// * `Swap16In32` exchanges the two 16-bit halves.
    pub fn swap32(self, v: u32) -> u32 {
        match self {
            Self::None => v,
            Self::Swap8In16 => {
                let hi = ((v >> 16) as u16).swap_bytes();
                let lo = (v as u16).swap_bytes();
                (u32::from(hi) << 16) | u32::from(lo)
            }
            Self::Swap8In32 => u32::from_be_bytes(v.to_le_bytes()),
            Self::Swap16In32 => (v << 16) | (v >> 16),
        }
    }
}
|
||||||
|
|
||||||
|
/// Dimensionality of a texture (canary: `xenos::DataDimension`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Dimension {
    D1 = 0,
    D2 = 1,
    D3Stacked = 2,
    Cube = 3,
}

impl Dimension {
    /// Decode the dimension from the low two bits of `v`; higher bits are
    /// ignored.
    pub fn from_raw(v: u8) -> Self {
        match v & 0x3 {
            0 => Self::D1,
            1 => Self::D2,
            2 => Self::D3Stacked,
            _ => Self::Cube,
        }
    }
}
|
||||||
|
|
||||||
|
/// Identity of a cached texture. Matches canary's `TextureCache::TextureKey`
|
||||||
|
/// at the semantic level — we exclude mip/border state for P5 since neither
|
||||||
|
/// is populated yet.
|
||||||
|
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
||||||
|
pub struct TextureKey {
|
||||||
|
/// Guest physical base (byte address — already shifted left by 12 from
|
||||||
|
/// the fetch-constant `base_address` field).
|
||||||
|
pub base_address: u32,
|
||||||
|
pub width: u16,
|
||||||
|
pub height: u16,
|
||||||
|
pub depth_or_slices: u16,
|
||||||
|
pub format: TextureFormat,
|
||||||
|
pub endian: Endian,
|
||||||
|
pub dimension: Dimension,
|
||||||
|
pub tiled: bool,
|
||||||
|
/// Row pitch in texels, already aligned to 32. Canary stores pitch/32
|
||||||
|
/// in the fetch constant; we keep the raw texel count to avoid
|
||||||
|
/// callers remembering to shift.
|
||||||
|
pub pitch_texels: u16,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Decode a 6-dword texture fetch constant (layout at `xenos.h:1229-1329`).
|
||||||
|
/// Returns `None` if the constant is obviously unset (all zeros) or if
|
||||||
|
/// `type` is not the texture-constant marker.
|
||||||
|
pub fn decode_fetch_constant(dwords: [u32; 6]) -> Option<TextureKey> {
|
||||||
|
let d0 = dwords[0];
|
||||||
|
let d1 = dwords[1];
|
||||||
|
let d2 = dwords[2];
|
||||||
|
let d5 = dwords[5];
|
||||||
|
|
||||||
|
// type: low 2 bits of dword 0 should be 2 (texture) per canary —
|
||||||
|
// 0 = vertex, 2 = texture. An all-zero constant reads as type 0 so
|
||||||
|
// `None` filters it out here.
|
||||||
|
let ty = d0 & 0x3;
|
||||||
|
if d0 == 0 && d1 == 0 {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
// Not a texture constant (e.g. 0 = vertex fetch constant reused).
|
||||||
|
if ty != 2 {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
|
||||||
|
let pitch_5 = (d0 >> 22) & 0x1FF; // pitch/32 in texels
|
||||||
|
let tiled = ((d0 >> 31) & 1) != 0;
|
||||||
|
let format = TextureFormat::from_raw((d1 & 0x3F) as u8);
|
||||||
|
let endian = Endian::from_raw(((d1 >> 6) & 0x3) as u8);
|
||||||
|
let base_address = (d1 >> 12) << 12; // base >> 12, re-shifted.
|
||||||
|
let dim = Dimension::from_raw(((d5 >> 9) & 0x3) as u8);
|
||||||
|
|
||||||
|
// Size decode depends on dimension.
|
||||||
|
let (width, height, depth) = match dim {
|
||||||
|
Dimension::D1 => ((d2 & 0x00FF_FFFF) as u16 + 1, 1u16, 1u16),
|
||||||
|
Dimension::D2 => (
|
||||||
|
(d2 & 0x1FFF) as u16 + 1,
|
||||||
|
((d2 >> 13) & 0x1FFF) as u16 + 1,
|
||||||
|
((d2 >> 26) & 0x3F) as u16 + 1,
|
||||||
|
),
|
||||||
|
Dimension::D3Stacked | Dimension::Cube => (
|
||||||
|
(d2 & 0x7FF) as u16 + 1,
|
||||||
|
((d2 >> 11) & 0x7FF) as u16 + 1,
|
||||||
|
((d2 >> 22) & 0x3FF) as u16 + 1,
|
||||||
|
),
|
||||||
|
};
|
||||||
|
|
||||||
|
Some(TextureKey {
|
||||||
|
base_address,
|
||||||
|
width,
|
||||||
|
height,
|
||||||
|
depth_or_slices: depth,
|
||||||
|
format,
|
||||||
|
endian,
|
||||||
|
dimension: dim,
|
||||||
|
tiled,
|
||||||
|
pitch_texels: ((pitch_5 as u16) * 32).max(width),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Decoded, linear, host-endian texture bytes ready for wgpu upload.
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
|
pub struct CachedTexture {
|
||||||
|
pub key: TextureKey,
|
||||||
|
pub version_when_uploaded: u64,
|
||||||
|
/// Tightly packed. Layout depends on `key.format`:
|
||||||
|
/// - `K8888` → `width*height*4` bytes in Rgba8Unorm order.
|
||||||
|
/// - `Dxt1` → `ceil(w/4)*ceil(h/4)*8` bytes of raw BC1 blocks, after
|
||||||
|
/// block-level detile + dword-endian swap.
|
||||||
|
pub bytes: Vec<u8>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl CachedTexture {
|
||||||
|
pub fn byte_size(&self) -> usize {
|
||||||
|
self.bytes.len()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Reasons a texture decode can fail. The `ensure_cached` caller turns
/// these into `gpu.texture.reject{reason}` metrics so the HUD shows when a
/// texture fell back.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DecodeError {
    /// No decoder exists for the texture's format.
    UnsupportedFormat,
    /// Guest memory yielded fewer bytes than the texture spans, or
    /// detiling reported an error.
    OutOfBounds,
    /// The texture's width or height is zero.
    ZeroSize,
}
|
||||||
|
|
||||||
|
/// Read `len` bytes from guest memory starting at `addr`. Returns `None`
|
||||||
|
/// if the span would exceed the memory's reported end; otherwise returns
|
||||||
|
/// a freshly-allocated buffer with the bytes.
|
||||||
|
///
|
||||||
|
/// The `MemoryAccess` trait exposes per-byte reads only; we batch them in
|
||||||
|
/// a single pass to avoid the per-byte virtual dispatch overhead for large
|
||||||
|
/// textures (1 MiB frontbuffer = 1M dispatch calls).
|
||||||
|
pub fn read_guest_bytes(
|
||||||
|
mem: &dyn xenia_memory::MemoryAccess,
|
||||||
|
addr: u32,
|
||||||
|
len: usize,
|
||||||
|
) -> Vec<u8> {
|
||||||
|
let mut out = Vec::with_capacity(len);
|
||||||
|
for i in 0..len {
|
||||||
|
let a = addr.wrapping_add(i as u32);
|
||||||
|
out.push(mem.read_u8(a));
|
||||||
|
if a < addr {
|
||||||
|
// 32-bit overflow; unmap the tail.
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
out
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Byte-swap the 32-bit dwords of `buf` in place according to `endian`.
|
||||||
|
/// `buf.len()` should be a multiple of 4; tail bytes are left untouched.
|
||||||
|
pub fn apply_endian_32(buf: &mut [u8], endian: Endian) {
|
||||||
|
if matches!(endian, Endian::None) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
let mut i = 0;
|
||||||
|
while i + 4 <= buf.len() {
|
||||||
|
let v = u32::from_le_bytes([buf[i], buf[i + 1], buf[i + 2], buf[i + 3]]);
|
||||||
|
let swapped = endian.swap32(v);
|
||||||
|
buf[i..i + 4].copy_from_slice(&swapped.to_le_bytes());
|
||||||
|
i += 4;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Decode a k_8_8_8_8 texture out of guest memory into `Rgba8Unorm` bytes.
|
||||||
|
/// Applies Xenos→host channel swizzle (Xbox 360 stores BGRA in memory →
|
||||||
|
/// we emit RGBA for wgpu) and the declared endian swap, then detiles via
|
||||||
|
/// the Xenos Tiled2D formula.
|
||||||
|
pub fn decode_k8888_tiled(
|
||||||
|
key: &TextureKey,
|
||||||
|
mem: &dyn xenia_memory::MemoryAccess,
|
||||||
|
) -> Result<Vec<u8>, DecodeError> {
|
||||||
|
if key.width == 0 || key.height == 0 {
|
||||||
|
return Err(DecodeError::ZeroSize);
|
||||||
|
}
|
||||||
|
let w = key.width as u32;
|
||||||
|
let h = key.height as u32;
|
||||||
|
let pitch_aligned = tiled_address::align_pitch_to_macro_tile(key.pitch_texels as u32);
|
||||||
|
let total_bytes = (pitch_aligned * h * 4) as usize;
|
||||||
|
let mut raw = read_guest_bytes(mem, key.base_address, total_bytes);
|
||||||
|
if raw.len() < total_bytes {
|
||||||
|
return Err(DecodeError::OutOfBounds);
|
||||||
|
}
|
||||||
|
apply_endian_32(&mut raw, key.endian);
|
||||||
|
let mut linear = vec![0u8; (w * h * 4) as usize];
|
||||||
|
if key.tiled {
|
||||||
|
if tiled_address::detile_2d(&raw, &mut linear, w, h, pitch_aligned, 4).is_err() {
|
||||||
|
return Err(DecodeError::OutOfBounds);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// Non-tiled copy row-by-row honoring pitch.
|
||||||
|
for y in 0..h as usize {
|
||||||
|
let src = y * (pitch_aligned as usize) * 4;
|
||||||
|
let dst = y * (w as usize) * 4;
|
||||||
|
linear[dst..dst + (w as usize) * 4]
|
||||||
|
.copy_from_slice(&raw[src..src + (w as usize) * 4]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Xenos stores `k_8_8_8_8` in ARGB byte order (high nibble = A). After
|
||||||
|
// endian.Swap8In32 guests' typical per-dword byte order becomes BGRA
|
||||||
|
// in little-endian host bytes. Swap B↔R so we hand Rgba8Unorm to wgpu.
|
||||||
|
for px in linear.chunks_exact_mut(4) {
|
||||||
|
px.swap(0, 2);
|
||||||
|
}
|
||||||
|
Ok(linear)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Decode a DXT-compressed texture to raw block bytes (no format
|
||||||
|
/// conversion — wgpu understands `Bc{1,2,3}RgbaUnorm` natively so the
|
||||||
|
/// GPU does the actual decompression on upload).
|
||||||
|
///
|
||||||
|
/// Xenos stores DXT blocks in 4×4 block-tiled order using the Tiled2D
|
||||||
|
/// formula, with stride counted in blocks. `bytes_per_block` is 8 for
|
||||||
|
/// BC1 (DXT1), 16 for BC2 (DXT2_3) and BC3 (DXT4_5).
|
||||||
|
pub fn decode_dxt_tiled(
|
||||||
|
key: &TextureKey,
|
||||||
|
mem: &dyn xenia_memory::MemoryAccess,
|
||||||
|
bytes_per_block: u32,
|
||||||
|
) -> Result<Vec<u8>, DecodeError> {
|
||||||
|
if key.width == 0 || key.height == 0 {
|
||||||
|
return Err(DecodeError::ZeroSize);
|
||||||
|
}
|
||||||
|
let block_w = 4u32;
|
||||||
|
let block_h = 4u32;
|
||||||
|
let w_blocks = (key.width as u32).div_ceil(block_w);
|
||||||
|
let h_blocks = (key.height as u32).div_ceil(block_h);
|
||||||
|
let pitch_blocks = tiled_address::align_pitch_to_macro_tile(
|
||||||
|
(key.pitch_texels as u32).div_ceil(block_w),
|
||||||
|
);
|
||||||
|
let total_bytes = (pitch_blocks * h_blocks * bytes_per_block) as usize;
|
||||||
|
let mut raw = read_guest_bytes(mem, key.base_address, total_bytes);
|
||||||
|
if raw.len() < total_bytes {
|
||||||
|
return Err(DecodeError::OutOfBounds);
|
||||||
|
}
|
||||||
|
// DXT blocks are stored as 4×u16 + 4×u8-indices (BC1) or similar
|
||||||
|
// u16/u32-width fields for BC2/BC3; the Xbox 360's big-endian word
|
||||||
|
// order requires an endian swap at the u16/u32 level regardless of
|
||||||
|
// which BC-family format.
|
||||||
|
apply_endian_32(&mut raw, key.endian);
|
||||||
|
|
||||||
|
let mut out = vec![0u8; (w_blocks * h_blocks * bytes_per_block) as usize];
|
||||||
|
if key.tiled {
|
||||||
|
if tiled_address::detile_2d(
|
||||||
|
&raw,
|
||||||
|
&mut out,
|
||||||
|
w_blocks,
|
||||||
|
h_blocks,
|
||||||
|
pitch_blocks,
|
||||||
|
bytes_per_block,
|
||||||
|
)
|
||||||
|
.is_err()
|
||||||
|
{
|
||||||
|
return Err(DecodeError::OutOfBounds);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
for y in 0..h_blocks as usize {
|
||||||
|
let src = y * (pitch_blocks as usize) * (bytes_per_block as usize);
|
||||||
|
let dst = y * (w_blocks as usize) * (bytes_per_block as usize);
|
||||||
|
out[dst..dst + (w_blocks as usize) * (bytes_per_block as usize)]
|
||||||
|
.copy_from_slice(&raw[src..src + (w_blocks as usize) * (bytes_per_block as usize)]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(out)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// BC1 / DXT1 — 8-byte blocks.
|
||||||
|
pub fn decode_dxt1_tiled(
|
||||||
|
key: &TextureKey,
|
||||||
|
mem: &dyn xenia_memory::MemoryAccess,
|
||||||
|
) -> Result<Vec<u8>, DecodeError> {
|
||||||
|
decode_dxt_tiled(key, mem, 8)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// BC2 / DXT2_3 — 16-byte blocks.
|
||||||
|
pub fn decode_dxt23_tiled(
|
||||||
|
key: &TextureKey,
|
||||||
|
mem: &dyn xenia_memory::MemoryAccess,
|
||||||
|
) -> Result<Vec<u8>, DecodeError> {
|
||||||
|
decode_dxt_tiled(key, mem, 16)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// BC3 / DXT4_5 — 16-byte blocks.
|
||||||
|
pub fn decode_dxt45_tiled(
|
||||||
|
key: &TextureKey,
|
||||||
|
mem: &dyn xenia_memory::MemoryAccess,
|
||||||
|
) -> Result<Vec<u8>, DecodeError> {
|
||||||
|
decode_dxt_tiled(key, mem, 16)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// **k_5_6_5** — 16-bit R:5 G:6 B:5 per texel (Xbox stores R in the high
|
||||||
|
/// 5 bits of the 16-bit word). We unpack each texel into 4 bytes of
|
||||||
|
/// `Rgba8Unorm` (A = 0xFF). wgpu doesn't ship `R5G6B5Unorm` as a
|
||||||
|
/// sampled texture format on every backend, so CPU-side conversion is
|
||||||
|
/// the safe path even if it's 2× the texture memory.
|
||||||
|
///
|
||||||
|
/// Tiling: Tiled2D at the **texel** level (block = 1 texel = 2 bytes),
|
||||||
|
/// then we expand each 2-byte u16 into the 4-byte Rgba8 in the linear
|
||||||
|
/// output buffer.
|
||||||
|
pub fn decode_k565_tiled(
|
||||||
|
key: &TextureKey,
|
||||||
|
mem: &dyn xenia_memory::MemoryAccess,
|
||||||
|
) -> Result<Vec<u8>, DecodeError> {
|
||||||
|
if key.width == 0 || key.height == 0 {
|
||||||
|
return Err(DecodeError::ZeroSize);
|
||||||
|
}
|
||||||
|
let w = key.width as u32;
|
||||||
|
let h = key.height as u32;
|
||||||
|
// Pitch/block counts — block = 1 texel here, 2 bytes.
|
||||||
|
let pitch_aligned = tiled_address::align_pitch_to_macro_tile(key.pitch_texels as u32);
|
||||||
|
let total_bytes = (pitch_aligned * h * 2) as usize;
|
||||||
|
let mut raw = read_guest_bytes(mem, key.base_address, total_bytes);
|
||||||
|
if raw.len() < total_bytes {
|
||||||
|
return Err(DecodeError::OutOfBounds);
|
||||||
|
}
|
||||||
|
// 16-bit word order is endian-swap-sensitive.
|
||||||
|
apply_endian_32(&mut raw, key.endian);
|
||||||
|
// Step 1: detile (bytes_per_block=2, tile in blocks=texels).
|
||||||
|
let mut linear_u16 = vec![0u8; (w * h * 2) as usize];
|
||||||
|
if key.tiled {
|
||||||
|
if tiled_address::detile_2d(&raw, &mut linear_u16, w, h, pitch_aligned, 2).is_err() {
|
||||||
|
return Err(DecodeError::OutOfBounds);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
for y in 0..h as usize {
|
||||||
|
let src = y * (pitch_aligned as usize) * 2;
|
||||||
|
let dst = y * (w as usize) * 2;
|
||||||
|
linear_u16[dst..dst + (w as usize) * 2]
|
||||||
|
.copy_from_slice(&raw[src..src + (w as usize) * 2]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Step 2: expand each 16-bit RGB565 to Rgba8Unorm. The in-memory u16
|
||||||
|
// is little-endian after `apply_endian_32` has normalized the word
|
||||||
|
// order (we keep host-native byte ordering post-swap).
|
||||||
|
let mut rgba = vec![0u8; (w * h * 4) as usize];
|
||||||
|
for y in 0..h as usize {
|
||||||
|
for x in 0..w as usize {
|
||||||
|
let off = (y * w as usize + x) * 2;
|
||||||
|
let lo = linear_u16[off];
|
||||||
|
let hi = linear_u16[off + 1];
|
||||||
|
let word = u16::from_le_bytes([lo, hi]);
|
||||||
|
// 5 bits R (bits 11-15), 6 bits G (5-10), 5 bits B (0-4).
|
||||||
|
// Expand to full-range u8: replicate high bits into low
|
||||||
|
// (so 0b11111 → 0xFF, matching the standard 565→888 convention).
|
||||||
|
let r5 = ((word >> 11) & 0x1F) as u8;
|
||||||
|
let g6 = ((word >> 5) & 0x3F) as u8;
|
||||||
|
let b5 = (word & 0x1F) as u8;
|
||||||
|
let r = (r5 << 3) | (r5 >> 2);
|
||||||
|
let g = (g6 << 2) | (g6 >> 4);
|
||||||
|
let b = (b5 << 3) | (b5 >> 2);
|
||||||
|
let o = (y * w as usize + x) * 4;
|
||||||
|
rgba[o] = r;
|
||||||
|
rgba[o + 1] = g;
|
||||||
|
rgba[o + 2] = b;
|
||||||
|
rgba[o + 3] = 0xFF;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(rgba)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Version-aware CPU-side texture cache. Entries are keyed on
|
||||||
|
/// `TextureKey.hash` and carry a `version_when_uploaded` watermark against
|
||||||
|
/// the guest memory's page-version counter. `ensure_cached` queries
|
||||||
|
/// `GuestMemory::max_page_version` over the texture's byte span; if the
|
||||||
|
/// span has been written since cache time, the entry is re-decoded.
|
||||||
|
pub struct TextureCache {
|
||||||
|
entries: HashMap<TextureKey, CachedTexture>,
|
||||||
|
/// Monotonic counter of decodes performed — HUD surface.
|
||||||
|
pub decodes_total: u64,
|
||||||
|
/// Count of stale-miss re-decodes.
|
||||||
|
pub restale_total: u64,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Default for TextureCache {
|
||||||
|
fn default() -> Self {
|
||||||
|
Self::new()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl TextureCache {
|
||||||
|
pub fn new() -> Self {
|
||||||
|
Self {
|
||||||
|
entries: HashMap::new(),
|
||||||
|
decodes_total: 0,
|
||||||
|
restale_total: 0,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn len(&self) -> usize {
|
||||||
|
self.entries.len()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn is_empty(&self) -> bool {
|
||||||
|
self.entries.is_empty()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn get(&self, key: &TextureKey) -> Option<&CachedTexture> {
|
||||||
|
self.entries.get(key)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Return a cached (or freshly-decoded) texture. The caller supplies
|
||||||
|
/// the current guest-memory page version covering the texture span;
|
||||||
|
/// see [`max_page_version_for`].
|
||||||
|
pub fn ensure_cached(
|
||||||
|
&mut self,
|
||||||
|
key: TextureKey,
|
||||||
|
current_version: u64,
|
||||||
|
mem: &dyn xenia_memory::MemoryAccess,
|
||||||
|
) -> Result<&CachedTexture, DecodeError> {
|
||||||
|
// Fast path: fresh entry exists.
|
||||||
|
if let Some(e) = self.entries.get(&key) {
|
||||||
|
if e.version_when_uploaded >= current_version {
|
||||||
|
return Ok(self.entries.get(&key).unwrap());
|
||||||
|
}
|
||||||
|
self.restale_total += 1;
|
||||||
|
}
|
||||||
|
let bytes = match key.format {
|
||||||
|
TextureFormat::K8888 => decode_k8888_tiled(&key, mem)?,
|
||||||
|
TextureFormat::K565 => decode_k565_tiled(&key, mem)?,
|
||||||
|
TextureFormat::Dxt1 => decode_dxt1_tiled(&key, mem)?,
|
||||||
|
TextureFormat::Dxt2_3 => decode_dxt23_tiled(&key, mem)?,
|
||||||
|
TextureFormat::Dxt4_5 => decode_dxt45_tiled(&key, mem)?,
|
||||||
|
_ => return Err(DecodeError::UnsupportedFormat),
|
||||||
|
};
|
||||||
|
self.decodes_total += 1;
|
||||||
|
let entry = CachedTexture {
|
||||||
|
key,
|
||||||
|
version_when_uploaded: current_version,
|
||||||
|
bytes,
|
||||||
|
};
|
||||||
|
self.entries.insert(key, entry);
|
||||||
|
Ok(self.entries.get(&key).unwrap())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn byte_budget(&self) -> usize {
|
||||||
|
self.entries.values().map(|e| e.byte_size()).sum()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    use std::cell::Cell;

    /// Flat stand-in for guest memory, addressed from 0 and big-endian for
    /// multi-byte accesses. Reads past the end yield 0; writes past the end
    /// are dropped. `Cell` lets tests mutate bytes through `&self`.
    struct FakeMem(Box<[Cell<u8>]>);

    impl FakeMem {
        fn from_vec(v: Vec<u8>) -> Self {
            let cells: Box<[Cell<u8>]> = v.into_iter().map(Cell::new).collect();
            FakeMem(cells)
        }
    }

    impl xenia_memory::MemoryAccess for FakeMem {
        fn read_u8(&self, a: u32) -> u8 {
            self.0.get(a as usize).map_or(0, Cell::get)
        }

        fn read_u16(&self, a: u32) -> u16 {
            u16::from_be_bytes(std::array::from_fn(|i| self.read_u8(a + i as u32)))
        }

        fn read_u32(&self, a: u32) -> u32 {
            u32::from_be_bytes(std::array::from_fn(|i| self.read_u8(a + i as u32)))
        }

        fn read_u64(&self, a: u32) -> u64 {
            u64::from_be_bytes(std::array::from_fn(|i| self.read_u8(a + i as u32)))
        }

        fn write_u8(&self, a: u32, v: u8) {
            if let Some(slot) = self.0.get(a as usize) {
                slot.set(v);
            }
        }

        fn write_u16(&self, a: u32, v: u16) {
            for (i, &byte) in v.to_be_bytes().iter().enumerate() {
                self.write_u8(a + i as u32, byte);
            }
        }

        fn write_u32(&self, a: u32, v: u32) {
            for (i, &byte) in v.to_be_bytes().iter().enumerate() {
                self.write_u8(a + i as u32, byte);
            }
        }

        fn write_u64(&self, a: u32, v: u64) {
            for (i, &byte) in v.to_be_bytes().iter().enumerate() {
                self.write_u8(a + i as u32, byte);
            }
        }

        fn translate(&self, _: u32) -> Option<*const u8> {
            None
        }

        fn translate_mut(&self, _: u32) -> Option<*mut u8> {
            None
        }
    }

    /// Untiled 4×4 2D key at guest address 0 with a 32-texel pitch. Tests
    /// that need other dimensions override fields with struct-update syntax.
    fn linear_key(format: TextureFormat, endian: Endian) -> TextureKey {
        TextureKey {
            base_address: 0,
            width: 4,
            height: 4,
            depth_or_slices: 1,
            format,
            endian,
            dimension: Dimension::D2,
            tiled: false,
            pitch_texels: 32,
        }
    }

    #[test]
    fn format_block_info_matches_canary_expectations() {
        let expected = [
            (TextureFormat::K8888, BlockInfo::new(1, 1, 4)),
            (TextureFormat::Dxt1, BlockInfo::new(4, 4, 8)),
            (TextureFormat::Dxt4_5, BlockInfo::new(4, 4, 16)),
        ];
        for (format, info) in expected {
            assert_eq!(format.block_info(), info);
        }
    }

    #[test]
    fn endian_swap_variants() {
        let cases = [
            (Endian::None, 0x11223344),
            (Endian::Swap8In16, 0x22114433),
            (Endian::Swap8In32, 0x44332211),
            (Endian::Swap16In32, 0x33441122),
        ];
        for (endian, swapped) in cases {
            assert_eq!(endian.swap32(0x11223344), swapped);
        }
    }

    #[test]
    fn decode_fetch_constant_rejects_empty() {
        assert!(decode_fetch_constant([0u32; 6]).is_none());
    }

    #[test]
    fn decode_fetch_constant_parses_2d_k8888() {
        // Synthetic k_8_8_8_8 2D texture fetch constant:
        //   dword0: pitch_5=40 (1280/32), tiled=1, type=2
        //   dword1: format=6 (K8888), endian=2 (Swap8In32), base=0xAB000>>12
        //   dword2: width-1=1279, height-1=719
        //   dword5: dimension=1 (2D)
        let d0 = 0x8000_0000 | (40u32 << 22) | 2;
        let d1 = (0xAB000u32 >> 12 << 12) | (2u32 << 6) | 6u32;
        let d2 = 1279u32 | ((719u32) << 13);
        let d5 = 1u32 << 9;

        let key = decode_fetch_constant([d0, d1, d2, 0, 0, d5]).expect("parsed");

        assert_eq!(key.format, TextureFormat::K8888);
        assert_eq!(key.endian, Endian::Swap8In32);
        assert_eq!(key.width, 1280);
        assert_eq!(key.height, 720);
        assert_eq!(key.dimension, Dimension::D2);
        assert!(key.tiled);
        assert_eq!(key.pitch_texels, 1280);
    }

    #[test]
    fn decode_k8888_roundtrip_linear() {
        // 4×4 non-tiled image with pitch=32 (one macro-tile row). The pixel
        // at (x, y) stores ARGB = (0xFF, x, y, y*4+x) as a big-endian dword.
        // After Swap8In32 + B↔R swizzle the output texel must read
        // (x, y, y*4+x, 0xFF) in RGBA order.
        const W: u32 = 4;
        const H: u32 = 4;
        const PITCH: u32 = 32;

        let mut bytes = vec![0u8; (PITCH * H * 4) as usize];
        for y in 0..H {
            for x in 0..W {
                let off = ((y * PITCH + x) * 4) as usize;
                let argb = (0xFFu32 << 24) | (x << 16) | (y << 8) | (y * 4 + x);
                bytes[off..off + 4].copy_from_slice(&argb.to_be_bytes());
            }
        }
        let mem = FakeMem::from_vec(bytes);
        let key = linear_key(TextureFormat::K8888, Endian::Swap8In32);

        let out = decode_k8888_tiled(&key, &mem).expect("decode");

        assert_eq!(out.len(), 16 * 4);
        assert_eq!(&out[0..4], &[0, 0, 0, 0xFF]);
        // Last texel, (3, 3), in the tightly packed 4-wide output.
        let last = (3 * 4 + 3) * 4;
        assert_eq!(&out[last..last + 4], &[3, 3, 15, 0xFF]);
    }

    // ── First-Pixels M5 format tests ──────────────────────────────

    /// BC2 (DXT2_3) roundtrip: 16-byte blocks, 4×4 image = 1 block.
    /// The source block holds the bytes 0..16; with `Endian::None` the
    /// decoder must hand the same bytes back unchanged.
    #[test]
    fn decode_dxt23_small_roundtrip() {
        // 4×4 texture = 1 BC2 block (16 bytes). With pitch_texels=32
        // (macro-tile-aligned) the block pitch is 8 (=32/4), so the source
        // is 8*1*16 = 128 bytes: the block itself followed by zero padding.
        let bytes: Vec<u8> = (0u8..16).chain(std::iter::repeat(0).take(112)).collect();
        let mem = FakeMem::from_vec(bytes);
        // No endian swap, so passthrough can be checked byte for byte.
        let key = linear_key(TextureFormat::Dxt2_3, Endian::None);

        let out = decode_dxt23_tiled(&key, &mem).expect("decode");

        let expected: Vec<u8> = (0u8..16).collect();
        assert_eq!(out.len(), 16); // 1 block × 16 bytes
        assert_eq!(&out[..], &expected[..]);
    }

    /// BC3 (DXT4_5) uses the same 16-byte block infra as BC2; a parallel
    /// test guards against a regression sneaking in via the generic
    /// `decode_dxt_tiled`.
    #[test]
    fn decode_dxt45_uses_16byte_blocks() {
        let mem = FakeMem::from_vec(vec![0xAAu8; 256]);
        let key = TextureKey {
            width: 8,
            height: 4, // 2×1 blocks
            ..linear_key(TextureFormat::Dxt4_5, Endian::None)
        };

        let out = decode_dxt45_tiled(&key, &mem).expect("decode");

        assert_eq!(out.len(), 2 * 16);
    }

    /// k_5_6_5: a white texel (0xFFFF) must expand to RGBA8 white, and a
    /// pure-red texel (R=31, G=0, B=0 → word 0xF800) must expand to
    /// R=255 G=0 B=0 via the high-bit-replicate convention.
    #[test]
    fn decode_k565_texel_expansion() {
        // 2×1 non-tiled k_5_6_5 image (pitch=32 texels → 32 × 1 × 2 = 64
        // bytes): texel[0] = 0xFFFF (white), texel[1] = 0xF800 (pure red).
        let mut bytes = vec![0u8; 64];
        bytes[0] = 0xFF;
        bytes[1] = 0xFF;
        // With Endian::None the decoder reads each word little-endian, so
        // 0xF800 is laid down low byte first.
        bytes[2] = 0x00;
        bytes[3] = 0xF8;
        let mem = FakeMem::from_vec(bytes);
        let key = TextureKey {
            width: 2,
            height: 1,
            ..linear_key(TextureFormat::K565, Endian::None)
        };

        let out = decode_k565_tiled(&key, &mem).expect("decode");

        assert_eq!(out.len(), 2 * 4);
        // Texel 0: white.
        assert_eq!(&out[0..4], &[0xFF, 0xFF, 0xFF, 0xFF]);
        // Texel 1: pure red via 5-bit-expand (0b11111 → 0xFF).
        assert_eq!(&out[4..8], &[0xFF, 0x00, 0x00, 0xFF]);
    }

    #[test]
    fn is_host_supported_covers_m5_formats() {
        for format in [
            TextureFormat::K8888,
            TextureFormat::K565,
            TextureFormat::Dxt1,
            TextureFormat::Dxt2_3,
            TextureFormat::Dxt4_5,
        ] {
            assert!(format.is_host_supported());
        }
        // Unsupported formats should still report false.
        for format in [TextureFormat::K16, TextureFormat::K32Float] {
            assert!(!format.is_host_supported());
        }
    }

    #[test]
    fn texture_cache_caches_and_reuses() {
        let mut cache = TextureCache::new();
        let mem = FakeMem::from_vec(vec![0u8; 8 * 1024]);
        let key = linear_key(TextureFormat::K8888, Endian::None);

        cache.ensure_cached(key, 0, &mem).unwrap();
        assert_eq!(cache.decodes_total, 1);

        // Same version: should hit cache.
        cache.ensure_cached(key, 0, &mem).unwrap();
        assert_eq!(cache.decodes_total, 1);

        // Higher version: stale → re-decode.
        cache.ensure_cached(key, 1, &mem).unwrap();
        assert_eq!(cache.decodes_total, 2);
        assert_eq!(cache.restale_total, 1);
    }

    /// End-to-end P5 test: a 6-dword fetch constant → decoded `TextureKey`
    /// → `ensure_cached` on fresh/version-bumped memory → stale re-decode.
    /// Mirrors what `vd_swap` does per frame.
    #[test]
    fn e2e_fetch_const_to_cache_with_versioning() {
        // 4×4 k_8_8_8_8 2D tiled texture, pitch=32 aligned. Note that
        // `0x100 >> 12 << 12` evaluates to 0, so no base-address bits are
        // actually set in dword1.
        let d0 = 0x8000_0000u32 | (1u32 << 22) | 2; // pitch_5=1, tiled, type=2
        let d1 = (0x100u32 >> 12 << 12) | (0u32 << 6) | 6; // K8888, endian=none
        let d2 = 3u32 | (3u32 << 13); // width-1=3, height-1=3
        let d5 = 1u32 << 9; // 2D
        let key = decode_fetch_constant([d0, d1, d2, 0, 0, d5]).expect("decoded");
        assert_eq!(key.format, TextureFormat::K8888);
        assert_eq!(key.width, 4);

        let mem = FakeMem::from_vec(vec![0xAAu8; 4 * 1024]);
        let mut cache = TextureCache::new();

        // v0 decode.
        let first = cache
            .ensure_cached(key, 0, &mem)
            .expect("initial decode")
            .clone();

        // Same version → cache hit.
        cache.ensure_cached(key, 0, &mem).expect("hit");
        assert_eq!(cache.decodes_total, 1);

        // Simulate the guest writing to the texture's pages: version bumps.
        for cell in &mem.0[..16] {
            cell.set(0xFF);
        }
        cache.ensure_cached(key, 1, &mem).expect("re-decode");
        assert_eq!(cache.decodes_total, 2);
        assert_eq!(cache.restale_total, 1);

        // Bytes differ from v0 (proof the re-decode happened).
        let second = cache.get(&key).unwrap();
        assert_ne!(first.bytes, second.bytes);
    }

    #[test]
    fn texture_cache_rejects_unsupported_format() {
        let mut cache = TextureCache::new();
        let mem = FakeMem::from_vec(vec![0u8; 1024]);
        let key = linear_key(TextureFormat::K16, Endian::None);

        let result = cache.ensure_cached(key, 0, &mem);

        assert!(matches!(result, Err(DecodeError::UnsupportedFormat)));
    }
}
|
||||||
178
crates/xenia-gpu/src/tiled_address.rs
Normal file
178
crates/xenia-gpu/src/tiled_address.rs
Normal file
@@ -0,0 +1,178 @@
|
|||||||
|
//! Xenos tiled-texture address formula (2D, Tiled2D layout).
|
||||||
|
//!
|
||||||
|
//! Port of `xenia-canary/src/xenia/gpu/texture_address.h:190-208` Tiled2D /
|
||||||
|
//! `TiledCombine` helpers. The Xbox 360 GPU stores textures in a 32×32-block
|
||||||
|
//! macro-tile pattern with bank+pipe interleave for its internal DRAM
|
||||||
|
//! banks; this formula inverts that so we can read pixels out in linear
|
||||||
|
//! order, given the tiled source buffer.
|
||||||
|
//!
|
||||||
|
//! We use this in two places during P4:
|
||||||
|
//! - `vd_swap` frontbuffer detile (1280×720 k_8_8_8_8 usually).
|
||||||
|
//! - Any place we need to read tiled guest memory into a host-linear
|
||||||
|
//! buffer for CPU-side conversion before upload.
|
||||||
|
|
||||||
|
/// log2 of the macro-tile width in pixels (2^5 = 32 px), as in canary.
pub const MACRO_TILE_WIDTH_LOG2: u32 = 5;
/// log2 of the 2D macro-tile height in pixels (2^5 = 32 px), as in canary.
pub const MACRO_TILE_HEIGHT_2D_LOG2: u32 = 5;
|
||||||
|
|
||||||
|
/// Canary's `TiledCombine` helper: spread the bits of the outer/inner byte
/// offset around the bank, pipe and y-LSB interleave bits to form the final
/// tiled address.
///
/// Output bit layout, low to high:
/// * bits 0-3: `outer_inner_bytes` bits 0-3
/// * bit 4: `y_lsb`
/// * bit 5: `outer_inner_bytes` bit 4
/// * bits 6-7: `pipe`
/// * bits 8-10: `outer_inner_bytes` bits 5-7
/// * bit 11: `bank`
/// * bits 12+: `outer_inner_bytes` bits 8 and up
///
/// `bank`, `pipe` and `y_lsb` are OR-ed in unmasked, so callers must pass
/// values that already fit their field.
#[inline]
fn tiled_combine(outer_inner_bytes: u32, bank: u32, pipe: u32, y_lsb: u32) -> u32 {
    let bits_0_3 = outer_inner_bytes & 0b1111;
    let bit_4 = (outer_inner_bytes >> 4) & 0b1;
    let bits_5_7 = (outer_inner_bytes >> 5) & 0b111;
    let bits_8_up = outer_inner_bytes >> 8;

    bits_0_3
        | (y_lsb << 4)
        | (bit_4 << 5)
        | (pipe << 6)
        | (bits_5_7 << 8)
        | (bank << 11)
        | (bits_8_up << 12)
}
|
||||||
|
|
||||||
|
/// 2D tiled offset in bytes from (x, y) into a tiled surface with
|
||||||
|
/// `pitch_aligned` pixel pitch (rounded to the macro-tile width) and
|
||||||
|
/// `bytes_per_block_log2` bytes-per-element (e.g. 2 for RGBA8888 → 4-byte
|
||||||
|
/// blocks → log2 = 2). Always non-negative for in-bounds inputs; returns
|
||||||
|
/// `u32` rather than canary's signed `int` since our callers stay in
|
||||||
|
/// unsigned arithmetic.
|
||||||
|
///
|
||||||
|
/// This is the canonical formula — do not simplify without re-reading
|
||||||
|
/// `texture_address.h:190-208`; the bit-interleave cannot be expressed
|
||||||
|
/// as a linear function.
|
||||||
|
pub fn tiled_2d_offset(x: u32, y: u32, pitch_aligned: u32, bytes_per_block_log2: u32) -> u32 {
|
||||||
|
let macro_tile_cols = pitch_aligned >> MACRO_TILE_WIDTH_LOG2;
|
||||||
|
// Outer: which 32×32 macro tile we're in.
|
||||||
|
let outer_blocks = (((y >> MACRO_TILE_HEIGHT_2D_LOG2) * macro_tile_cols)
|
||||||
|
+ (x >> MACRO_TILE_WIDTH_LOG2))
|
||||||
|
<< 6;
|
||||||
|
// Inner: where we are within the 32×32 macro tile (Y dropped by 1 bit
|
||||||
|
// because that bit becomes the `y_lsb` interleave bit below).
|
||||||
|
let inner_blocks = (((y >> 1) & 0b111) << 3) | (x & 0b111);
|
||||||
|
let outer_inner_bytes = (outer_blocks | inner_blocks) << bytes_per_block_log2;
|
||||||
|
|
||||||
|
let bank = (y >> 4) & 0b1;
|
||||||
|
let pipe = ((x >> 3) & 0b11) ^ (((y >> 3) & 0b1) << 1);
|
||||||
|
let y_lsb = y & 1;
|
||||||
|
|
||||||
|
tiled_combine(outer_inner_bytes, bank, pipe, y_lsb)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Round `pitch_pixels` up to the nearest multiple of the macro-tile width
|
||||||
|
/// (32 px). Xenos requires tile-aligned pitches; non-aligned values pad.
|
||||||
|
#[inline]
|
||||||
|
pub fn align_pitch_to_macro_tile(pitch_pixels: u32) -> u32 {
|
||||||
|
let mask = (1u32 << MACRO_TILE_WIDTH_LOG2) - 1;
|
||||||
|
(pitch_pixels + mask) & !mask
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Detile a 2D tiled surface into a linear destination buffer. The
|
||||||
|
/// closure `block_bytes(src_offset) -> &[u8]` returns a slice pointing at
|
||||||
|
/// one block in the tiled source, and the detiler writes it into `dst`
|
||||||
|
/// at the linear (x, y) position.
|
||||||
|
///
|
||||||
|
/// `bpp` is the bytes-per-block (4 for RGBA8888, 1 for a1r5g5b5 stored as
|
||||||
|
/// a single 16-bit block, etc.). `dst` must be at least
|
||||||
|
/// `width * height * bpp` bytes long.
|
||||||
|
///
|
||||||
|
/// Returns `Err(())` if the source doesn't contain enough bytes for the
|
||||||
|
/// largest offset the formula would produce (defensive — callers can
|
||||||
|
/// downgrade silently).
|
||||||
|
pub fn detile_2d(
|
||||||
|
src: &[u8],
|
||||||
|
dst: &mut [u8],
|
||||||
|
width: u32,
|
||||||
|
height: u32,
|
||||||
|
pitch_pixels: u32,
|
||||||
|
bpp: u32,
|
||||||
|
) -> Result<(), ()> {
|
||||||
|
let bpp_log2 = bpp.trailing_zeros();
|
||||||
|
let pitch_aligned = align_pitch_to_macro_tile(pitch_pixels);
|
||||||
|
let dst_pitch_bytes = (width * bpp) as usize;
|
||||||
|
let bpp_u = bpp as usize;
|
||||||
|
|
||||||
|
for y in 0..height {
|
||||||
|
for x in 0..width {
|
||||||
|
let src_off = tiled_2d_offset(x, y, pitch_aligned, bpp_log2) as usize;
|
||||||
|
if src_off + bpp_u > src.len() {
|
||||||
|
return Err(());
|
||||||
|
}
|
||||||
|
let dst_off = (y as usize) * dst_pitch_bytes + (x as usize) * bpp_u;
|
||||||
|
if dst_off + bpp_u > dst.len() {
|
||||||
|
return Err(());
|
||||||
|
}
|
||||||
|
dst[dst_off..dst_off + bpp_u].copy_from_slice(&src[src_off..src_off + bpp_u]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    /// The (0, 0) pixel is always at byte offset 0 regardless of pitch.
    #[test]
    fn origin_is_zero() {
        assert_eq!(tiled_2d_offset(0, 0, 64, 2), 0);
    }

    /// Round-trip: detiling a tiled buffer that was filled using the same
    /// formula produces the identity linear image.
    #[test]
    fn roundtrip_small_pattern() {
        const W: u32 = 32;
        const H: u32 = 16;
        const BPP: u32 = 4;
        let pitch = align_pitch_to_macro_tile(W);

        // Recognisable 4-byte pattern for the logical pixel (x, y).
        let texel = |x: u32, y: u32| [x as u8, y as u8, (x ^ y) as u8, 0xFF];
        // Every (x, y) of the image in row-major order.
        let coords = || (0..H).flat_map(|y| (0..W).map(move |x| (x, y)));

        // Size the tiled buffer for the largest offset the formula produces.
        let tiled_len = coords()
            .map(|(x, y)| tiled_2d_offset(x, y, pitch, 2) as usize + 4)
            .max()
            .unwrap();
        let mut tiled = vec![0u8; tiled_len];
        for (x, y) in coords() {
            let off = tiled_2d_offset(x, y, pitch, 2) as usize;
            tiled[off..off + 4].copy_from_slice(&texel(x, y));
        }

        let mut linear = vec![0u8; (W * H * BPP) as usize];
        detile_2d(&tiled, &mut linear, W, H, pitch, BPP).expect("detile ok");

        // Every logical pixel must have landed at its linear offset.
        for (x, y) in coords() {
            let lin = ((y * W + x) * BPP) as usize;
            assert_eq!(&linear[lin..lin + 4], &texel(x, y)[..]);
        }
    }

    /// Within a 32×16 region of a single macro tile no two pixels may map
    /// to the same offset (no aliasing).
    #[test]
    fn neighbouring_pixels_have_distinct_offsets() {
        let offsets: std::collections::HashSet<u32> = (0..16)
            .flat_map(|y| (0..32).map(move |x| tiled_2d_offset(x, y, 32, 2)))
            .collect();
        assert_eq!(offsets.len(), 16 * 32);
    }

    /// Pitch alignment rounds up to a multiple of 32: 1280 stays 1280,
    /// 1281 rounds to 1312.
    #[test]
    fn align_pitch_rounds_up_to_32() {
        for (pitch, aligned) in [(1280, 1280), (1281, 1312), (31, 32)] {
            assert_eq!(align_pitch_to_macro_tile(pitch), aligned);
        }
    }
}
|
||||||
557
crates/xenia-gpu/src/translator.rs
Normal file
557
crates/xenia-gpu/src/translator.rs
Normal file
@@ -0,0 +1,557 @@
|
|||||||
|
//! Xenos → WGSL direct translator (P7).
|
||||||
|
//!
|
||||||
|
//! Replaces the runtime uber-shader interpreter (P3b/P3c) for shaders whose
|
||||||
|
//! feature set we cover. Emits a *standalone* WGSL module per shader
|
||||||
|
//! instead of walking a ucode buffer at draw time — pipeline compilation
|
||||||
|
//! happens once, then every subsequent dispatch is a direct `draw()`.
|
||||||
|
//!
|
||||||
|
//! The translator is deliberately narrow: when it encounters an opcode /
|
||||||
|
//! fetch format / CF shape it doesn't know, it returns [`Translation::Reject`] and the
|
||||||
|
//! caller falls back to the interpreter. This keeps the op-coverage work
|
||||||
|
//! incremental — future commits can add one opcode at a time without
|
||||||
|
//! invalidating the scaffolding.
|
||||||
|
//!
|
||||||
|
//! Current coverage (v1):
|
||||||
|
//! * Linear CF: `Exec`/`ExecEnd`, `Alloc`, `Exit`. No loops / branches /
|
||||||
|
//! calls / predicate-gated clauses.
|
||||||
|
//! * ALU vector: `ADD`, `MUL`, `MAX`, `MIN`, `MAD`, `DP4`, `DP3`,
|
||||||
|
//! `DP2_ADD`, `SEQ`, `SGT`, `SGE`, `SNE`, `FRC`, `FLOOR`.
|
||||||
|
//! * ALU scalar: `ADDS`, `MULS`, `MAXS`, `MINS`, `RCP`, `RETAIN_PREV`.
|
||||||
|
//! * Vertex fetch: `R32G32B32A32_FLOAT` only.
|
||||||
|
//! * Texture fetch: 2D via the single `@group(1)` slot (same one P5/M6
|
||||||
|
//! binds).
|
||||||
|
//! * Exports: VS writes position + interpolator 0 (color); PS writes
|
||||||
|
//! color0.
|
||||||
|
//!
|
||||||
|
//! When a shader exceeds this subset, [`translate`] returns [`Translation::Reject`] and
|
||||||
|
//! `gpu.shader.translate_reject{reason}` is bumped by the caller.
|
||||||
|
|
||||||
|
use crate::ucode::alu::{decode_alu, sop, vop, AluInstruction};
|
||||||
|
use crate::ucode::control_flow::{AllocKind, ControlFlowInstruction};
|
||||||
|
use crate::ucode::fetch::{decode_fetch, FetchInstruction};
|
||||||
|
use crate::ucode::ParsedShader;
|
||||||
|
|
||||||
|
/// Which shader stage a translation is emitted for. It selects the WGSL
/// entry point (`vs_main` or `fs_main`) and the export slots that go with it.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Stage {
    /// Vertex shader.
    Vertex,
    /// Pixel (fragment) shader.
    Pixel,
}
|
||||||
|
|
||||||
|
/// Outcome of translating one shader stage. On [`Translation::Reject`] the
/// caller falls back to the runtime uber-shader interpreter.
#[derive(Debug)]
pub enum Translation {
    /// Emitted WGSL for *this stage only*. A VS body and a PS body are
    /// joined into one module by [`combine_stages`].
    Ok(String),
    /// The translator met an op or pattern it does not handle; the payload
    /// is a short static reason string.
    Reject(&'static str),
}
|
||||||
|
|
||||||
|
/// Full WGSL module for a (VS, PS) pair ready to hand to
|
||||||
|
/// `wgpu::Device::create_shader_module`. Shares the header across the two
|
||||||
|
/// bodies so bindings, struct declarations, and helpers aren't duplicated.
|
||||||
|
pub fn combine_stages(vs_body: &str, ps_body: &str) -> String {
|
||||||
|
let mut out = String::with_capacity(4096 + vs_body.len() + ps_body.len());
|
||||||
|
out.push_str(MODULE_HEADER);
|
||||||
|
out.push_str(vs_body);
|
||||||
|
out.push_str(ps_body);
|
||||||
|
out
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Translate a single shader stage. Returns `None` on any unsupported
|
||||||
|
/// feature with a short reason string that the caller plumbs into the
|
||||||
|
/// `gpu.shader.translate_reject{reason}` metric.
|
||||||
|
pub fn translate(parsed: &ParsedShader, stage: Stage) -> Translation {
|
||||||
|
let mut ctx = EmitCtx::new(stage);
|
||||||
|
// Emit the stage entry function body.
|
||||||
|
if let Err(reason) = ctx.emit_stage_body(parsed) {
|
||||||
|
return Translation::Reject(reason);
|
||||||
|
}
|
||||||
|
Translation::Ok(ctx.finish())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Reject reasons, kept as `&'static str` so reporting one to metrics needs
/// no allocation.
pub mod reject {
    /// A vector-pipe ALU opcode the translator has no expression for.
    pub const VEC_OP_UNSUPPORTED: &str = "vec_op_unsupported";
    /// A scalar-pipe ALU opcode the translator does not cover.
    pub const SCL_OP_UNSUPPORTED: &str = "scl_op_unsupported";
    /// Control flow contains a loop start or loop end.
    pub const CF_LOOP: &str = "cf_loop";
    /// Control flow contains a conditional jump or a predicated exec clause.
    pub const CF_COND: &str = "cf_cond";
    /// Control flow contains a conditional call or a return.
    pub const CF_CALL: &str = "cf_call";
    /// Control flow contains an instruction the decoder marked unknown.
    pub const CF_UNKNOWN: &str = "cf_unknown";
    /// A fetch instruction the translator cannot emit.
    pub const VFETCH_FMT: &str = "vfetch_fmt";
    /// A texture fetch whose dimension is not 2D.
    pub const TFETCH_NON2D: &str = "tfetch_non2d";
    /// An exec clause addresses instruction words past the end of the shader.
    pub const INSTR_OOB: &str = "instr_oob";
}
|
||||||
|
|
||||||
|
/// WGSL preamble shared by every translated module: the constant-buffer
/// structs, the `@group(0)` / `@group(1)` bindings, the stage I/O structs
/// (`VsOut`, `FsOut`) and the `xe_rcp` helper.
///
/// The bindings mirror the xenos pipeline's `@group(0)` + `@group(1)` layout
/// from P5/M6 so **the same bind-group slots** can be used; only the
/// pipeline object differs between interpreter mode and translator mode.
const MODULE_HEADER: &str = r#"
struct XenosDrawConstants {
    draw_index: u32,
    vertex_count: u32,
    prim_kind: u32,
    _pad: u32,
};

struct XenosConstants {
    alu: array<vec4<f32>, 512>,
    fetch: array<u32, 256>,
    bool_consts: array<u32, 8>,
    loop_consts: array<u32, 32>,
};

@group(0) @binding(0) var<uniform> draw_ctx : XenosDrawConstants;
@group(0) @binding(1) var<storage, read> xenos_consts : XenosConstants;
@group(0) @binding(2) var<storage, read> vs_ucode : array<u32>;
@group(0) @binding(3) var<storage, read> ps_ucode : array<u32>;
@group(0) @binding(4) var<storage, read> vertex_buffer : array<u32>;

@group(1) @binding(0) var xenos_tex : texture_2d<f32>;
@group(1) @binding(1) var xenos_samp : sampler;

struct VsOut {
    @builtin(position) position: vec4<f32>,
    @location(0) color: vec4<f32>,
};

struct FsOut {
    @location(0) color0: vec4<f32>,
};

// Helper: reciprocal guarded against divide-by-zero.
fn xe_rcp(x: f32) -> f32 {
    return select(0.0, 1.0 / x, x != 0.0);
}
"#;
|
||||||
|
|
||||||
|
struct EmitCtx {
|
||||||
|
stage: Stage,
|
||||||
|
out: String,
|
||||||
|
indent: usize,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl EmitCtx {
|
||||||
|
fn new(stage: Stage) -> Self {
|
||||||
|
Self {
|
||||||
|
stage,
|
||||||
|
out: String::with_capacity(2048),
|
||||||
|
indent: 0,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn finish(self) -> String {
|
||||||
|
self.out
|
||||||
|
}
|
||||||
|
|
||||||
|
fn push(&mut self, s: &str) {
|
||||||
|
for _ in 0..self.indent {
|
||||||
|
self.out.push_str(" ");
|
||||||
|
}
|
||||||
|
self.out.push_str(s);
|
||||||
|
self.out.push('\n');
|
||||||
|
}
|
||||||
|
|
||||||
|
fn emit_stage_body(&mut self, parsed: &ParsedShader) -> Result<(), &'static str> {
|
||||||
|
// Entry function + struct header.
|
||||||
|
match self.stage {
|
||||||
|
Stage::Vertex => {
|
||||||
|
self.push("@vertex");
|
||||||
|
self.push("fn vs_main(@builtin(vertex_index) vidx: u32) -> VsOut {");
|
||||||
|
}
|
||||||
|
Stage::Pixel => {
|
||||||
|
self.push("@fragment");
|
||||||
|
self.push("fn fs_main(in: VsOut) -> FsOut {");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
self.indent = 1;
|
||||||
|
// Register file + ps chain + export slots. All local `var`s so each
|
||||||
|
// invocation gets its own state; translator-emitted code doesn't
|
||||||
|
// need `var<private>` because we don't share across function calls.
|
||||||
|
self.push("var r: array<vec4<f32>, 128>;");
|
||||||
|
self.push("for (var i = 0u; i < 128u; i = i + 1u) { r[i] = vec4<f32>(0.0); }");
|
||||||
|
self.push("var ps: f32 = 0.0;");
|
||||||
|
match self.stage {
|
||||||
|
Stage::Vertex => {
|
||||||
|
// Seed r0 with vertex index for simple shaders that read it.
|
||||||
|
self.push("r[0] = vec4<f32>(f32(vidx), 0.0, 0.0, 1.0);");
|
||||||
|
// Synthetic export slots — match the interpreter's layout so
|
||||||
|
// the fallback path and translator path produce the same
|
||||||
|
// visual output on shaders both support.
|
||||||
|
self.push("var opos: vec4<f32> = vec4<f32>(0.0, 0.0, 0.0, 1.0);");
|
||||||
|
self.push("var ocolor: vec4<f32> = vec4<f32>(1.0, 1.0, 1.0, 1.0);");
|
||||||
|
}
|
||||||
|
Stage::Pixel => {
|
||||||
|
// Seed r0.xy with interpolated color lane so trivial shaders
|
||||||
|
// that read r0 still produce something.
|
||||||
|
self.push("r[0] = in.color;");
|
||||||
|
self.push("var ocolor0: vec4<f32> = in.color;");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut current_alloc = AllocKind::Other;
|
||||||
|
for clause in &parsed.cf {
|
||||||
|
match clause {
|
||||||
|
ControlFlowInstruction::Exec {
|
||||||
|
address,
|
||||||
|
count,
|
||||||
|
sequence,
|
||||||
|
is_end,
|
||||||
|
predicated,
|
||||||
|
..
|
||||||
|
} => {
|
||||||
|
if *predicated {
|
||||||
|
return Err(reject::CF_COND);
|
||||||
|
}
|
||||||
|
self.emit_exec(parsed, *address, *count, *sequence, current_alloc)?;
|
||||||
|
if *is_end {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
ControlFlowInstruction::Alloc { kind, .. } => {
|
||||||
|
current_alloc = *kind;
|
||||||
|
}
|
||||||
|
ControlFlowInstruction::Exit => break,
|
||||||
|
ControlFlowInstruction::LoopStart { .. }
|
||||||
|
| ControlFlowInstruction::LoopEnd { .. } => return Err(reject::CF_LOOP),
|
||||||
|
ControlFlowInstruction::CondJmp { .. } => return Err(reject::CF_COND),
|
||||||
|
ControlFlowInstruction::CondCall { .. } | ControlFlowInstruction::Return => {
|
||||||
|
return Err(reject::CF_CALL);
|
||||||
|
}
|
||||||
|
ControlFlowInstruction::Unknown { .. } => return Err(reject::CF_UNKNOWN),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
match self.stage {
|
||||||
|
Stage::Vertex => {
|
||||||
|
self.push("var out: VsOut;");
|
||||||
|
self.push("out.position = opos;");
|
||||||
|
self.push("out.color = ocolor;");
|
||||||
|
self.push("return out;");
|
||||||
|
}
|
||||||
|
Stage::Pixel => {
|
||||||
|
self.push("var out: FsOut;");
|
||||||
|
self.push("out.color0 = ocolor0;");
|
||||||
|
self.push("return out;");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
self.indent = 0;
|
||||||
|
self.push("}");
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn emit_exec(
|
||||||
|
&mut self,
|
||||||
|
parsed: &ParsedShader,
|
||||||
|
address: u32,
|
||||||
|
count: u32,
|
||||||
|
sequence: u32,
|
||||||
|
current_alloc: AllocKind,
|
||||||
|
) -> Result<(), &'static str> {
|
||||||
|
for i in 0..(count as usize) {
|
||||||
|
let triple_idx = address as usize + i;
|
||||||
|
let base = triple_idx * 3;
|
||||||
|
if base + 2 >= parsed.instructions.len() {
|
||||||
|
return Err(reject::INSTR_OOB);
|
||||||
|
}
|
||||||
|
let words = [
|
||||||
|
parsed.instructions[base],
|
||||||
|
parsed.instructions[base + 1],
|
||||||
|
parsed.instructions[base + 2],
|
||||||
|
];
|
||||||
|
let is_fetch = ((sequence >> (i * 2 + 1)) & 1) != 0;
|
||||||
|
if is_fetch {
|
||||||
|
match decode_fetch(words) {
|
||||||
|
FetchInstruction::Vertex(vf) => self.emit_vfetch(&vf)?,
|
||||||
|
FetchInstruction::Texture(tf) => {
|
||||||
|
if tf.dimension != 1 {
|
||||||
|
return Err(reject::TFETCH_NON2D);
|
||||||
|
}
|
||||||
|
self.emit_tfetch(&tf);
|
||||||
|
}
|
||||||
|
FetchInstruction::Unknown { .. } => return Err(reject::VFETCH_FMT),
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
let alu = decode_alu(words);
|
||||||
|
self.emit_alu(&alu, current_alloc)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn emit_alu(
|
||||||
|
&mut self,
|
||||||
|
alu: &AluInstruction,
|
||||||
|
current_alloc: AllocKind,
|
||||||
|
) -> Result<(), &'static str> {
|
||||||
|
let a = format!("r[{}u]", alu.src_a & 0x7F);
|
||||||
|
let b = format!("r[{}u]", alu.src_b & 0x7F);
|
||||||
|
let c = format!("r[{}u]", alu.src_c & 0x7F);
|
||||||
|
|
||||||
|
// Vector pipe.
|
||||||
|
if alu.vector_write_mask != 0 {
|
||||||
|
let expr = vector_expr(alu.vector_opcode, &a, &b, &c)
|
||||||
|
.ok_or(reject::VEC_OP_UNSUPPORTED)?;
|
||||||
|
let dst_reg = alu.vector_dest & 0x7F;
|
||||||
|
if alu.vector_dest_is_export {
|
||||||
|
self.emit_export(dst_reg, current_alloc, &expr, alu.vector_write_mask);
|
||||||
|
} else {
|
||||||
|
self.emit_masked_write(&format!("r[{dst_reg}u]"), &expr, alu.vector_write_mask);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Scalar pipe. Binary ops use (src_a.x, src_b.x); ps-variants use
|
||||||
|
// src_a.x + running ps. `scl_src_a` mirrors the interpreter's
|
||||||
|
// `scalar_src_is_ps` selector.
|
||||||
|
let scl_src_a = if alu.scalar_src_is_ps {
|
||||||
|
"ps".to_string()
|
||||||
|
} else {
|
||||||
|
format!("{}.x", a)
|
||||||
|
};
|
||||||
|
let scl_src_b = format!("{}.x", b);
|
||||||
|
let expr = scalar_expr(alu.scalar_opcode, &scl_src_a, &scl_src_b, "ps")
|
||||||
|
.ok_or(reject::SCL_OP_UNSUPPORTED)?;
|
||||||
|
self.push(&format!("ps = {expr};"));
|
||||||
|
if alu.scalar_write_mask != 0 {
|
||||||
|
let v = "vec4<f32>(ps, ps, ps, ps)";
|
||||||
|
let dst_reg = alu.scalar_dest & 0x7F;
|
||||||
|
self.emit_masked_write(&format!("r[{dst_reg}u]"), v, alu.scalar_write_mask);
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn emit_masked_write(&mut self, lhs: &str, rhs: &str, mask: u8) {
|
||||||
|
if mask == 0xF {
|
||||||
|
self.push(&format!("{lhs} = {rhs};"));
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
self.push(&"{".to_string());
|
||||||
|
self.indent += 1;
|
||||||
|
self.push(&format!("let _prev = {lhs};"));
|
||||||
|
self.push(&format!("let _new = {rhs};"));
|
||||||
|
let mut components = Vec::new();
|
||||||
|
let letters = ['x', 'y', 'z', 'w'];
|
||||||
|
for (i, c) in letters.iter().enumerate() {
|
||||||
|
if (mask >> i) & 1 == 1 {
|
||||||
|
components.push(format!("_new.{c}"));
|
||||||
|
} else {
|
||||||
|
components.push(format!("_prev.{c}"));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
self.push(&format!(
|
||||||
|
"{lhs} = vec4<f32>({}, {}, {}, {});",
|
||||||
|
components[0], components[1], components[2], components[3]
|
||||||
|
));
|
||||||
|
self.indent -= 1;
|
||||||
|
self.push("}");
|
||||||
|
}
|
||||||
|
|
||||||
|
fn emit_export(&mut self, dst_reg: u8, alloc: AllocKind, expr: &str, mask: u8) {
|
||||||
|
// Xenos's export "register" indexing within an alloc range is
|
||||||
|
// normally (alloc_base + offset). Since our CF stream doesn't
|
||||||
|
// carry per-export slot offsets cleanly, use `alloc` to pick the
|
||||||
|
// target.
|
||||||
|
let lhs = match (self.stage, alloc) {
|
||||||
|
(Stage::Vertex, AllocKind::Position) => "opos",
|
||||||
|
(Stage::Vertex, AllocKind::Interpolators) => "ocolor",
|
||||||
|
(Stage::Vertex, AllocKind::Colors) => "ocolor",
|
||||||
|
(Stage::Vertex, _) => "ocolor", // fall through — any other alloc
|
||||||
|
(Stage::Pixel, AllocKind::Colors) => "ocolor0",
|
||||||
|
(Stage::Pixel, _) => "ocolor0",
|
||||||
|
};
|
||||||
|
let _ = dst_reg; // per-slot export indexing reserved for a richer v2
|
||||||
|
self.emit_masked_write(lhs, expr, mask);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Emits a bounds-checked vertex fetch of four consecutive dwords.
///
/// v1 behaviour: every fetch is treated as four 32-bit floats with a stride
/// of four dwords. The emitted WGSL reads the fetch constant at index
/// `fetch_const * 2`, converts its address bits to a dword base, indexes by
/// `r[src].x`, and writes `r[dst]` only when all four dwords are inside
/// `vertex_buffer`. The instruction's write mask is not applied.
///
/// Always returns `Ok(())` at present.
fn emit_vfetch(&mut self, vf: &crate::ucode::fetch::VertexFetch) -> Result<(), &'static str> {
    let src_reg = vf.src_register & 0x7F;
    let dst_reg = vf.dest_register & 0x7F;
    // The fetch-constant index is re-read from bits 5..=9 of the first raw
    // dword and doubled to form the index into `xenos_consts.fetch`.
    let fetch_slot = ((vf.raw[0] >> 5) & 0x1F) * 2;
    let statement = format!(
        "{{ let fc0 = xenos_consts.fetch[{}u]; \
         let base = (fc0 & 0xFFFFFFFCu) >> 2u; \
         let vidx = u32(r[{src_reg}u].x); \
         let addr = base + vidx * 4u; \
         let n = arrayLength(&vertex_buffer); \
         if (addr + 3u < n) {{ \
         r[{dst_reg}u] = vec4<f32>( \
         bitcast<f32>(vertex_buffer[addr + 0u]), \
         bitcast<f32>(vertex_buffer[addr + 1u]), \
         bitcast<f32>(vertex_buffer[addr + 2u]), \
         bitcast<f32>(vertex_buffer[addr + 3u])); \
         }} }}",
        fetch_slot,
    );
    self.push(&statement);
    Ok(())
}
|
||||||
|
|
||||||
|
/// Emits a level-0 sample of the single bound texture.
///
/// v1 behaviour: the UV comes from `r[src].xy` and the result overwrites all
/// of `r[dst]`. The instruction's fetch-constant index and write mask are not
/// consulted; the emitted code always samples `xenos_tex` with `xenos_samp`.
fn emit_tfetch(&mut self, tf: &crate::ucode::fetch::TextureFetch) {
    let (coord_reg, result_reg) = (tf.src_register & 0x7F, tf.dest_register & 0x7F);
    let statement = format!(
        "r[{result_reg}u] = textureSampleLevel(xenos_tex, xenos_samp, r[{coord_reg}u].xy, 0.0);"
    );
    self.push(&statement);
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn vector_expr(op: u8, a: &str, b: &str, c: &str) -> Option<String> {
|
||||||
|
let s = match op {
|
||||||
|
vop::ADD => format!("({a} + {b})"),
|
||||||
|
vop::MUL => format!("({a} * {b})"),
|
||||||
|
vop::MAX => format!("max({a}, {b})"),
|
||||||
|
vop::MIN => format!("min({a}, {b})"),
|
||||||
|
vop::MAD => format!("({a} * {b} + {c})"),
|
||||||
|
vop::DOT4 => format!("vec4<f32>(dot({a}, {b}))"),
|
||||||
|
vop::DOT3 => format!("vec4<f32>(dot({a}.xyz, {b}.xyz))"),
|
||||||
|
vop::DOT2_ADD => format!(
|
||||||
|
"vec4<f32>({a}.x * {b}.x + {a}.y * {b}.y + {c}.x)"
|
||||||
|
),
|
||||||
|
vop::SEQ => format!(
|
||||||
|
"vec4<f32>(select(0.0,1.0,{a}.x=={b}.x), select(0.0,1.0,{a}.y=={b}.y), select(0.0,1.0,{a}.z=={b}.z), select(0.0,1.0,{a}.w=={b}.w))"
|
||||||
|
),
|
||||||
|
vop::SGT => format!(
|
||||||
|
"vec4<f32>(select(0.0,1.0,{a}.x>{b}.x), select(0.0,1.0,{a}.y>{b}.y), select(0.0,1.0,{a}.z>{b}.z), select(0.0,1.0,{a}.w>{b}.w))"
|
||||||
|
),
|
||||||
|
vop::SGE => format!(
|
||||||
|
"vec4<f32>(select(0.0,1.0,{a}.x>={b}.x), select(0.0,1.0,{a}.y>={b}.y), select(0.0,1.0,{a}.z>={b}.z), select(0.0,1.0,{a}.w>={b}.w))"
|
||||||
|
),
|
||||||
|
vop::SNE => format!(
|
||||||
|
"vec4<f32>(select(0.0,1.0,{a}.x!={b}.x), select(0.0,1.0,{a}.y!={b}.y), select(0.0,1.0,{a}.z!={b}.z), select(0.0,1.0,{a}.w!={b}.w))"
|
||||||
|
),
|
||||||
|
vop::FRC => format!("fract({a})"),
|
||||||
|
vop::FLOOR => format!("floor({a})"),
|
||||||
|
_ => return None,
|
||||||
|
};
|
||||||
|
Some(s)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn scalar_expr(op: u8, a: &str, b: &str, prev: &str) -> Option<String> {
|
||||||
|
let s = match op {
|
||||||
|
sop::ADDS => format!("({a} + {b})"),
|
||||||
|
sop::ADDS_PREV => format!("({a} + {prev})"),
|
||||||
|
sop::MULS => format!("({a} * {b})"),
|
||||||
|
sop::MULS_PREV => format!("({a} * {prev})"),
|
||||||
|
sop::MAXS => format!("max({a}, {b})"),
|
||||||
|
sop::MINS => format!("min({a}, {b})"),
|
||||||
|
sop::RCP => format!("xe_rcp({a})"),
|
||||||
|
sop::RETAIN_PREV => prev.to_string(),
|
||||||
|
_ => return None,
|
||||||
|
};
|
||||||
|
Some(s)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;
    use crate::ucode::alu::{sop, vop};
    use crate::ucode::control_flow::ControlFlowInstruction;

    /// Word2 of an ALU triple: the given vector opcode, scalar opcode
    /// RETAIN_PREV, full vector write mask, vector_dest = 0.
    ///
    /// Note: `decode_alu` reads the scalar write mask from bits 8..=11, which
    /// overlap the scalar opcode field, so RETAIN_PREV (50) decodes to a
    /// scalar write mask of 0xC rather than zero.
    fn alu_word2(vector_opcode: u32) -> u32 {
        vector_opcode | ((sop::RETAIN_PREV as u32) << 6) | (0xF << 12)
    }

    /// A terminal, unpredicated Exec clause covering exactly one ALU triple.
    fn single_exec_end() -> ControlFlowInstruction {
        ControlFlowInstruction::Exec {
            address: 0,
            count: 1,
            sequence: 0,
            is_end: true,
            predicated: false,
            predicate_condition: false,
        }
    }

    /// Alloc(Position) followed by one Exec clause running `r0 = r0 + r0`.
    fn synthetic_trivial_shader() -> ParsedShader {
        let alloc = ControlFlowInstruction::Alloc {
            size: 1,
            kind: AllocKind::Position,
        };
        ParsedShader {
            cf: vec![alloc, single_exec_end()],
            instructions: vec![0, 0, alu_word2(vop::ADD as u32)],
        }
    }

    #[test]
    fn trivial_shader_translates() {
        let body = match translate(&synthetic_trivial_shader(), Stage::Vertex) {
            Translation::Ok(body) => body,
            Translation::Reject(r) => panic!("rejected: {r}"),
        };
        for needle in ["fn vs_main", "r[0u] = (r[0u] + r[0u]);", "return out;"] {
            assert!(body.contains(needle));
        }
    }

    #[test]
    fn combined_module_parses_as_wgsl() {
        let shader = synthetic_trivial_shader();
        let vs = match translate(&shader, Stage::Vertex) {
            Translation::Reject(r) => panic!("VS rejected: {r}"),
            Translation::Ok(body) => body,
        };
        let ps = match translate(&shader, Stage::Pixel) {
            Translation::Reject(r) => panic!("PS rejected: {r}"),
            Translation::Ok(body) => body,
        };
        let module = combine_stages(&vs, &ps);
        // naga is a dev-dependency of this crate; a parse failure here means
        // the translator emitted invalid WGSL.
        if let Err(e) = naga::front::wgsl::parse_str(&module) {
            panic!(
                "emitted WGSL failed to parse:\n{}\n--- module ---\n{}",
                e, module
            );
        }
    }

    #[test]
    fn loop_clause_rejected() {
        let loop_start = ControlFlowInstruction::LoopStart {
            address: 0,
            loop_id: 0,
        };
        let shader = ParsedShader {
            cf: vec![loop_start],
            instructions: Vec::new(),
        };
        let outcome = translate(&shader, Stage::Vertex);
        assert!(matches!(outcome, Translation::Reject(reject::CF_LOOP)));
    }

    #[test]
    fn unsupported_op_rejected() {
        // MAX_A (29) is not part of the v1 vector subset.
        let shader = ParsedShader {
            cf: vec![single_exec_end()],
            instructions: vec![0, 0, alu_word2(vop::MAX_A as u32)],
        };
        let outcome = translate(&shader, Stage::Vertex);
        assert!(matches!(
            outcome,
            Translation::Reject(reject::VEC_OP_UNSUPPORTED)
        ));
    }
}
|
||||||
206
crates/xenia-gpu/src/ucode/alu.rs
Normal file
206
crates/xenia-gpu/src/ucode/alu.rs
Normal file
@@ -0,0 +1,206 @@
|
|||||||
|
//! Xenos ALU (vector + scalar) instruction decoder.
|
||||||
|
//!
|
||||||
|
//! An ALU instruction is 96 bits = 3 dwords. The three dwords encode:
|
||||||
|
//! - word0: operand modifier flags + destination info
|
||||||
|
//! - word1: source register / swizzle fields
|
||||||
|
//! - word2: opcode + write mask + export target
|
||||||
|
//!
|
||||||
|
//! See `ucode.h:900-1400` for the full field map. This decoder captures the
|
||||||
|
//! minimal shape the uber-shader needs; flags we don't interpret yet are
|
||||||
|
//! retained as raw bits in `raw` for downstream inspection.
|
||||||
|
|
||||||
|
/// Decoded ALU instruction.
///
/// Bit positions quoted below are the ones `decode_alu` in this file reads.
/// NOTE(review): they have not been cross-checked against canary's
/// `ucode.h` here — confirm before relying on them for real microcode.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct AluInstruction {
    /// Vector ALU opcode (6 bits, word2 bits 0..=5).
    pub vector_opcode: u8,
    /// Scalar ALU opcode (6 bits, word2 bits 6..=11).
    pub scalar_opcode: u8,
    /// Destination register index for the vector result (7 bits).
    pub vector_dest: u8,
    /// Destination register index for the scalar result (7 bits).
    pub scalar_dest: u8,
    /// 4-bit write mask for the vector result (x/y/z/w).
    pub vector_write_mask: u8,
    /// 4-bit write mask for the scalar result.
    pub scalar_write_mask: u8,
    /// Set when the vector result should go to the export bank (position,
    /// interpolators, color, etc.) instead of the general register file.
    pub vector_dest_is_export: bool,
    /// Selects `ps` (previous scalar result) as the scalar operand when set.
    pub scalar_src_is_ps: bool,
    /// First source register index.
    pub src_a: u8,
    /// Second source register index.
    pub src_b: u8,
    /// Third source register index (only used by three-operand vector ops).
    pub src_c: u8,
    /// Set when the instruction is predicated; it is meant to be skipped if
    /// the predicate doesn't match `predicate_condition`.
    pub predicated: bool,
    /// Predicate value under which a predicated instruction executes.
    pub predicate_condition: bool,
    /// Raw dwords, preserved verbatim so the translator / interpreter can
    /// reach into fields that are not parsed explicitly yet.
    pub raw: [u32; 3],
}
|
||||||
|
|
||||||
|
/// Decode a 3-dword ALU triple.
|
||||||
|
pub fn decode_alu(words: [u32; 3]) -> AluInstruction {
|
||||||
|
let w0 = words[0];
|
||||||
|
let _w1 = words[1];
|
||||||
|
let w2 = words[2];
|
||||||
|
AluInstruction {
|
||||||
|
vector_opcode: (w2 & 0x3F) as u8,
|
||||||
|
scalar_opcode: ((w2 >> 6) & 0x3F) as u8,
|
||||||
|
vector_dest: ((w2 >> 16) & 0x7F) as u8,
|
||||||
|
scalar_dest: ((w2 >> 24) & 0x7F) as u8,
|
||||||
|
vector_write_mask: ((w2 >> 12) & 0xF) as u8,
|
||||||
|
scalar_write_mask: ((w2 >> 8) & 0xF) as u8,
|
||||||
|
vector_dest_is_export: ((w2 >> 23) & 1) != 0,
|
||||||
|
scalar_src_is_ps: ((w0 >> 26) & 1) != 0,
|
||||||
|
src_a: (w0 & 0xFF) as u8,
|
||||||
|
src_b: ((w0 >> 8) & 0xFF) as u8,
|
||||||
|
src_c: ((w0 >> 16) & 0xFF) as u8,
|
||||||
|
predicated: ((w0 >> 27) & 1) != 0,
|
||||||
|
predicate_condition: ((w0 >> 28) & 1) != 0,
|
||||||
|
raw: words,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Vector ALU opcodes we reference by name. Values match canary's
/// `AluVectorOpcode` enum in `ucode.h:1354`.
pub mod vop {
    // Arithmetic and min/max.
    pub const ADD: u8 = 0;
    pub const MUL: u8 = 1;
    pub const MAX: u8 = 2;
    pub const MIN: u8 = 3;

    // Per-component "set on compare".
    pub const SEQ: u8 = 4;
    pub const SGT: u8 = 5;
    pub const SGE: u8 = 6;
    pub const SNE: u8 = 7;

    // Rounding and multiply-add.
    pub const FRC: u8 = 8;
    pub const TRUNC: u8 = 9;
    pub const FLOOR: u8 = 10;
    pub const MAD: u8 = 11;

    // Conditional selects.
    pub const CND_EQ: u8 = 12;
    pub const CND_GE: u8 = 13;
    pub const CND_GT: u8 = 14;

    // Dot products and related.
    pub const DOT4: u8 = 15;
    pub const DOT3: u8 = 16;
    pub const DOT2_ADD: u8 = 17;
    pub const CUBE: u8 = 18;
    pub const MAX4: u8 = 19;

    // Predicate-setting compares.
    pub const SETP_EQ_PUSH: u8 = 20;
    pub const SETP_NE_PUSH: u8 = 21;
    pub const SETP_GT_PUSH: u8 = 22;
    pub const SETP_GE_PUSH: u8 = 23;

    // Pixel kill compares.
    pub const KILL_EQ: u8 = 24;
    pub const KILL_GT: u8 = 25;
    pub const KILL_GE: u8 = 26;
    pub const KILL_NE: u8 = 27;

    pub const DST: u8 = 28;
    pub const MAX_A: u8 = 29;
}
|
||||||
|
|
||||||
|
/// Scalar ALU opcodes. Values match canary's `AluScalarOpcode` enum in
/// `ucode.h:1001`.
pub mod sop {
    // Arithmetic; `_PREV` variants combine with the previous scalar result.
    pub const ADDS: u8 = 0;
    pub const ADDS_PREV: u8 = 1;
    pub const MULS: u8 = 2;
    pub const MULS_PREV: u8 = 3;
    pub const MULS_PREV2: u8 = 4;
    pub const MAXS: u8 = 5;
    pub const MINS: u8 = 6;

    // "Set on compare".
    pub const SEQS: u8 = 7;
    pub const SGTS: u8 = 8;
    pub const SGES: u8 = 9;
    pub const SNES: u8 = 10;

    // Rounding.
    pub const FRCS: u8 = 11;
    pub const TRUNCS: u8 = 12;
    pub const FLOORS: u8 = 13;

    // Exponential, logarithm, reciprocal and reciprocal square root.
    pub const EXP: u8 = 14;
    pub const LOGC: u8 = 15;
    pub const LOG: u8 = 16;
    pub const RCPC: u8 = 17;
    pub const RCPF: u8 = 18;
    pub const RCP: u8 = 19;
    pub const RSQC: u8 = 20;
    pub const RSQF: u8 = 21;
    pub const RSQ: u8 = 22;

    pub const MAXAS: u8 = 23;
    pub const MAXASF: u8 = 24;
    pub const SUBS: u8 = 25;
    pub const SUBS_PREV: u8 = 26;

    // Predicate manipulation.
    pub const SETP_EQ: u8 = 27;
    pub const SETP_NE: u8 = 28;
    pub const SETP_GT: u8 = 29;
    pub const SETP_GE: u8 = 30;
    pub const SETP_INV: u8 = 31;
    pub const SETP_POP: u8 = 32;
    pub const SETP_CLR: u8 = 33;
    pub const SETP_RSTR: u8 = 34;

    // Pixel kill.
    pub const KILLS_EQ: u8 = 35;
    pub const KILLS_GT: u8 = 36;
    pub const KILLS_GE: u8 = 37;
    pub const KILLS_NE: u8 = 38;
    pub const KILLS_ONE: u8 = 39;

    pub const SQRT: u8 = 40;
    // Value 41 has no named constant in this table.
    pub const MULSC0: u8 = 42;
    pub const MULSC1: u8 = 43;
    pub const ADDSC0: u8 = 44;
    pub const ADDSC1: u8 = 45;
    pub const SUBSC0: u8 = 46;
    pub const SUBSC1: u8 = 47;
    pub const SIN: u8 = 48;
    pub const COS: u8 = 49;
    pub const RETAIN_PREV: u8 = 50;
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Regression: our table previously drifted from canary's values (e.g.
    /// `MAXS=6` when canary says 5, shifting everything through SQRT). Pin
    /// the most-often-used scalar + vector opcodes here.
    #[test]
    fn opcodes_match_canary_values() {
        let scalar_pins = [
            (sop::MAXS, 5),
            (sop::MINS, 6),
            (sop::SEQS, 7),
            (sop::EXP, 14),
            (sop::LOG, 16),
            (sop::RCP, 19),
            (sop::RSQ, 22),
            (sop::SUBS, 25),
            (sop::SETP_EQ, 27),
            (sop::KILLS_EQ, 35),
            (sop::SQRT, 40),
            (sop::SIN, 48),
            (sop::RETAIN_PREV, 50),
        ];
        let vector_pins = [
            (vop::SNE, 7),
            (vop::CND_EQ, 12),
            (vop::MAX4, 19),
            (vop::KILL_EQ, 24),
            (vop::DST, 28),
        ];
        for (actual, expected) in scalar_pins.iter().chain(vector_pins.iter()) {
            assert_eq!(actual, expected);
        }
    }

    #[test]
    fn decode_extracts_opcodes_and_dests() {
        // Minimal ALU word2:
        //   vector_opcode = ADD (0), scalar_opcode = RCP (19),
        //   vector_write_mask = 0xF, vector_dest = 3, scalar_dest = 7.
        let vector_opcode = vop::ADD as u32;
        let scalar_opcode = (sop::RCP as u32) << 6;
        let vector_write_mask = 0xF << 12;
        let vector_dest = 3u32 << 16;
        let scalar_dest = 7u32 << 24;
        let w2 = vector_opcode | scalar_opcode | vector_write_mask | vector_dest | scalar_dest;

        let decoded = decode_alu([0, 0, w2]);
        assert_eq!(decoded.vector_opcode, vop::ADD);
        assert_eq!(decoded.scalar_opcode, sop::RCP);
        assert_eq!(decoded.vector_dest, 3);
        assert_eq!(decoded.scalar_dest, 7);
        assert_eq!(decoded.vector_write_mask, 0xF);
    }
}
|
||||||
173
crates/xenia-gpu/src/ucode/control_flow.rs
Normal file
173
crates/xenia-gpu/src/ucode/control_flow.rs
Normal file
@@ -0,0 +1,173 @@
|
|||||||
|
//! Xenos control-flow clause decoder.
|
||||||
|
//!
|
||||||
|
//! A shader's CF block is a sequence of 48-bit clauses packed two-per-
|
||||||
|
//! three-dword row. Each clause encodes an opcode and type-specific fields
|
||||||
|
//! (exec addr/count, loop start/end, branch target, etc.).
|
||||||
|
//!
|
||||||
|
//! Spec at `xenia-canary/src/xenia/gpu/ucode.h:87-256`. We cover the subset
|
||||||
|
//! the uber-shader needs: `Exec*`, `Loop*`, `Alloc`, `Jmp`, `Call/Ret`,
|
||||||
|
//! `Exit`. Unknown opcodes are classified as `Unknown { opcode }` so the
|
||||||
|
//! translator can log + degrade.
|
||||||
|
|
||||||
|
/// Parsed representation of one CF clause.
|
||||||
|
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||||
|
pub enum ControlFlowInstruction {
|
||||||
|
/// `kExec` / `kExecEnd` — execute a range of ALU/fetch instructions.
|
||||||
|
Exec {
|
||||||
|
/// Instruction-block dword index where this clause's instructions start,
|
||||||
|
/// expressed in **triple units** (each inst = 3 dwords).
|
||||||
|
address: u32,
|
||||||
|
/// Number of triples to execute.
|
||||||
|
count: u32,
|
||||||
|
/// The ALU-vs-fetch sequence bitmap (2 bits per instruction).
|
||||||
|
sequence: u32,
|
||||||
|
/// True when this clause ends the shader.
|
||||||
|
is_end: bool,
|
||||||
|
/// True if predicated; skip when predicate != predicate_condition.
|
||||||
|
predicated: bool,
|
||||||
|
predicate_condition: bool,
|
||||||
|
},
|
||||||
|
/// `kLoopStart` — begin a `aL` loop referencing a loop constant.
|
||||||
|
LoopStart { address: u32, loop_id: u32 },
|
||||||
|
/// `kLoopEnd` — close the loop; `address` points at the matching start.
|
||||||
|
LoopEnd { address: u32, loop_id: u32 },
|
||||||
|
/// `kCondJmp` — conditional jump to another CF index.
|
||||||
|
CondJmp {
|
||||||
|
target: u32,
|
||||||
|
predicated: bool,
|
||||||
|
predicate_condition: bool,
|
||||||
|
},
|
||||||
|
/// `kCondCall` — call into another CF subroutine.
|
||||||
|
CondCall { target: u32 },
|
||||||
|
/// `kReturn` — return from subroutine.
|
||||||
|
Return,
|
||||||
|
/// `kAlloc` — pre-allocate export registers (position, interpolators, colors).
|
||||||
|
Alloc { size: u32, kind: AllocKind },
|
||||||
|
/// Exit the shader (terminal).
|
||||||
|
Exit,
|
||||||
|
/// Unknown / unhandled opcode.
|
||||||
|
Unknown { opcode: u8 },
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Export target types for `kAlloc` clauses.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum AllocKind {
    Position,
    Interpolators,
    Colors,
    Memexport,
    Other,
}

impl AllocKind {
    /// Maps the low three bits of `b` to an alloc kind: 0..=3 name the four
    /// known kinds in declaration order, 4..=7 collapse to `Other`.
    ///
    /// NOTE(review): the numbering is this decoder's own — confirm it against
    /// canary's alloc-type enum before feeding it real microcode.
    fn from_bits(b: u32) -> Self {
        const KNOWN: [AllocKind; 4] = [
            AllocKind::Position,
            AllocKind::Interpolators,
            AllocKind::Colors,
            AllocKind::Memexport,
        ];
        let index = (b & 0x7) as usize;
        KNOWN.get(index).copied().unwrap_or(AllocKind::Other)
    }
}
|
||||||
|
|
||||||
|
/// Decode one row (three consecutive CF dwords) into two CF clauses.
|
||||||
|
///
|
||||||
|
/// Word layout per canary (`ucode.h:218-256`):
|
||||||
|
/// - word0 + lo16(word1) → CF_A's 48-bit payload
|
||||||
|
/// - hi16(word1) + word2 → CF_B's 48-bit payload
|
||||||
|
///
|
||||||
|
/// The opcode lives in the top 4 bits of the 48-bit payload (= bits 44..47).
|
||||||
|
pub fn decode_cf_pair(word0: u32, word1: u32, word2: u32) -> (ControlFlowInstruction, ControlFlowInstruction) {
|
||||||
|
// Build each 48-bit value as u64; LE within the clause.
|
||||||
|
let a = (word0 as u64) | ((word1 as u64 & 0xFFFF) << 32);
|
||||||
|
let b = ((word1 as u64 >> 16) & 0xFFFF) | ((word2 as u64) << 16);
|
||||||
|
(decode_single(a), decode_single(b))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn decode_single(payload: u64) -> ControlFlowInstruction {
|
||||||
|
// Top 4 bits of the 48-bit payload.
|
||||||
|
let opcode = ((payload >> 44) & 0xF) as u8;
|
||||||
|
// Predicate bit + condition live at the 28..30 range for exec/jmp. Rough
|
||||||
|
// extraction — good enough for the interpreter, which logs unknowns.
|
||||||
|
let predicated = ((payload >> 28) & 1) != 0;
|
||||||
|
let predicate_condition = ((payload >> 29) & 1) != 0;
|
||||||
|
|
||||||
|
match opcode {
|
||||||
|
0 => ControlFlowInstruction::Exec {
|
||||||
|
address: (payload & 0xFFF) as u32,
|
||||||
|
count: ((payload >> 12) & 0x7) as u32,
|
||||||
|
sequence: ((payload >> 16) & 0xFFF) as u32,
|
||||||
|
is_end: false,
|
||||||
|
predicated,
|
||||||
|
predicate_condition,
|
||||||
|
},
|
||||||
|
1 => ControlFlowInstruction::Exit,
|
||||||
|
2 => ControlFlowInstruction::Exec {
|
||||||
|
address: (payload & 0xFFF) as u32,
|
||||||
|
count: ((payload >> 12) & 0x7) as u32,
|
||||||
|
sequence: ((payload >> 16) & 0xFFF) as u32,
|
||||||
|
is_end: true,
|
||||||
|
predicated,
|
||||||
|
predicate_condition,
|
||||||
|
},
|
||||||
|
6 => ControlFlowInstruction::LoopStart {
|
||||||
|
address: (payload & 0x3FF) as u32,
|
||||||
|
loop_id: ((payload >> 16) & 0x1F) as u32,
|
||||||
|
},
|
||||||
|
7 => ControlFlowInstruction::LoopEnd {
|
||||||
|
address: (payload & 0x3FF) as u32,
|
||||||
|
loop_id: ((payload >> 16) & 0x1F) as u32,
|
||||||
|
},
|
||||||
|
8 => ControlFlowInstruction::CondCall {
|
||||||
|
target: (payload & 0x3FF) as u32,
|
||||||
|
},
|
||||||
|
9 => ControlFlowInstruction::Return,
|
||||||
|
10 => ControlFlowInstruction::CondJmp {
|
||||||
|
target: (payload & 0x3FF) as u32,
|
||||||
|
predicated,
|
||||||
|
predicate_condition,
|
||||||
|
},
|
||||||
|
12 => ControlFlowInstruction::Alloc {
|
||||||
|
size: (payload & 0x7) as u32,
|
||||||
|
kind: AllocKind::from_bits(((payload >> 4) & 0x7) as u32),
|
||||||
|
},
|
||||||
|
other => ControlFlowInstruction::Unknown { opcode: other },
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Splits a 48-bit clause-A payload into (`word0`, low 16 bits of `word1`)
    /// and decodes it as the first clause of a row whose clause B is zero.
    fn decode_as_clause_a(payload: u64) -> ControlFlowInstruction {
        let low32 = (payload & 0xFFFF_FFFF) as u32;
        let high16 = ((payload >> 32) & 0xFFFF) as u32;
        decode_cf_pair(low32, high16, 0).0
    }

    #[test]
    fn opcode_exit_decodes() {
        // Opcode 1 (Exit) in bits 44..47 of A's 48-bit payload.
        let decoded = decode_as_clause_a(1u64 << 44);
        assert_eq!(decoded, ControlFlowInstruction::Exit);
    }

    #[test]
    fn opcode_exec_end_carries_address_count() {
        // Opcode 2 (ExecEnd), address=4, count=2, sequence=0.
        let payload = (2u64 << 44) | (2u64 << 12) | 4;
        match decode_as_clause_a(payload) {
            ControlFlowInstruction::Exec {
                address,
                count,
                is_end,
                ..
            } => {
                assert_eq!(address, 4);
                assert_eq!(count, 2);
                assert!(is_end);
            }
            other => panic!("expected Exec, got {other:?}"),
        }
    }
}
|
||||||
117
crates/xenia-gpu/src/ucode/fetch.rs
Normal file
117
crates/xenia-gpu/src/ucode/fetch.rs
Normal file
@@ -0,0 +1,117 @@
|
|||||||
|
//! Xenos fetch (vertex + texture) instruction decoder.
|
||||||
|
//!
|
||||||
|
//! Like ALU instructions, fetches are 96 bits (3 dwords). The opcode lives
|
||||||
|
//! in the low 5 bits of word0. We split them into `VertexFetch` and
|
||||||
|
//! `TextureFetch` structurally because their operand layouts differ.
|
||||||
|
//!
|
||||||
|
//! Reference: `xenia-canary/src/xenia/gpu/ucode.h:690-877`.
|
||||||
|
|
||||||
|
/// Decoded fetch instruction.
|
||||||
|
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||||
|
pub enum FetchInstruction {
|
||||||
|
Vertex(VertexFetch),
|
||||||
|
Texture(TextureFetch),
|
||||||
|
/// Unknown / minor variants we don't model yet.
|
||||||
|
Unknown { opcode: u8, raw: [u32; 3] },
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Decoded vertex fetch instruction.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct VertexFetch {
    /// Vertex fetch constant index. `decode_fetch` fills this from a 5-bit
    /// field, so decoded values lie in 0..=31.
    /// NOTE(review): earlier docs gave the range as 0..=95 — confirm.
    pub fetch_const: u8,
    /// Source register index (vertex index in r#).
    pub src_register: u8,
    /// Destination register for the fetched value.
    pub dest_register: u8,
    /// 4-bit write mask.
    pub dest_write_mask: u8,
    /// The three instruction dwords exactly as given to the decoder.
    pub raw: [u32; 3],
}
|
||||||
|
|
||||||
|
/// Decoded texture fetch instruction.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct TextureFetch {
    /// Texture fetch constant index (0..=31).
    pub fetch_const: u8,
    /// Source register index (holds the texture coordinates).
    pub src_register: u8,
    /// Destination register for the sampled value.
    pub dest_register: u8,
    /// 4-bit write mask.
    pub dest_write_mask: u8,
    /// Dimension: 0=1D, 1=2D, 2=3D/stacked, 3=cube.
    pub dimension: u8,
    /// The three instruction dwords exactly as given to the decoder.
    pub raw: [u32; 3],
}
|
||||||
|
|
||||||
|
/// Opcodes (low 5 bits of word0). From `ucode.h`.
pub mod op {
    // The two opcodes `decode_fetch` models structurally.
    pub const VERTEX_FETCH: u8 = 0x00;
    pub const TEXTURE_FETCH: u8 = 0x01;

    // Texture queries.
    pub const GET_TEXTURE_BORDER_COLOR_FRAC: u8 = 0x16;
    pub const GET_TEXTURE_COMPUTED_LOD: u8 = 0x17;
    pub const GET_TEXTURE_WEIGHTS: u8 = 0x18;
    pub const GET_TEXTURE_GRADIENTS: u8 = 0x19;

    // Texture sampling-state setters.
    pub const SET_TEXTURE_LOD: u8 = 0x1A;
    pub const SET_TEXTURE_GRADIENTS_HORZ: u8 = 0x1B;
    pub const SET_TEXTURE_GRADIENTS_VERT: u8 = 0x1C;
}
|
||||||
|
|
||||||
|
pub fn decode_fetch(words: [u32; 3]) -> FetchInstruction {
|
||||||
|
let w0 = words[0];
|
||||||
|
let w1 = words[1];
|
||||||
|
let opcode = (w0 & 0x1F) as u8;
|
||||||
|
match opcode {
|
||||||
|
op::VERTEX_FETCH => FetchInstruction::Vertex(VertexFetch {
|
||||||
|
fetch_const: ((w0 >> 5) & 0x1F) as u8,
|
||||||
|
src_register: ((w0 >> 17) & 0x7F) as u8,
|
||||||
|
dest_register: ((w0 >> 10) & 0x7F) as u8,
|
||||||
|
dest_write_mask: ((w1 >> 23) & 0xF) as u8,
|
||||||
|
raw: words,
|
||||||
|
}),
|
||||||
|
op::TEXTURE_FETCH => FetchInstruction::Texture(TextureFetch {
|
||||||
|
fetch_const: ((w0 >> 5) & 0x1F) as u8,
|
||||||
|
src_register: ((w0 >> 17) & 0x7F) as u8,
|
||||||
|
dest_register: ((w0 >> 10) & 0x7F) as u8,
|
||||||
|
dest_write_mask: ((w1 >> 23) & 0xF) as u8,
|
||||||
|
dimension: ((w1 >> 29) & 0x3) as u8,
|
||||||
|
raw: words,
|
||||||
|
}),
|
||||||
|
_ => FetchInstruction::Unknown { opcode, raw: words },
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Packs word0 of a fetch instruction from its four decoded fields.
    fn fetch_word0(opcode: u32, fetch_const: u32, dest: u32, src: u32) -> u32 {
        opcode | (fetch_const << 5) | (dest << 10) | (src << 17)
    }

    #[test]
    fn decode_vertex_fetch() {
        // opcode=0 (vertex), fetch_const=5, src=2, dest=7.
        let w0 = fetch_word0(0, 5, 7, 2);
        match decode_fetch([w0, 0, 0]) {
            FetchInstruction::Vertex(vf) => {
                assert_eq!(vf.fetch_const, 5);
                assert_eq!(vf.src_register, 2);
                assert_eq!(vf.dest_register, 7);
            }
            other => panic!("expected Vertex, got {other:?}"),
        }
    }

    #[test]
    fn decode_texture_fetch() {
        // opcode=1 (texture), fetch_const=3, dest=4, src=1, dimension=2.
        let w0 = fetch_word0(1, 3, 4, 1);
        let w1 = 2u32 << 29;
        match decode_fetch([w0, w1, 0]) {
            FetchInstruction::Texture(tf) => {
                assert_eq!(tf.fetch_const, 3);
                assert_eq!(tf.dimension, 2);
            }
            other => panic!("expected Texture, got {other:?}"),
        }
    }

    #[test]
    fn unknown_opcode_is_classified() {
        // 0x16 = GET_TEXTURE_BORDER_COLOR_FRAC, which is not modelled.
        let decoded = decode_fetch([0x16, 0, 0]);
        assert!(matches!(
            decoded,
            FetchInstruction::Unknown { opcode: 0x16, .. }
        ));
    }
}
|
||||||
249
crates/xenia-gpu/src/ucode/mod.rs
Normal file
249
crates/xenia-gpu/src/ucode/mod.rs
Normal file
@@ -0,0 +1,249 @@
|
|||||||
|
//! Xenos (ATI R500-family) shader microcode decoder.
|
||||||
|
//!
|
||||||
|
//! Ground truth: `xenia-canary/src/xenia/gpu/ucode.h`. We parse only what a
|
||||||
|
//! shader *interpreter* (P3 uber-shader) needs: control-flow clauses, ALU
|
||||||
|
//! instructions (vector + scalar pipes), and fetch instructions (vertex +
|
||||||
|
//! texture). The uber-shader consumes this IR directly; when a WGSL-emitting
|
||||||
|
//! translator comes online in P7, it reuses the same parser.
|
||||||
|
//!
|
||||||
|
//! ## Binary layout
|
||||||
|
//!
|
||||||
|
//! A compiled shader has two sections back-to-back:
|
||||||
|
//!
|
||||||
|
//! 1. **Control-flow block** — `cf_count` 48-bit clauses, stored in pairs. Canary packs
|
||||||
|
//! two clauses into three 32-bit words:
|
||||||
|
//! ```text
|
||||||
|
//! word0 word1 word2
|
||||||
|
//! [-CF_A (48)-][-CF_B (48)-]
|
||||||
|
//! ```
|
||||||
|
//! Word 0 is the low 32 of CF_A; word 1's low 16 bits finish CF_A and
|
||||||
|
//! its high 16 bits start CF_B; word 2 holds CF_B's remaining 32 bits.
|
||||||
|
//!
|
||||||
|
//! 2. **Instruction block** — variable-size array of 96-bit ALU / fetch
|
||||||
|
//! instructions. Each control-flow clause of kind `Exec*` references a
|
||||||
|
//! contiguous range of these by `(address, count)`, both in units of one 3-dword instruction.
|
||||||
|
//!
|
||||||
|
//! Guest memory stores the microcode as big-endian dwords (the `raw`
|
||||||
|
//! `&[u32]` slice is already host-endian-corrected by the PM4 executor that
|
||||||
|
//! cached the shader blob). See `ucode.h:218-256` for the exec clause bit
|
||||||
|
//! layout and `:700-877` for the fetch/ALU mix.
|
||||||
|
|
||||||
|
pub mod alu;
|
||||||
|
pub mod control_flow;
|
||||||
|
pub mod fetch;
|
||||||
|
|
||||||
|
use self::alu::AluInstruction;
|
||||||
|
use self::control_flow::{AllocKind, ControlFlowInstruction, decode_cf_pair};
|
||||||
|
use self::fetch::FetchInstruction;
|
||||||
|
|
||||||
|
/// Clause-kind tags written into the first dword of every CF-table triple
/// of the WGSL-facing packed shader. These values must stay in sync with the
/// `CF_KIND_*` constants in `shaders/xenos_interp.wgsl`.
///
/// For exec clauses the packer additionally ORs predicate flags into bits 8
/// and up of the same dword, so consumers should mask with `0xFF` before
/// comparing against these tags.
pub mod cf_kind {
    /// `Exec` clause with `is_end == false`.
    pub const EXEC: u32 = 0;
    /// `Exec` clause with `is_end == true`.
    pub const EXEC_END: u32 = 1;
    /// `Alloc` clause.
    pub const ALLOC: u32 = 2;
    /// `Exit` clause.
    pub const EXIT: u32 = 3;
    /// `LoopStart` clause.
    pub const LOOP_START: u32 = 4;
    /// `LoopEnd` clause.
    pub const LOOP_END: u32 = 5;
    /// `CondJmp` clause.
    pub const COND_JMP: u32 = 6;
    /// `CondCall` clause.
    pub const COND_CALL: u32 = 7;
    /// `Return` clause.
    pub const RETURN: u32 = 8;
    /// Clause the decoder could not classify.
    pub const UNKNOWN: u32 = 15;
}
|
||||||
|
|
||||||
|
/// Allocation-kind tags. For an `Alloc` clause the packer stores one of
/// these in the clause's `primary` dword (the allocation size goes into
/// `aux`).
pub mod cf_alloc_kind {
    /// Maps from `AllocKind::Position`.
    pub const POSITION: u32 = 0;
    /// Maps from `AllocKind::Interpolators`.
    pub const INTERPOLATORS: u32 = 1;
    /// Maps from `AllocKind::Colors`.
    pub const COLORS: u32 = 2;
    /// Maps from `AllocKind::Memexport`.
    pub const MEMEXPORT: u32 = 3;
    /// Maps from `AllocKind::Other`.
    pub const OTHER: u32 = 4;
}
|
||||||
|
|
||||||
|
/// Pack a [`ParsedShader`] into the dense dword layout the WGSL runtime
|
||||||
|
/// interpreter expects:
|
||||||
|
///
|
||||||
|
/// ```text
|
||||||
|
/// [0] cf_count
|
||||||
|
/// [1 .. 1 + cf_count*3] CF table: (kind, primary, aux) triples per clause
|
||||||
|
/// [1 + cf_count*3 ..] raw 3-dword instruction stream (ALU/fetch)
|
||||||
|
/// ```
|
||||||
|
///
|
||||||
|
/// The CF table lets WGSL walk clauses without reconstructing bit-packed
|
||||||
|
/// layouts on the GPU. Semantics per `kind`:
|
||||||
|
///
|
||||||
|
/// | kind | primary | aux |
|
||||||
|
/// |-------------|----------------------------|------------------------------|
|
||||||
|
/// | EXEC/EXEC_END | address (in triples) | (sequence<<8) \| count |
|
||||||
|
/// | ALLOC | alloc_kind (see cf_alloc_kind) | size |
|
||||||
|
/// | EXIT | 0 | 0 |
|
||||||
|
/// | LOOP_START | address | loop_id |
|
||||||
|
/// | LOOP_END | address | loop_id |
|
||||||
|
/// | COND_JMP | target | predicate flags |
|
||||||
|
/// | COND_CALL | target | 0 |
|
||||||
|
/// | RETURN | 0 | 0 |
|
||||||
|
/// | UNKNOWN | opcode | 0 |
|
||||||
|
pub fn pack_for_wgsl(parsed: &ParsedShader) -> Vec<u32> {
|
||||||
|
let cf_count = parsed.cf.len() as u32;
|
||||||
|
let mut out = Vec::with_capacity(1 + (cf_count as usize) * 3 + parsed.instructions.len());
|
||||||
|
out.push(cf_count);
|
||||||
|
for clause in &parsed.cf {
|
||||||
|
let (kind, primary, aux) = encode_cf(*clause);
|
||||||
|
out.push(kind);
|
||||||
|
out.push(primary);
|
||||||
|
out.push(aux);
|
||||||
|
}
|
||||||
|
out.extend_from_slice(&parsed.instructions);
|
||||||
|
out
|
||||||
|
}
|
||||||
|
|
||||||
|
fn encode_cf(c: ControlFlowInstruction) -> (u32, u32, u32) {
|
||||||
|
use ControlFlowInstruction::*;
|
||||||
|
match c {
|
||||||
|
Exec {
|
||||||
|
address,
|
||||||
|
count,
|
||||||
|
sequence,
|
||||||
|
is_end,
|
||||||
|
predicated,
|
||||||
|
predicate_condition,
|
||||||
|
} => {
|
||||||
|
let pred_bits = (predicated as u32) | ((predicate_condition as u32) << 1);
|
||||||
|
let kind = if is_end { cf_kind::EXEC_END } else { cf_kind::EXEC }
|
||||||
|
| (pred_bits << 8);
|
||||||
|
(kind, address, (sequence << 8) | count)
|
||||||
|
}
|
||||||
|
Alloc { size, kind } => {
|
||||||
|
let akind = match kind {
|
||||||
|
AllocKind::Position => cf_alloc_kind::POSITION,
|
||||||
|
AllocKind::Interpolators => cf_alloc_kind::INTERPOLATORS,
|
||||||
|
AllocKind::Colors => cf_alloc_kind::COLORS,
|
||||||
|
AllocKind::Memexport => cf_alloc_kind::MEMEXPORT,
|
||||||
|
AllocKind::Other => cf_alloc_kind::OTHER,
|
||||||
|
};
|
||||||
|
(cf_kind::ALLOC, akind, size)
|
||||||
|
}
|
||||||
|
Exit => (cf_kind::EXIT, 0, 0),
|
||||||
|
LoopStart { address, loop_id } => (cf_kind::LOOP_START, address, loop_id),
|
||||||
|
LoopEnd { address, loop_id } => (cf_kind::LOOP_END, address, loop_id),
|
||||||
|
CondJmp {
|
||||||
|
target,
|
||||||
|
predicated,
|
||||||
|
predicate_condition,
|
||||||
|
} => {
|
||||||
|
let pred_bits = (predicated as u32) | ((predicate_condition as u32) << 1);
|
||||||
|
(cf_kind::COND_JMP, target, pred_bits)
|
||||||
|
}
|
||||||
|
CondCall { target } => (cf_kind::COND_CALL, target, 0),
|
||||||
|
Return => (cf_kind::RETURN, 0, 0),
|
||||||
|
Unknown { opcode } => (cf_kind::UNKNOWN, opcode as u32, 0),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// One instruction word set from the instruction-block section. Xenos packs
|
||||||
|
/// ALU and fetch instructions identically (96 bits each); the owning exec
|
||||||
|
/// clause's "sequence" bitmap decides which is which.
|
||||||
|
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||||
|
pub enum DecodedInstruction {
|
||||||
|
/// ALU pipe (vector ALU + optional co-issued scalar ALU).
|
||||||
|
Alu(AluInstruction),
|
||||||
|
/// Vertex or texture fetch.
|
||||||
|
Fetch(FetchInstruction),
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Parsed shader: the control-flow clause list + the raw 32-bit instruction
|
||||||
|
/// words. The uber-shader / translator is expected to index into
|
||||||
|
/// `instructions` based on `(clause.address * 3, clause.count * 3)`.
|
||||||
|
#[derive(Debug, Clone, Default)]
|
||||||
|
pub struct ParsedShader {
|
||||||
|
pub cf: Vec<ControlFlowInstruction>,
|
||||||
|
/// Raw instruction dwords. Each 3-dword triple is one ALU or fetch
|
||||||
|
/// instruction; the owning `Exec` clause's `sequence` bitmap picks the
|
||||||
|
/// kind.
|
||||||
|
pub instructions: Vec<u32>,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Decode a shader blob. `raw_dwords` is a host-endian slice of the entire
|
||||||
|
/// microcode buffer (control flow + instructions). Heuristic: CF dword count
|
||||||
|
/// is encoded in the first word's low 12 bits of the last exec clause —
|
||||||
|
/// canary iterates until it hits a clause of kind `Exit`. We do the same.
|
||||||
|
pub fn parse_shader(raw_dwords: &[u32]) -> ParsedShader {
|
||||||
|
let mut cf = Vec::new();
|
||||||
|
// CF clauses are 48-bit (word1 lo 16 + word0 = 48 or so per canary's
|
||||||
|
// layout). Walk pairs of 3 dwords per pair of clauses.
|
||||||
|
let mut i = 0usize;
|
||||||
|
while i + 2 < raw_dwords.len() {
|
||||||
|
let a = decode_cf_pair(raw_dwords[i], raw_dwords[i + 1], raw_dwords[i + 2]);
|
||||||
|
let (first, second) = a;
|
||||||
|
let seen_exit = matches!(
|
||||||
|
first,
|
||||||
|
ControlFlowInstruction::Exit | ControlFlowInstruction::Unknown { .. }
|
||||||
|
) || matches!(
|
||||||
|
second,
|
||||||
|
ControlFlowInstruction::Exit | ControlFlowInstruction::Unknown { .. }
|
||||||
|
);
|
||||||
|
cf.push(first);
|
||||||
|
cf.push(second);
|
||||||
|
i += 3;
|
||||||
|
if seen_exit {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Everything after `i` dwords is the instruction block.
|
||||||
|
let instructions = raw_dwords[i..].to_vec();
|
||||||
|
ParsedShader { cf, instructions }
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn empty_blob_parses_empty() {
        let ParsedShader { cf, instructions } = parse_shader(&[]);
        assert!(cf.is_empty());
        assert!(instructions.is_empty());
    }

    #[test]
    fn pack_for_wgsl_layout_is_correct() {
        // Hand-built shader: one non-terminal exec clause followed by an exit.
        let exec = ControlFlowInstruction::Exec {
            address: 0x10,
            count: 3,
            sequence: 0b1010,
            is_end: false,
            predicated: false,
            predicate_condition: false,
        };
        let parsed = ParsedShader {
            cf: vec![exec, ControlFlowInstruction::Exit],
            instructions: vec![0x1111, 0x2222, 0x3333],
        };

        let packed = pack_for_wgsl(&parsed);

        assert_eq!(packed[0], 2, "cf_count");
        // Clause 0: EXEC, address=0x10, aux = (sequence<<8)|count = 0x0A03.
        assert_eq!(packed[1] & 0xFF, cf_kind::EXEC);
        assert_eq!(packed[2], 0x10);
        assert_eq!(packed[3], (0b1010 << 8) | 3);
        // Clause 1: EXIT.
        assert_eq!(packed[4] & 0xFF, cf_kind::EXIT);
        // The instruction block follows the table: 1 + 2*3 = 7.
        assert_eq!(packed[7..], [0x1111, 0x2222, 0x3333]);
    }

    #[test]
    fn trivial_exit_clause_stops_parsing() {
        // One CF triple whose only non-zero bits are 0x1000 in the upper half
        // of word 1; the decoder is expected to report a terminating clause
        // for it, so parsing must stop after this triple.
        let cf_triple = [0u32, (1u32 << 12) << 16, 0u32];
        let blob = [cf_triple[0], cf_triple[1], cf_triple[2], 0xDEAD_BEEF];

        let parsed = parse_shader(&blob);

        assert!(!parsed.cf.is_empty());
        // The dword after the terminating triple is instruction data.
        assert_eq!(parsed.instructions, vec![0xDEAD_BEEF]);
    }
}
|
||||||
124
crates/xenia-gpu/src/xenos_constants.rs
Normal file
124
crates/xenia-gpu/src/xenos_constants.rs
Normal file
@@ -0,0 +1,124 @@
|
|||||||
|
//! The "Xenos constants" block the WGSL interpreter consumes per draw.
|
||||||
|
//!
|
||||||
|
//! Mirrors the Xenos register-file regions that carry the per-draw constant
|
||||||
|
//! values shaders reference at runtime:
|
||||||
|
//!
|
||||||
|
//! | Region | Base | Count | Size |
|
||||||
|
//! |--------|------|-------|------|
|
||||||
|
//! | ALU | 0x4000 | 512 × vec4<f32> | 8 KB |
|
||||||
|
//! | Fetch | 0x4800 | 256 × u32 | 1 KB |
|
||||||
|
//! | Bool | 0x4900 | 8 × u32 | 32 B |
|
||||||
|
//! | Loop | 0x4908 | 32 × u32 | 128 B |
|
||||||
|
//!
|
||||||
|
//! Total: ~9.2 KB, well under the 64 KB min uniform buffer size on all wgpu
|
||||||
|
//! backends. The `XenosConstantsBlock` is declared `#[repr(C)]` + bytemuck
|
||||||
|
//! `Pod` so it can be `bytemuck::bytes_of()`'d directly into a wgpu uniform
|
||||||
|
//! buffer. The matching WGSL `struct XenosConstants` lives in
|
||||||
|
//! `shaders/xenos_interp.wgsl`.
|
||||||
|
|
||||||
|
use bytemuck::{Pod, Zeroable};
|
||||||
|
|
||||||
|
use crate::register_file::RegisterFile;
|
||||||
|
|
||||||
|
/// Number of ALU constants (one `vec4<f32>` each).
pub const ALU_CONSTANT_COUNT: usize = 512;
/// Number of fetch-constant dwords.
pub const FETCH_CONSTANT_COUNT: usize = 256;
/// Number of bool-constant dwords.
pub const BOOL_CONSTANT_COUNT: usize = 8;
/// Number of loop-constant dwords.
pub const LOOP_CONSTANT_COUNT: usize = 32;

/// Register index of the first ALU constant.
pub const CONST_BASE_ALU: u32 = 0x4000;
/// Register index of the first fetch constant.
pub const CONST_BASE_FETCH: u32 = 0x4800;
/// Register index of the first bool constant.
pub const CONST_BASE_BOOL: u32 = 0x4900;
/// Register index of the first loop constant.
pub const CONST_BASE_LOOP: u32 = 0x4908;
|
||||||
|
|
||||||
|
/// Per-draw constants block uploaded once to the uniform buffer at
|
||||||
|
/// `@group(0) @binding(1)`.
|
||||||
|
#[repr(C)]
|
||||||
|
#[derive(Clone, Copy)]
|
||||||
|
pub struct XenosConstantsBlock {
|
||||||
|
pub alu: [[f32; 4]; ALU_CONSTANT_COUNT],
|
||||||
|
pub fetch: [u32; FETCH_CONSTANT_COUNT],
|
||||||
|
pub bool_consts: [u32; BOOL_CONSTANT_COUNT],
|
||||||
|
pub loop_consts: [u32; LOOP_CONSTANT_COUNT],
|
||||||
|
}
|
||||||
|
|
||||||
|
// SAFETY: all fields are Pod arrays of Pod primitives; `#[repr(C)]` fixes
|
||||||
|
// the layout. `bytemuck` derives `Pod` only when alignment + padding line
|
||||||
|
// up, so manual `unsafe impl` is the right tool here.
|
||||||
|
unsafe impl Zeroable for XenosConstantsBlock {}
|
||||||
|
unsafe impl Pod for XenosConstantsBlock {}
|
||||||
|
|
||||||
|
impl Default for XenosConstantsBlock {
|
||||||
|
fn default() -> Self {
|
||||||
|
Self {
|
||||||
|
alu: [[0.0; 4]; ALU_CONSTANT_COUNT],
|
||||||
|
fetch: [0; FETCH_CONSTANT_COUNT],
|
||||||
|
bool_consts: [0; BOOL_CONSTANT_COUNT],
|
||||||
|
loop_consts: [0; LOOP_CONSTANT_COUNT],
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl XenosConstantsBlock {
|
||||||
|
/// Size in bytes — exposed for tests + wgpu buffer sizing.
|
||||||
|
pub const SIZE: usize = std::mem::size_of::<Self>();
|
||||||
|
|
||||||
|
/// Snapshot the constants from a Xenos `RegisterFile` into a dense,
|
||||||
|
/// host-friendly layout the WGSL interpreter expects. ALU constants
|
||||||
|
/// (vec4 each) are 4 consecutive registers; fetch constants are u32.
|
||||||
|
pub fn snapshot(rf: &RegisterFile) -> Self {
|
||||||
|
let mut out = Self::default();
|
||||||
|
for i in 0..ALU_CONSTANT_COUNT {
|
||||||
|
let base = CONST_BASE_ALU + (i as u32) * 4;
|
||||||
|
out.alu[i] = [
|
||||||
|
f32::from_bits(rf.read(base)),
|
||||||
|
f32::from_bits(rf.read(base + 1)),
|
||||||
|
f32::from_bits(rf.read(base + 2)),
|
||||||
|
f32::from_bits(rf.read(base + 3)),
|
||||||
|
];
|
||||||
|
}
|
||||||
|
for i in 0..FETCH_CONSTANT_COUNT {
|
||||||
|
out.fetch[i] = rf.read(CONST_BASE_FETCH + i as u32);
|
||||||
|
}
|
||||||
|
for i in 0..BOOL_CONSTANT_COUNT {
|
||||||
|
out.bool_consts[i] = rf.read(CONST_BASE_BOOL + i as u32);
|
||||||
|
}
|
||||||
|
for i in 0..LOOP_CONSTANT_COUNT {
|
||||||
|
out.loop_consts[i] = rf.read(CONST_BASE_LOOP + i as u32);
|
||||||
|
}
|
||||||
|
out
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Layout guard: (512·16) + (256·4) + (8·4) + (32·4) =
    /// 8192 + 1024 + 32 + 128 = 9376 bytes. A different value means either a
    /// constant count changed or the compiler inserted padding; both need
    /// attention because the WGSL struct in `xenos_interp.wgsl` depends on
    /// this layout.
    #[test]
    fn xenos_constants_block_size_is_stable() {
        assert_eq!(XenosConstantsBlock::SIZE, 9376);
    }

    #[test]
    fn snapshot_roundtrip_from_register_file() {
        let mut rf = RegisterFile::new();

        // alu[0] = (1.0, 2.0, 3.0, 4.0), one register per lane.
        for (lane, value) in [1.0f32, 2.0, 3.0, 4.0].into_iter().enumerate() {
            rf.write(CONST_BASE_ALU + lane as u32, value.to_bits());
        }
        // One recognisable value in each of the u32 regions.
        rf.write(CONST_BASE_FETCH + 5, 0xDEAD_BEEF);
        rf.write(CONST_BASE_BOOL, 0x1234);
        rf.write(CONST_BASE_LOOP + 3, 0x5678);

        let block = XenosConstantsBlock::snapshot(&rf);

        assert_eq!(block.alu[0], [1.0, 2.0, 3.0, 4.0]);
        assert_eq!(block.fetch[5], 0xDEAD_BEEF);
        assert_eq!(block.bool_consts[0], 0x1234);
        assert_eq!(block.loop_consts[3], 0x5678);
    }
}
|
||||||
Reference in New Issue
Block a user