xenia-gpu: end-to-end Xenos pipeline (PM4, ucode, EDRAM, resolve)

First real GPU implementation. Ring/PM4 frontend (ring_view,
ring_drain, pm4) drains the command processor; gpu_system owns the
threaded backend (DrainFence RPC + parker/fence helpers from M1) and
the MMIO-mapped register block (mmio_region).

Xenos shader frontend: ucode/{alu,control_flow,fetch,mod}.rs decode
the Xbox 360 microcode, translator.rs lowers it onto the WGSL
xenos_interp interpreter shader (shaders/xenos_interp.wgsl).
shader_metrics.rs counts decode/translate work.

Render state: draw_state, primitive, render_target_cache,
texture_cache, tiled_address (Xenos's swizzled tiled-memory layout),
xenos_constants (register field constants), edram (the 10 MiB EDRAM
model with MSAA), and resolve.rs (TILE_FLUSH copy-out — clear-resolve
plus bitwise-equivalent 32 bpp + 64 bpp paths landed). handle.rs
owns the typed GPU-resource handles the kernel hands out.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
MechaCat02
2026-05-01 16:29:38 +02:00
parent 5f0d6487ea
commit 79eb52c378
24 changed files with 10984 additions and 18 deletions

View File

@@ -0,0 +1,249 @@
//! Xenos (ATI R500-family) shader microcode decoder.
//!
//! Ground truth: `xenia-canary/src/xenia/gpu/ucode.h`. We parse only what a
//! shader *interpreter* (P3 uber-shader) needs: control-flow clauses, ALU
//! instructions (vector + scalar pipes), and fetch instructions (vertex +
//! texture). The uber-shader consumes this IR directly; when a WGSL-emitting
//! translator comes online in P7, it reuses the same parser.
//!
//! ## Binary layout
//!
//! A compiled shader has two sections back-to-back:
//!
//! 1. **Control-flow block** — `cf_count` 48-bit clauses stored as 96-bit
//! pairs. Canary packs the two clauses of a pair into three 32-bit words:
//! ```text
//! word0 word1 word2
//! [-CF_A (48)-][-CF_B (48)-]
//! ```
//! Word 0 is the low 32 of CF_A; word 1's low 16 bits finish CF_A and
//! its high 16 bits start CF_B; word 2 holds CF_B's remaining 32 bits.
//!
//! 2. **Instruction block** — variable-size array of 96-bit ALU / fetch
//! instructions. Each control-flow clause of kind `Exec*` references a
//! contiguous range of these by `(address, count)`, both counted in whole
//! instructions (multiply by 3 to get a dword offset / length).
//!
//! Guest memory stores the microcode as big-endian dwords, but the
//! `raw_dwords: &[u32]` slice passed to [`parse_shader`] has already been
//! converted to host endianness by the PM4 executor that cached the shader
//! blob, so no byte-swapping happens here. See `ucode.h:218-256` for the exec
//! clause bit layout and `:700-877` for the fetch/ALU mix.
pub mod alu;
pub mod control_flow;
pub mod fetch;
use self::alu::AluInstruction;
use self::control_flow::{AllocKind, ControlFlowInstruction, decode_cf_pair};
use self::fetch::FetchInstruction;
/// CF-clause kind codes encoded into the WGSL-facing packed shader. Kept
/// in sync with `shaders/xenos_interp.wgsl`'s `CF_KIND_*` constants.
///
/// `encode_cf` writes one of these into the `kind` dword of every clause.
/// For `EXEC` / `EXEC_END` it also ORs the predicate flags into bits 8..=9
/// of that dword, so mask with `0xFF` before comparing against these codes.
pub mod cf_kind {
/// `Exec` clause with `is_end == false`.
pub const EXEC: u32 = 0;
/// `Exec` clause with `is_end == true`.
pub const EXEC_END: u32 = 1;
/// `Alloc` clause; primary dword is a `cf_alloc_kind` code, aux is the size.
pub const ALLOC: u32 = 2;
/// `Exit` clause; carries no payload.
pub const EXIT: u32 = 3;
/// `LoopStart` clause; primary dword is the address, aux is the loop id.
pub const LOOP_START: u32 = 4;
/// `LoopEnd` clause; primary dword is the address, aux is the loop id.
pub const LOOP_END: u32 = 5;
/// `CondJmp` clause; primary dword is the target, aux holds predicate flags.
pub const COND_JMP: u32 = 6;
/// `CondCall` clause; primary dword is the target.
pub const COND_CALL: u32 = 7;
/// `Return` clause; carries no payload.
pub const RETURN: u32 = 8;
/// `Unknown` clause; primary dword is the raw opcode. Codes 9..=14 are
/// not assigned by this module.
pub const UNKNOWN: u32 = 15;
}
/// Alloc-kind codes. `encode_cf` places one of these in the *primary* dword
/// of an `Alloc` clause (the aux dword carries the allocation size).
pub mod cf_alloc_kind {
/// Encodes `AllocKind::Position`.
pub const POSITION: u32 = 0;
/// Encodes `AllocKind::Interpolators`.
pub const INTERPOLATORS: u32 = 1;
/// Encodes `AllocKind::Colors`.
pub const COLORS: u32 = 2;
/// Encodes `AllocKind::Memexport`.
pub const MEMEXPORT: u32 = 3;
/// Encodes `AllocKind::Other`.
pub const OTHER: u32 = 4;
}
/// Flatten a [`ParsedShader`] into the dense dword buffer consumed by the
/// WGSL runtime interpreter.
///
/// Buffer layout:
///
/// ```text
/// [0]                       cf_count (number of clauses in `parsed.cf`)
/// [1 .. 1 + cf_count*3]     CF table: one (kind, primary, aux) triple per clause
/// [1 + cf_count*3 ..]       `parsed.instructions`, copied verbatim
/// ```
///
/// Pre-decoding the clauses into triples lets the shader walk control flow
/// without unpacking the 48-bit clause encoding on the GPU. Per clause:
///
/// * `EXEC` / `EXEC_END` — `kind` holds the code in its low bits plus the
///   predicate flags in bits 8..=9 (bit 8 = `predicated`, bit 9 =
///   `predicate_condition`); `primary` = address (in instruction triples);
///   `aux` = `(sequence << 8) | count`.
/// * `ALLOC` — `primary` = alloc kind (see [`cf_alloc_kind`]); `aux` = size.
/// * `LOOP_START` / `LOOP_END` — `primary` = address; `aux` = loop id.
/// * `COND_JMP` — `primary` = target; `aux` = predicate flags (bit 0 =
///   `predicated`, bit 1 = `predicate_condition`).
/// * `COND_CALL` — `primary` = target; `aux` = 0.
/// * `EXIT` / `RETURN` — `primary` = 0; `aux` = 0.
/// * `UNKNOWN` — `primary` = raw opcode; `aux` = 0.
pub fn pack_for_wgsl(parsed: &ParsedShader) -> Vec<u32> {
    let clause_count = parsed.cf.len();
    let mut packed = Vec::with_capacity(1 + clause_count * 3 + parsed.instructions.len());

    // Header: clause count, so the shader knows where the CF table ends.
    packed.push(clause_count as u32);

    // CF table: three dwords per clause, in clause order.
    for &clause in &parsed.cf {
        let (kind, primary, aux) = encode_cf(clause);
        packed.extend_from_slice(&[kind, primary, aux]);
    }

    // Instruction stream follows the table unchanged.
    packed.extend_from_slice(&parsed.instructions);
    packed
}
fn encode_cf(c: ControlFlowInstruction) -> (u32, u32, u32) {
use ControlFlowInstruction::*;
match c {
Exec {
address,
count,
sequence,
is_end,
predicated,
predicate_condition,
} => {
let pred_bits = (predicated as u32) | ((predicate_condition as u32) << 1);
let kind = if is_end { cf_kind::EXEC_END } else { cf_kind::EXEC }
| (pred_bits << 8);
(kind, address, (sequence << 8) | count)
}
Alloc { size, kind } => {
let akind = match kind {
AllocKind::Position => cf_alloc_kind::POSITION,
AllocKind::Interpolators => cf_alloc_kind::INTERPOLATORS,
AllocKind::Colors => cf_alloc_kind::COLORS,
AllocKind::Memexport => cf_alloc_kind::MEMEXPORT,
AllocKind::Other => cf_alloc_kind::OTHER,
};
(cf_kind::ALLOC, akind, size)
}
Exit => (cf_kind::EXIT, 0, 0),
LoopStart { address, loop_id } => (cf_kind::LOOP_START, address, loop_id),
LoopEnd { address, loop_id } => (cf_kind::LOOP_END, address, loop_id),
CondJmp {
target,
predicated,
predicate_condition,
} => {
let pred_bits = (predicated as u32) | ((predicate_condition as u32) << 1);
(cf_kind::COND_JMP, target, pred_bits)
}
CondCall { target } => (cf_kind::COND_CALL, target, 0),
Return => (cf_kind::RETURN, 0, 0),
Unknown { opcode } => (cf_kind::UNKNOWN, opcode as u32, 0),
}
}
/// One decoded 96-bit instruction from the instruction-block section. Xenos
/// packs ALU and fetch instructions identically (96 bits each); the owning
/// exec clause's "sequence" bitmap decides which is which.
///
/// `parse_shader` does not produce these: it keeps the instruction block as
/// raw dwords in `ParsedShader::instructions`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DecodedInstruction {
/// ALU pipe (vector ALU + optional co-issued scalar ALU).
Alu(AluInstruction),
/// Vertex or texture fetch.
Fetch(FetchInstruction),
}
/// Parsed shader: the control-flow clause list + the raw 32-bit instruction
/// words. The uber-shader / translator is expected to index into
/// `instructions` based on `(clause.address * 3, clause.count * 3)`.
///
/// NOTE(review): `parse_shader` fills `instructions` with the dwords that
/// follow the control-flow block, so the indexing above is only correct if
/// `address` is relative to the instruction block rather than to the start
/// of the microcode — confirm against the decoder and the consumers.
#[derive(Debug, Clone, Default)]
pub struct ParsedShader {
/// Control-flow clauses in stream order. `parse_shader` pushes both halves
/// of every decoded pair, including the pair that ends the control flow.
pub cf: Vec<ControlFlowInstruction>,
/// Raw instruction dwords. Each 3-dword triple is one ALU or fetch
/// instruction; the owning `Exec` clause's `sequence` bitmap picks the
/// kind.
pub instructions: Vec<u32>,
}
/// Decode a shader blob. `raw_dwords` is a host-endian slice of the entire
/// microcode buffer (control flow + instructions). Heuristic: CF dword count
/// is encoded in the first word's low 12 bits of the last exec clause —
/// canary iterates until it hits a clause of kind `Exit`. We do the same.
pub fn parse_shader(raw_dwords: &[u32]) -> ParsedShader {
let mut cf = Vec::new();
// CF clauses are 48-bit (word1 lo 16 + word0 = 48 or so per canary's
// layout). Walk pairs of 3 dwords per pair of clauses.
let mut i = 0usize;
while i + 2 < raw_dwords.len() {
let a = decode_cf_pair(raw_dwords[i], raw_dwords[i + 1], raw_dwords[i + 2]);
let (first, second) = a;
let seen_exit = matches!(
first,
ControlFlowInstruction::Exit | ControlFlowInstruction::Unknown { .. }
) || matches!(
second,
ControlFlowInstruction::Exit | ControlFlowInstruction::Unknown { .. }
);
cf.push(first);
cf.push(second);
i += 3;
if seen_exit {
break;
}
}
// Everything after `i` dwords is the instruction block.
let instructions = raw_dwords[i..].to_vec();
ParsedShader { cf, instructions }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// An empty blob has neither control flow nor instructions.
    #[test]
    fn empty_blob_parses_empty() {
        let p = parse_shader(&[]);
        assert!(p.cf.is_empty());
        assert!(p.instructions.is_empty());
    }

    /// Fewer than three dwords cannot hold a clause pair, so the parser
    /// decodes nothing and hands the dwords back as instruction data.
    #[test]
    fn short_blob_is_all_instruction_data() {
        let p = parse_shader(&[0xAAAA, 0xBBBB]);
        assert!(p.cf.is_empty());
        assert_eq!(p.instructions, vec![0xAAAA, 0xBBBB]);
    }

    #[test]
    fn pack_for_wgsl_layout_is_correct() {
        // Build a tiny ParsedShader by hand and verify the packed form.
        let parsed = ParsedShader {
            cf: vec![
                ControlFlowInstruction::Exec {
                    address: 0x10,
                    count: 3,
                    sequence: 0b1010,
                    is_end: false,
                    predicated: false,
                    predicate_condition: false,
                },
                ControlFlowInstruction::Exit,
            ],
            instructions: vec![0x1111, 0x2222, 0x3333],
        };
        let packed = pack_for_wgsl(&parsed);
        assert_eq!(packed[0], 2, "cf_count");
        // First clause: EXEC, address=0x10, aux = (sequence<<8)|count = 0x0A03
        assert_eq!(packed[1] & 0xFF, cf_kind::EXEC);
        assert_eq!(packed[2], 0x10);
        assert_eq!(packed[3], (0b1010 << 8) | 3);
        // Second clause: EXIT
        assert_eq!(packed[4] & 0xFF, cf_kind::EXIT);
        // Instruction block starts at 1 + 2*3 = 7
        assert_eq!(packed[7..], [0x1111, 0x2222, 0x3333]);
    }

    /// Exec clauses carry their predicate flags in bits 8..=9 of the kind
    /// dword (bit 8 = predicated, bit 9 = predicate_condition).
    #[test]
    fn pack_for_wgsl_exec_kind_carries_predicate_bits() {
        let parsed = ParsedShader {
            cf: vec![ControlFlowInstruction::Exec {
                address: 0,
                count: 1,
                sequence: 0,
                is_end: true,
                predicated: true,
                predicate_condition: true,
            }],
            instructions: Vec::new(),
        };
        let packed = pack_for_wgsl(&parsed);
        assert_eq!(packed, vec![1, cf_kind::EXEC_END | (0b11 << 8), 0, 1]);
    }

    /// Alloc clauses put the alloc-kind code in the primary dword and the
    /// size in the aux dword.
    #[test]
    fn pack_for_wgsl_alloc_puts_kind_in_primary() {
        let parsed = ParsedShader {
            cf: vec![ControlFlowInstruction::Alloc {
                size: 4,
                kind: AllocKind::Colors,
            }],
            instructions: Vec::new(),
        };
        let packed = pack_for_wgsl(&parsed);
        assert_eq!(packed, vec![1, cf_kind::ALLOC, cf_alloc_kind::COLORS, 4]);
    }

    #[test]
    fn trivial_exit_clause_stops_parsing() {
        // Two clauses: [NOP (kind=0), EXIT (kind=1)] encoded per canary.
        // Exit clause is opcode 1 in the top 4 bits of the upper 16 bits.
        //
        // NOTE(review): the comments here disagree about which clause carries
        // the exit opcode, and the module docs say word 1's high 16 bits
        // belong to CF_B, not CF_A — confirm the encoding against
        // `decode_cf_pair`.
        let w0 = 0u32; // clause A body
        let w1 = (1u32 << 12) << 16; // upper 16 bits = 0x1000 → opcode=1 (EXIT) for clause A
        let w2 = 0u32;
        let p = parse_shader(&[w0, w1, w2, 0xDEAD_BEEF]);
        assert!(!p.cf.is_empty());
        // Exit detected → remaining dword is instruction data.
        //
        // NOTE(review): with a single trailing dword the parse loop cannot
        // start a second iteration, so this assertion also holds when neither
        // clause decodes as Exit/Unknown. Supplying at least three trailing
        // dwords (and asserting `p.cf.len() == 2`) would make the test depend
        // on exit detection; deferred until the encoding above is confirmed.
        assert_eq!(p.instructions, vec![0xDEAD_BEEF]);
    }
}