First real GPU implementation. Ring/PM4 frontend (ring_view,
ring_drain, pm4) drains the command processor; gpu_system owns the
threaded backend (DrainFence RPC + parker/fence helpers from M1) and
the MMIO-mapped register block (mmio_region).
Xenos shader frontend: ucode/{alu,control_flow,fetch,mod}.rs decode
the Xbox 360 microcode, translator.rs lowers it onto the WGSL
xenos_interp interpreter shader (shaders/xenos_interp.wgsl).
shader_metrics.rs counts decode/translate work.
Render state: draw_state, primitive, render_target_cache,
texture_cache, tiled_address (Xenos's swizzled tiled-memory layout),
xenos_constants (register field constants), edram (the 10 MiB EDRAM
model with MSAA), and resolve.rs (TILE_FLUSH copy-out — clear-resolve
plus bitwise-equivalent 32 bpp + 64 bpp paths landed). handle.rs
owns the typed GPU-resource handles the kernel hands out.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
250 lines
9.6 KiB
Rust
250 lines
9.6 KiB
Rust
//! Xenos (ATI R500-family) shader microcode decoder.
|
|
//!
|
|
//! Ground truth: `xenia-canary/src/xenia/gpu/ucode.h`. We parse only what a
|
|
//! shader *interpreter* (P3 uber-shader) needs: control-flow clauses, ALU
|
|
//! instructions (vector + scalar pipes), and fetch instructions (vertex +
|
|
//! texture). The uber-shader consumes this IR directly; when a WGSL-emitting
|
|
//! translator comes online in P7, it reuses the same parser.
|
|
//!
|
|
//! ## Binary layout
|
|
//!
|
|
//! A compiled shader has two sections back-to-back:
|
|
//!
|
|
//! 1. **Control-flow block** — `cf_count` 48-bit clauses, stored in pairs.
//!    Canary packs two clauses into three 32-bit words (96 bits per pair):
//!    ```text
//!    word0       word1       word2
//!    [---CF_A (48)----][---CF_B (48)----]
//!    ```
//!    Word 0 holds the low 32 bits of CF_A; word 1's low 16 bits finish CF_A
//!    and its high 16 bits start CF_B; word 2 holds CF_B's remaining 32 bits.
|
|
//!
|
|
//! 2. **Instruction block** — variable-size array of 96-bit (3-dword) ALU /
//!    fetch instructions. Each control-flow clause of kind `Exec*` references
//!    a contiguous range of these by `(address, count)`, both counted in
//!    instructions; multiply by 3 to obtain the dword offset and length.
|
|
//!
|
|
//! Guest memory holds the microcode as big-endian dwords, but `parse_shader`
//! performs no byte-swapping itself: the `raw_dwords` `&[u32]` slice it is
//! given is already host-endian-corrected by the PM4 executor that cached the
//! shader blob. See `ucode.h:218-256` for the exec clause bit layout and
//! `:700-877` for the fetch/ALU mix.
|
|
|
|
pub mod alu;
|
|
pub mod control_flow;
|
|
pub mod fetch;
|
|
|
|
use self::alu::AluInstruction;
|
|
use self::control_flow::{AllocKind, ControlFlowInstruction, decode_cf_pair};
|
|
use self::fetch::FetchInstruction;
|
|
|
|
/// Clause-kind tags stored in the `kind` slot of every CF-table triple of the
/// packed, WGSL-facing shader.
///
/// The numeric values are shared with the `CF_KIND_*` constants in
/// `shaders/xenos_interp.wgsl`; change both sides together.
pub mod cf_kind {
    /// `Exec` clause with `is_end == false`.
    pub const EXEC: u32 = 0;
    /// `Exec` clause with `is_end == true`.
    pub const EXEC_END: u32 = 1;
    /// `Alloc` clause.
    pub const ALLOC: u32 = 2;
    /// `Exit` clause.
    pub const EXIT: u32 = 3;
    /// `LoopStart` clause.
    pub const LOOP_START: u32 = 4;
    /// `LoopEnd` clause.
    pub const LOOP_END: u32 = 5;
    /// `CondJmp` clause.
    pub const COND_JMP: u32 = 6;
    /// `CondCall` clause.
    pub const COND_CALL: u32 = 7;
    /// `Return` clause.
    pub const RETURN: u32 = 8;
    /// Clause whose opcode the decoder did not recognise.
    pub const UNKNOWN: u32 = 15;
}
|
|
|
|
/// Alloc-kind codes for `Alloc` clauses in the packed shader.
///
/// The code is written to the *primary* dword of the clause's CF-table triple
/// (the aux dword carries the allocation size), matching what `encode_cf`
/// emits and what the `pack_for_wgsl` layout table documents.
pub mod cf_alloc_kind {
    /// `AllocKind::Position`.
    pub const POSITION: u32 = 0;
    /// `AllocKind::Interpolators`.
    pub const INTERPOLATORS: u32 = 1;
    /// `AllocKind::Colors`.
    pub const COLORS: u32 = 2;
    /// `AllocKind::Memexport`.
    pub const MEMEXPORT: u32 = 3;
    /// `AllocKind::Other`.
    pub const OTHER: u32 = 4;
}
|
|
|
|
/// Pack a [`ParsedShader`] into the dense dword layout the WGSL runtime
|
|
/// interpreter expects:
|
|
///
|
|
/// ```text
|
|
/// [0] cf_count
|
|
/// [1 .. 1 + cf_count*3] CF table: (kind, primary, aux) triples per clause
|
|
/// [1 + cf_count*3 ..] raw 3-dword instruction stream (ALU/fetch)
|
|
/// ```
|
|
///
|
|
/// The CF table lets WGSL walk clauses without reconstructing bit-packed
|
|
/// layouts on the GPU. Semantics per `kind`:
|
|
///
|
|
/// | kind | primary | aux |
|
|
/// |-------------|----------------------------|------------------------------|
|
|
/// | EXEC/EXEC_END | address (in triples) | (sequence<<8) \| count |
|
|
/// | ALLOC | alloc_kind (see cf_alloc_kind) | size |
|
|
/// | EXIT | 0 | 0 |
|
|
/// | LOOP_START | address | loop_id |
|
|
/// | LOOP_END | address | loop_id |
|
|
/// | COND_JMP | target | predicate flags |
|
|
/// | COND_CALL | target | 0 |
|
|
/// | RETURN | 0 | 0 |
|
|
/// | UNKNOWN | opcode | 0 |
|
|
pub fn pack_for_wgsl(parsed: &ParsedShader) -> Vec<u32> {
|
|
let cf_count = parsed.cf.len() as u32;
|
|
let mut out = Vec::with_capacity(1 + (cf_count as usize) * 3 + parsed.instructions.len());
|
|
out.push(cf_count);
|
|
for clause in &parsed.cf {
|
|
let (kind, primary, aux) = encode_cf(*clause);
|
|
out.push(kind);
|
|
out.push(primary);
|
|
out.push(aux);
|
|
}
|
|
out.extend_from_slice(&parsed.instructions);
|
|
out
|
|
}
|
|
|
|
fn encode_cf(c: ControlFlowInstruction) -> (u32, u32, u32) {
|
|
use ControlFlowInstruction::*;
|
|
match c {
|
|
Exec {
|
|
address,
|
|
count,
|
|
sequence,
|
|
is_end,
|
|
predicated,
|
|
predicate_condition,
|
|
} => {
|
|
let pred_bits = (predicated as u32) | ((predicate_condition as u32) << 1);
|
|
let kind = if is_end { cf_kind::EXEC_END } else { cf_kind::EXEC }
|
|
| (pred_bits << 8);
|
|
(kind, address, (sequence << 8) | count)
|
|
}
|
|
Alloc { size, kind } => {
|
|
let akind = match kind {
|
|
AllocKind::Position => cf_alloc_kind::POSITION,
|
|
AllocKind::Interpolators => cf_alloc_kind::INTERPOLATORS,
|
|
AllocKind::Colors => cf_alloc_kind::COLORS,
|
|
AllocKind::Memexport => cf_alloc_kind::MEMEXPORT,
|
|
AllocKind::Other => cf_alloc_kind::OTHER,
|
|
};
|
|
(cf_kind::ALLOC, akind, size)
|
|
}
|
|
Exit => (cf_kind::EXIT, 0, 0),
|
|
LoopStart { address, loop_id } => (cf_kind::LOOP_START, address, loop_id),
|
|
LoopEnd { address, loop_id } => (cf_kind::LOOP_END, address, loop_id),
|
|
CondJmp {
|
|
target,
|
|
predicated,
|
|
predicate_condition,
|
|
} => {
|
|
let pred_bits = (predicated as u32) | ((predicate_condition as u32) << 1);
|
|
(cf_kind::COND_JMP, target, pred_bits)
|
|
}
|
|
CondCall { target } => (cf_kind::COND_CALL, target, 0),
|
|
Return => (cf_kind::RETURN, 0, 0),
|
|
Unknown { opcode } => (cf_kind::UNKNOWN, opcode as u32, 0),
|
|
}
|
|
}
|
|
|
|
/// One instruction word set from the instruction-block section. Xenos packs
|
|
/// ALU and fetch instructions identically (96 bits each); the owning exec
|
|
/// clause's "sequence" bitmap decides which is which.
|
|
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
|
pub enum DecodedInstruction {
|
|
/// ALU pipe (vector ALU + optional co-issued scalar ALU).
|
|
Alu(AluInstruction),
|
|
/// Vertex or texture fetch.
|
|
Fetch(FetchInstruction),
|
|
}
|
|
|
|
/// Parsed shader: the control-flow clause list + the raw 32-bit instruction
|
|
/// words. The uber-shader / translator is expected to index into
|
|
/// `instructions` based on `(clause.address * 3, clause.count * 3)`.
|
|
#[derive(Debug, Clone, Default)]
|
|
pub struct ParsedShader {
|
|
pub cf: Vec<ControlFlowInstruction>,
|
|
/// Raw instruction dwords. Each 3-dword triple is one ALU or fetch
|
|
/// instruction; the owning `Exec` clause's `sequence` bitmap picks the
|
|
/// kind.
|
|
pub instructions: Vec<u32>,
|
|
}
|
|
|
|
/// Decode a shader blob. `raw_dwords` is a host-endian slice of the entire
|
|
/// microcode buffer (control flow + instructions). Heuristic: CF dword count
|
|
/// is encoded in the first word's low 12 bits of the last exec clause —
|
|
/// canary iterates until it hits a clause of kind `Exit`. We do the same.
|
|
pub fn parse_shader(raw_dwords: &[u32]) -> ParsedShader {
|
|
let mut cf = Vec::new();
|
|
// CF clauses are 48-bit (word1 lo 16 + word0 = 48 or so per canary's
|
|
// layout). Walk pairs of 3 dwords per pair of clauses.
|
|
let mut i = 0usize;
|
|
while i + 2 < raw_dwords.len() {
|
|
let a = decode_cf_pair(raw_dwords[i], raw_dwords[i + 1], raw_dwords[i + 2]);
|
|
let (first, second) = a;
|
|
let seen_exit = matches!(
|
|
first,
|
|
ControlFlowInstruction::Exit | ControlFlowInstruction::Unknown { .. }
|
|
) || matches!(
|
|
second,
|
|
ControlFlowInstruction::Exit | ControlFlowInstruction::Unknown { .. }
|
|
);
|
|
cf.push(first);
|
|
cf.push(second);
|
|
i += 3;
|
|
if seen_exit {
|
|
break;
|
|
}
|
|
}
|
|
// Everything after `i` dwords is the instruction block.
|
|
let instructions = raw_dwords[i..].to_vec();
|
|
ParsedShader { cf, instructions }
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Clause-pair triple used by the parser tests: 0x1000 in the upper 16
    /// bits of word 1, all other bits zero.
    const EXIT_TRIPLE: [u32; 3] = [0, (1u32 << 12) << 16, 0];

    /// An empty blob yields no clauses and no instruction data.
    #[test]
    fn empty_blob_parses_empty() {
        let p = parse_shader(&[]);
        assert!(p.cf.is_empty());
        assert!(p.instructions.is_empty());
    }

    /// Fewer than three dwords cannot form a clause pair, so the whole input
    /// is passed through as instruction data.
    #[test]
    fn short_blob_is_all_instruction_data() {
        let p = parse_shader(&[0xAAAA, 0xBBBB]);
        assert!(p.cf.is_empty());
        assert_eq!(p.instructions, vec![0xAAAA, 0xBBBB]);
    }

    /// Independent of how clauses decode, the parser must consume whole
    /// triples (two clauses each) and hand back the rest untouched.
    #[test]
    fn parse_partitions_input_into_pairs_and_tail() {
        let mut blob = Vec::new();
        for _ in 0..3 {
            blob.extend_from_slice(&EXIT_TRIPLE);
        }
        blob.extend_from_slice(&[0xDEAD_BEEF, 0xCAFE_F00D]);

        for len in 0..=blob.len() {
            let input = &blob[..len];
            let p = parse_shader(input);
            assert_eq!(p.cf.len() % 2, 0, "clauses come in pairs (len={len})");
            let consumed = p.cf.len() / 2 * 3;
            assert!(consumed <= len, "cannot consume past the input (len={len})");
            assert_eq!(p.instructions.as_slice(), &input[consumed..]);
        }
    }

    /// The packed form is `[cf_count, (kind, primary, aux)*, instructions..]`.
    #[test]
    fn pack_for_wgsl_layout_is_correct() {
        // Build a tiny ParsedShader by hand and verify the packed form.
        let parsed = ParsedShader {
            cf: vec![
                ControlFlowInstruction::Exec {
                    address: 0x10,
                    count: 3,
                    sequence: 0b1010,
                    is_end: false,
                    predicated: false,
                    predicate_condition: false,
                },
                ControlFlowInstruction::Exit,
            ],
            instructions: vec![0x1111, 0x2222, 0x3333],
        };
        let packed = pack_for_wgsl(&parsed);
        assert_eq!(packed.len(), 1 + 2 * 3 + 3, "header + CF table + instructions");
        assert_eq!(packed[0], 2, "cf_count");
        // First clause: EXEC, address=0x10, aux = (sequence<<8)|count = 0x0A03
        assert_eq!(packed[1] & 0xFF, cf_kind::EXEC);
        assert_eq!(packed[2], 0x10);
        assert_eq!(packed[3], (0b1010 << 8) | 3);
        // Second clause: EXIT
        assert_eq!(packed[4] & 0xFF, cf_kind::EXIT);
        // Instruction block starts at 1 + 2*3 = 7
        assert_eq!(packed[7..], [0x1111, 0x2222, 0x3333]);
    }

    /// For `Exec` clauses the predicate bits are OR-ed into the kind dword at
    /// bit 8, leaving the low byte as the plain `cf_kind` code.
    #[test]
    fn exec_predicate_bits_land_above_kind_byte() {
        let parsed = ParsedShader {
            cf: vec![ControlFlowInstruction::Exec {
                address: 0,
                count: 1,
                sequence: 0,
                is_end: true,
                predicated: true,
                predicate_condition: true,
            }],
            instructions: Vec::new(),
        };
        let packed = pack_for_wgsl(&parsed);
        assert_eq!(packed, vec![1, cf_kind::EXEC_END | (0b11 << 8), 0, 1]);
    }

    #[test]
    fn trivial_exit_clause_stops_parsing() {
        // One clause pair with 0x1000 in the upper 16 bits of word 1 (intended
        // to decode as an EXIT clause), followed by one trailing dword.
        // NOTE(review): with only a single complete triple the walk would end
        // here even without a terminator, so this does not by itself prove
        // the early exit; extend it with a second triple once the exact
        // `decode_cf_pair` encoding of EXIT is confirmed.
        let [w0, w1, w2] = EXIT_TRIPLE;
        let p = parse_shader(&[w0, w1, w2, 0xDEAD_BEEF]);
        assert!(!p.cf.is_empty());
        // Remaining dword is instruction data.
        assert_eq!(p.instructions, vec![0xDEAD_BEEF]);
    }
}
|