xenia-gpu: end-to-end Xenos pipeline (PM4, ucode, EDRAM, resolve)
First real GPU implementation. Ring/PM4 frontend (ring_view,
ring_drain, pm4) drains the command processor; gpu_system owns the
threaded backend (DrainFence RPC + parker/fence helpers from M1) and
the MMIO-mapped register block (mmio_region).
Xenos shader frontend: ucode/{alu,control_flow,fetch,mod}.rs decode
the Xbox 360 microcode, translator.rs lowers it onto the WGSL
xenos_interp interpreter shader (shaders/xenos_interp.wgsl).
shader_metrics.rs counts decode/translate work.
Render state: draw_state, primitive, render_target_cache,
texture_cache, tiled_address (Xenos's swizzled tiled-memory layout),
xenos_constants (register field constants), edram (the 10 MiB EDRAM
model with MSAA), and resolve.rs (TILE_FLUSH copy-out — clear-resolve
plus bitwise-equivalent 32 bpp + 64 bpp paths landed). handle.rs
owns the typed GPU-resource handles the kernel hands out.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
206
crates/xenia-gpu/src/ucode/alu.rs
Normal file
206
crates/xenia-gpu/src/ucode/alu.rs
Normal file
@@ -0,0 +1,206 @@
|
||||
//! Xenos ALU (vector + scalar) instruction decoder.
|
||||
//!
|
||||
//! An ALU instruction is 96 bits = 3 dwords. The three dwords encode:
|
||||
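//! - word0: source register indices (a/b/c) + `ps`-select and predicate flags
//! - word1: not interpreted by this decoder yet (kept only in `raw`)
//! - word2: vector/scalar opcodes + write masks + destination registers + export flag
//!   (this is the split `decode_alu` below reads — TODO confirm against `ucode.h`)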
|
||||
//!
|
||||
//! See `ucode.h:900-1400` for the full field map. This decoder captures the
|
||||
//! minimal shape the uber-shader needs; flags we don't interpret yet are
|
||||
//! retained as raw bits in `raw` for downstream inspection.
|
||||
|
||||
/// Decoded ALU instruction, as produced by [`decode_alu`].
///
/// The bit positions quoted on each field are the ones `decode_alu` actually
/// extracts. NOTE(review): they have not been cross-checked against canary's
/// `ucode.h` bitfields here — confirm before relying on them for real shaders.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct AluInstruction {
    /// Vector ALU opcode (6 bits: word2 bits 0..=5). See [`vop`].
    pub vector_opcode: u8,
    /// Scalar ALU opcode (6 bits: word2 bits 6..=11). See [`sop`].
    pub scalar_opcode: u8,
    /// Destination register index for the vector result (7 bits: word2 bits 16..=22).
    pub vector_dest: u8,
    /// Destination register index for the scalar result (7 bits: word2 bits 24..=30).
    pub scalar_dest: u8,
    /// 4-bit write mask for the vector result (x/y/z/w; word2 bits 12..=15).
    pub vector_write_mask: u8,
    /// 4-bit write mask for the scalar result (word2 bits 8..=11).
    ///
    /// NOTE(review): this range overlaps the scalar opcode field (bits 6..=11),
    /// so the two values are not independent as decoded today — verify the
    /// intended position.
    pub scalar_write_mask: u8,
    /// Set when the instruction should write to the export bank (position,
    /// interpolators, color, etc.) instead of the general register file
    /// (word2 bit 23).
    pub vector_dest_is_export: bool,
    /// Selects `ps` (previous scalar result) as the scalar operand when set
    /// (word0 bit 26).
    pub scalar_src_is_ps: bool,
    /// First source register index (word0 bits 0..=7).
    pub src_a: u8,
    /// Second source register index (word0 bits 8..=15).
    pub src_b: u8,
    /// Third source register index (word0 bits 16..=23); vector ops use at
    /// most three sources.
    pub src_c: u8,
    /// Set when the instruction is predicated (word0 bit 27); a predicated
    /// instruction is skipped if the predicate doesn't match
    /// `predicate_condition`.
    pub predicated: bool,
    /// Predicate value required for a predicated instruction to run
    /// (word0 bit 28).
    pub predicate_condition: bool,
    /// Raw dwords — preserved verbatim so the translator / interpreter can
    /// reach into fields that are not parsed explicitly yet (all of word1,
    /// for instance).
    pub raw: [u32; 3],
}
|
||||
|
||||
/// Decode a 3-dword ALU triple.
|
||||
pub fn decode_alu(words: [u32; 3]) -> AluInstruction {
|
||||
let w0 = words[0];
|
||||
let _w1 = words[1];
|
||||
let w2 = words[2];
|
||||
AluInstruction {
|
||||
vector_opcode: (w2 & 0x3F) as u8,
|
||||
scalar_opcode: ((w2 >> 6) & 0x3F) as u8,
|
||||
vector_dest: ((w2 >> 16) & 0x7F) as u8,
|
||||
scalar_dest: ((w2 >> 24) & 0x7F) as u8,
|
||||
vector_write_mask: ((w2 >> 12) & 0xF) as u8,
|
||||
scalar_write_mask: ((w2 >> 8) & 0xF) as u8,
|
||||
vector_dest_is_export: ((w2 >> 23) & 1) != 0,
|
||||
scalar_src_is_ps: ((w0 >> 26) & 1) != 0,
|
||||
src_a: (w0 & 0xFF) as u8,
|
||||
src_b: ((w0 >> 8) & 0xFF) as u8,
|
||||
src_c: ((w0 >> 16) & 0xFF) as u8,
|
||||
predicated: ((w0 >> 27) & 1) != 0,
|
||||
predicate_condition: ((w0 >> 28) & 1) != 0,
|
||||
raw: words,
|
||||
}
|
||||
}
|
||||
|
||||
/// Named vector-ALU opcode values. The numbering mirrors canary's
/// `AluVectorOpcode` enum (`ucode.h:1354`).
pub mod vop {
    // Arithmetic and min/max.
    pub const ADD: u8 = 0;
    pub const MUL: u8 = 1;
    pub const MAX: u8 = 2;
    pub const MIN: u8 = 3;

    // Per-component set-on-compare.
    pub const SEQ: u8 = 4;
    pub const SGT: u8 = 5;
    pub const SGE: u8 = 6;
    pub const SNE: u8 = 7;

    // Rounding family and multiply-add.
    pub const FRC: u8 = 8;
    pub const TRUNC: u8 = 9;
    pub const FLOOR: u8 = 10;
    pub const MAD: u8 = 11;

    // Conditional selects.
    pub const CND_EQ: u8 = 12;
    pub const CND_GE: u8 = 13;
    pub const CND_GT: u8 = 14;

    // Dot products, cube and 4-way max.
    pub const DOT4: u8 = 15;
    pub const DOT3: u8 = 16;
    pub const DOT2_ADD: u8 = 17;
    pub const CUBE: u8 = 18;
    pub const MAX4: u8 = 19;

    // Predicate-setting compares (push variants).
    pub const SETP_EQ_PUSH: u8 = 20;
    pub const SETP_NE_PUSH: u8 = 21;
    pub const SETP_GT_PUSH: u8 = 22;
    pub const SETP_GE_PUSH: u8 = 23;

    // Pixel kills.
    pub const KILL_EQ: u8 = 24;
    pub const KILL_GT: u8 = 25;
    pub const KILL_GE: u8 = 26;
    pub const KILL_NE: u8 = 27;

    // Remaining opcodes.
    pub const DST: u8 = 28;
    pub const MAX_A: u8 = 29;
}
|
||||
|
||||
/// Named scalar-ALU opcode values. The numbering mirrors canary's
/// `AluScalarOpcode` enum (`ucode.h:1001`). Value 41 has no named constant
/// in this table.
pub mod sop {
    // Add / multiply (plain and previous-scalar variants).
    pub const ADDS: u8 = 0;
    pub const ADDS_PREV: u8 = 1;
    pub const MULS: u8 = 2;
    pub const MULS_PREV: u8 = 3;
    pub const MULS_PREV2: u8 = 4;

    // Min/max and set-on-compare.
    pub const MAXS: u8 = 5;
    pub const MINS: u8 = 6;
    pub const SEQS: u8 = 7;
    pub const SGTS: u8 = 8;
    pub const SGES: u8 = 9;
    pub const SNES: u8 = 10;

    // Rounding family.
    pub const FRCS: u8 = 11;
    pub const TRUNCS: u8 = 12;
    pub const FLOORS: u8 = 13;

    // Exponential / logarithm.
    pub const EXP: u8 = 14;
    pub const LOGC: u8 = 15;
    pub const LOG: u8 = 16;

    // Reciprocal and reciprocal square root variants.
    pub const RCPC: u8 = 17;
    pub const RCPF: u8 = 18;
    pub const RCP: u8 = 19;
    pub const RSQC: u8 = 20;
    pub const RSQF: u8 = 21;
    pub const RSQ: u8 = 22;

    // Address-register max and subtraction.
    pub const MAXAS: u8 = 23;
    pub const MAXASF: u8 = 24;
    pub const SUBS: u8 = 25;
    pub const SUBS_PREV: u8 = 26;

    // Predicate manipulation.
    pub const SETP_EQ: u8 = 27;
    pub const SETP_NE: u8 = 28;
    pub const SETP_GT: u8 = 29;
    pub const SETP_GE: u8 = 30;
    pub const SETP_INV: u8 = 31;
    pub const SETP_POP: u8 = 32;
    pub const SETP_CLR: u8 = 33;
    pub const SETP_RSTR: u8 = 34;

    // Pixel kills.
    pub const KILLS_EQ: u8 = 35;
    pub const KILLS_GT: u8 = 36;
    pub const KILLS_GE: u8 = 37;
    pub const KILLS_NE: u8 = 38;
    pub const KILLS_ONE: u8 = 39;

    pub const SQRT: u8 = 40;

    // Constant-operand multiply / add / subtract.
    pub const MULSC0: u8 = 42;
    pub const MULSC1: u8 = 43;
    pub const ADDSC0: u8 = 44;
    pub const ADDSC1: u8 = 45;
    pub const SUBSC0: u8 = 46;
    pub const SUBSC1: u8 = 47;

    // Trigonometry and "keep previous scalar".
    pub const SIN: u8 = 48;
    pub const COS: u8 = 49;
    pub const RETAIN_PREV: u8 = 50;
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Regression: our table previously drifted from canary's values (e.g.
    /// `MAXS=6` when canary says 5, shifting everything through SQRT). Pin
    /// the most-often-used scalar + vector opcodes here.
    #[test]
    fn opcodes_match_canary_values() {
        // Scalar.
        assert_eq!(sop::MAXS, 5);
        assert_eq!(sop::MINS, 6);
        assert_eq!(sop::SEQS, 7);
        assert_eq!(sop::EXP, 14);
        assert_eq!(sop::LOG, 16);
        assert_eq!(sop::RCP, 19);
        assert_eq!(sop::RSQ, 22);
        assert_eq!(sop::SUBS, 25);
        assert_eq!(sop::SETP_EQ, 27);
        assert_eq!(sop::KILLS_EQ, 35);
        assert_eq!(sop::SQRT, 40);
        assert_eq!(sop::SIN, 48);
        assert_eq!(sop::RETAIN_PREV, 50);
        // Vector.
        assert_eq!(vop::SNE, 7);
        assert_eq!(vop::CND_EQ, 12);
        assert_eq!(vop::MAX4, 19);
        assert_eq!(vop::KILL_EQ, 24);
        assert_eq!(vop::DST, 28);
    }

    /// Round-trips the opcode, destination and vector-write-mask fields of
    /// word 2 through `decode_alu`.
    #[test]
    fn decode_extracts_opcodes_and_dests() {
        // Build a minimal ALU word:
        //   vector_opcode = ADD (0), scalar_opcode = RCP (19),
        //   vector_dest = 3, scalar_dest = 7, vector_write_mask = 0xF
        let w2 = (vop::ADD as u32)
            | ((sop::RCP as u32) << 6)
            | (0xF << 12) // vector_write_mask
            | (3u32 << 16) // vector_dest
            | (7u32 << 24); // scalar_dest
        let alu = decode_alu([0, 0, w2]);
        assert_eq!(alu.vector_opcode, vop::ADD);
        assert_eq!(alu.scalar_opcode, sop::RCP);
        assert_eq!(alu.vector_dest, 3);
        assert_eq!(alu.scalar_dest, 7);
        assert_eq!(alu.vector_write_mask, 0xF);
    }
}
|
||||
173
crates/xenia-gpu/src/ucode/control_flow.rs
Normal file
173
crates/xenia-gpu/src/ucode/control_flow.rs
Normal file
@@ -0,0 +1,173 @@
|
||||
//! Xenos control-flow clause decoder.
|
||||
//!
|
||||
//! A shader's CF block is a sequence of 48-bit clauses packed two-per-
|
||||
//! three-dword row. Each clause encodes an opcode and type-specific fields
|
||||
//! (exec addr/count, loop start/end, branch target, etc.).
|
||||
//!
|
||||
//! Spec at `xenia-canary/src/xenia/gpu/ucode.h:87-256`. We cover the subset
|
||||
//! the uber-shader needs: `Exec*`, `Loop*`, `Alloc`, `Jmp`, `Call/Ret`,
|
||||
//! `Exit`. Unknown opcodes are classified as `Unknown { opcode }` so the
|
||||
//! translator can log + degrade.
|
||||
|
||||
/// Parsed representation of one CF clause.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
pub enum ControlFlowInstruction {
|
||||
/// `kExec` / `kExecEnd` — execute a range of ALU/fetch instructions.
|
||||
Exec {
|
||||
/// Instruction-block dword index where this clause's instructions start,
|
||||
/// expressed in **triple units** (each inst = 3 dwords).
|
||||
address: u32,
|
||||
/// Number of triples to execute.
|
||||
count: u32,
|
||||
/// The ALU-vs-fetch sequence bitmap (2 bits per instruction).
|
||||
sequence: u32,
|
||||
/// True when this clause ends the shader.
|
||||
is_end: bool,
|
||||
/// True if predicated; skip when predicate != predicate_condition.
|
||||
predicated: bool,
|
||||
predicate_condition: bool,
|
||||
},
|
||||
/// `kLoopStart` — begin a `aL` loop referencing a loop constant.
|
||||
LoopStart { address: u32, loop_id: u32 },
|
||||
/// `kLoopEnd` — close the loop; `address` points at the matching start.
|
||||
LoopEnd { address: u32, loop_id: u32 },
|
||||
/// `kCondJmp` — conditional jump to another CF index.
|
||||
CondJmp {
|
||||
target: u32,
|
||||
predicated: bool,
|
||||
predicate_condition: bool,
|
||||
},
|
||||
/// `kCondCall` — call into another CF subroutine.
|
||||
CondCall { target: u32 },
|
||||
/// `kReturn` — return from subroutine.
|
||||
Return,
|
||||
/// `kAlloc` — pre-allocate export registers (position, interpolators, colors).
|
||||
Alloc { size: u32, kind: AllocKind },
|
||||
/// Exit the shader (terminal).
|
||||
Exit,
|
||||
/// Unknown / unhandled opcode.
|
||||
Unknown { opcode: u8 },
|
||||
}
|
||||
|
||||
/// Export target selected by a `kAlloc` clause.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum AllocKind {
    Position,
    Interpolators,
    Colors,
    Memexport,
    Other,
}

impl AllocKind {
    /// Map the low three bits of `b` onto an [`AllocKind`].
    ///
    /// Codes 0..=3 name a specific target; every other 3-bit value collapses
    /// to [`AllocKind::Other`]. Bits above the low three are ignored.
    fn from_bits(b: u32) -> Self {
        // Indexed by the 3-bit code; codes past the end have no dedicated variant.
        const NAMED: [AllocKind; 4] = [
            AllocKind::Position,
            AllocKind::Interpolators,
            AllocKind::Colors,
            AllocKind::Memexport,
        ];
        let code = (b & 0x7) as usize;
        NAMED.get(code).copied().unwrap_or(AllocKind::Other)
    }
}
|
||||
|
||||
/// Decode one row (three consecutive CF dwords) into two CF clauses.
|
||||
///
|
||||
/// Word layout per canary (`ucode.h:218-256`):
|
||||
/// - word0 + lo16(word1) → CF_A's 48-bit payload
|
||||
/// - hi16(word1) + word2 → CF_B's 48-bit payload
|
||||
///
|
||||
/// The opcode lives in the top 4 bits of the 48-bit payload (= bits 44..47).
|
||||
pub fn decode_cf_pair(word0: u32, word1: u32, word2: u32) -> (ControlFlowInstruction, ControlFlowInstruction) {
|
||||
// Build each 48-bit value as u64; LE within the clause.
|
||||
let a = (word0 as u64) | ((word1 as u64 & 0xFFFF) << 32);
|
||||
let b = ((word1 as u64 >> 16) & 0xFFFF) | ((word2 as u64) << 16);
|
||||
(decode_single(a), decode_single(b))
|
||||
}
|
||||
|
||||
fn decode_single(payload: u64) -> ControlFlowInstruction {
|
||||
// Top 4 bits of the 48-bit payload.
|
||||
let opcode = ((payload >> 44) & 0xF) as u8;
|
||||
// Predicate bit + condition live at the 28..30 range for exec/jmp. Rough
|
||||
// extraction — good enough for the interpreter, which logs unknowns.
|
||||
let predicated = ((payload >> 28) & 1) != 0;
|
||||
let predicate_condition = ((payload >> 29) & 1) != 0;
|
||||
|
||||
match opcode {
|
||||
0 => ControlFlowInstruction::Exec {
|
||||
address: (payload & 0xFFF) as u32,
|
||||
count: ((payload >> 12) & 0x7) as u32,
|
||||
sequence: ((payload >> 16) & 0xFFF) as u32,
|
||||
is_end: false,
|
||||
predicated,
|
||||
predicate_condition,
|
||||
},
|
||||
1 => ControlFlowInstruction::Exit,
|
||||
2 => ControlFlowInstruction::Exec {
|
||||
address: (payload & 0xFFF) as u32,
|
||||
count: ((payload >> 12) & 0x7) as u32,
|
||||
sequence: ((payload >> 16) & 0xFFF) as u32,
|
||||
is_end: true,
|
||||
predicated,
|
||||
predicate_condition,
|
||||
},
|
||||
6 => ControlFlowInstruction::LoopStart {
|
||||
address: (payload & 0x3FF) as u32,
|
||||
loop_id: ((payload >> 16) & 0x1F) as u32,
|
||||
},
|
||||
7 => ControlFlowInstruction::LoopEnd {
|
||||
address: (payload & 0x3FF) as u32,
|
||||
loop_id: ((payload >> 16) & 0x1F) as u32,
|
||||
},
|
||||
8 => ControlFlowInstruction::CondCall {
|
||||
target: (payload & 0x3FF) as u32,
|
||||
},
|
||||
9 => ControlFlowInstruction::Return,
|
||||
10 => ControlFlowInstruction::CondJmp {
|
||||
target: (payload & 0x3FF) as u32,
|
||||
predicated,
|
||||
predicate_condition,
|
||||
},
|
||||
12 => ControlFlowInstruction::Alloc {
|
||||
size: (payload & 0x7) as u32,
|
||||
kind: AllocKind::from_bits(((payload >> 4) & 0x7) as u32),
|
||||
},
|
||||
other => ControlFlowInstruction::Unknown { opcode: other },
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Split a 48-bit clause-A payload into the `(word0, word1)` arguments of
    /// `decode_cf_pair`: the low 32 bits, then the remaining 16 bits.
    fn clause_a_words(payload: u64) -> (u32, u32) {
        let low32 = (payload & 0xFFFF_FFFF) as u32;
        let high16 = ((payload >> 32) & 0xFFFF) as u32;
        (low32, high16)
    }

    #[test]
    fn opcode_exit_decodes() {
        // opcode 1 (Exit) in bits 44..47 of A's 48-bit payload.
        let (word0, word1) = clause_a_words(1u64 << 44);
        let (first, _) = decode_cf_pair(word0, word1, 0);
        assert_eq!(first, ControlFlowInstruction::Exit);
    }

    #[test]
    fn opcode_exec_end_carries_address_count() {
        // opcode 2 (ExecEnd), address=4, count=2, sequence=0.
        let (word0, word1) = clause_a_words((2u64 << 44) | (2u64 << 12) | 4);
        let (first, _) = decode_cf_pair(word0, word1, 0);
        if let ControlFlowInstruction::Exec {
            address,
            count,
            is_end,
            ..
        } = first
        {
            assert_eq!(address, 4);
            assert_eq!(count, 2);
            assert!(is_end);
        } else {
            panic!("expected Exec, got {first:?}");
        }
    }
}
|
||||
117
crates/xenia-gpu/src/ucode/fetch.rs
Normal file
117
crates/xenia-gpu/src/ucode/fetch.rs
Normal file
@@ -0,0 +1,117 @@
|
||||
//! Xenos fetch (vertex + texture) instruction decoder.
|
||||
//!
|
||||
//! Like ALU instructions, fetches are 96 bits (3 dwords). The opcode lives
|
||||
//! in the low 5 bits of word0. We split them into `VertexFetch` and
|
||||
//! `TextureFetch` structurally because their operand layouts differ.
|
||||
//!
|
||||
//! Reference: `xenia-canary/src/xenia/gpu/ucode.h:690-877`.
|
||||
|
||||
/// Decoded fetch instruction.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
pub enum FetchInstruction {
|
||||
Vertex(VertexFetch),
|
||||
Texture(TextureFetch),
|
||||
/// Unknown / minor variants we don't model yet.
|
||||
Unknown { opcode: u8, raw: [u32; 3] },
|
||||
}
|
||||
|
||||
/// Decoded vertex-fetch instruction.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct VertexFetch {
    /// Vertex fetch constant index.
    ///
    /// `decode_fetch` extracts a 5-bit field (word0 bits 5..=9), so decoded
    /// values are in `0..=31`. NOTE(review): this field was previously
    /// documented as `0..=95`, which a 5-bit field cannot express —
    /// presumably an additional selector field is not decoded yet; confirm
    /// against `ucode.h`.
    pub fetch_const: u8,
    /// Source register index (vertex index in r#); word0 bits 17..=23.
    pub src_register: u8,
    /// Destination register for the fetched value; word0 bits 10..=16.
    pub dest_register: u8,
    /// 4-bit write mask; word1 bits 23..=26.
    pub dest_write_mask: u8,
    /// The three instruction dwords, kept verbatim.
    pub raw: [u32; 3],
}
|
||||
|
||||
/// Decoded texture-fetch instruction.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct TextureFetch {
    /// Texture fetch constant index (0..=31).
    pub fetch_const: u8,
    /// Source register index.
    pub src_register: u8,
    /// Destination register for the fetched value.
    pub dest_register: u8,
    /// 4-bit write mask for the destination.
    pub dest_write_mask: u8,
    /// Dimension: 0=1D, 1=2D, 2=3D/stacked, 3=cube.
    pub dimension: u8,
    /// The three instruction dwords, kept verbatim.
    pub raw: [u32; 3],
}
|
||||
|
||||
/// Fetch opcodes, stored in the low 5 bits of word0. Values from `ucode.h`.
pub mod op {
    // The two fetch kinds this decoder models structurally.
    pub const VERTEX_FETCH: u8 = 0x00;
    pub const TEXTURE_FETCH: u8 = 0x01;

    // Texture queries.
    pub const GET_TEXTURE_BORDER_COLOR_FRAC: u8 = 0x16;
    pub const GET_TEXTURE_COMPUTED_LOD: u8 = 0x17;
    pub const GET_TEXTURE_WEIGHTS: u8 = 0x18;
    pub const GET_TEXTURE_GRADIENTS: u8 = 0x19;

    // Texture state setters.
    pub const SET_TEXTURE_LOD: u8 = 0x1A;
    pub const SET_TEXTURE_GRADIENTS_HORZ: u8 = 0x1B;
    pub const SET_TEXTURE_GRADIENTS_VERT: u8 = 0x1C;
}
|
||||
|
||||
pub fn decode_fetch(words: [u32; 3]) -> FetchInstruction {
|
||||
let w0 = words[0];
|
||||
let w1 = words[1];
|
||||
let opcode = (w0 & 0x1F) as u8;
|
||||
match opcode {
|
||||
op::VERTEX_FETCH => FetchInstruction::Vertex(VertexFetch {
|
||||
fetch_const: ((w0 >> 5) & 0x1F) as u8,
|
||||
src_register: ((w0 >> 17) & 0x7F) as u8,
|
||||
dest_register: ((w0 >> 10) & 0x7F) as u8,
|
||||
dest_write_mask: ((w1 >> 23) & 0xF) as u8,
|
||||
raw: words,
|
||||
}),
|
||||
op::TEXTURE_FETCH => FetchInstruction::Texture(TextureFetch {
|
||||
fetch_const: ((w0 >> 5) & 0x1F) as u8,
|
||||
src_register: ((w0 >> 17) & 0x7F) as u8,
|
||||
dest_register: ((w0 >> 10) & 0x7F) as u8,
|
||||
dest_write_mask: ((w1 >> 23) & 0xF) as u8,
|
||||
dimension: ((w1 >> 29) & 0x3) as u8,
|
||||
raw: words,
|
||||
}),
|
||||
_ => FetchInstruction::Unknown { opcode, raw: words },
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn decode_vertex_fetch() {
        // opcode=0 (vertex), fetch_const=5, src=2, dest=7.
        let w0 = 0u32 | (5 << 5) | (7 << 10) | (2 << 17);
        let decoded = decode_fetch([w0, 0, 0]);
        if let FetchInstruction::Vertex(vf) = decoded {
            assert_eq!(vf.fetch_const, 5);
            assert_eq!(vf.src_register, 2);
            assert_eq!(vf.dest_register, 7);
        } else {
            panic!("expected Vertex, got {decoded:?}");
        }
    }

    #[test]
    fn decode_texture_fetch() {
        // opcode=1 (texture), fetch_const=3, dest=4, src=1; dimension=2 in word1.
        let w0 = 1u32 | (3 << 5) | (4 << 10) | (1 << 17);
        let w1 = 2u32 << 29;
        let decoded = decode_fetch([w0, w1, 0]);
        if let FetchInstruction::Texture(tf) = decoded {
            assert_eq!(tf.fetch_const, 3);
            assert_eq!(tf.dimension, 2);
        } else {
            panic!("expected Texture, got {decoded:?}");
        }
    }

    #[test]
    fn unknown_opcode_is_classified() {
        // 0x16 = GET_TEXTURE_BORDER_COLOR_FRAC, which has no structural variant.
        let decoded = decode_fetch([0x16, 0, 0]);
        assert!(matches!(
            decoded,
            FetchInstruction::Unknown { opcode: 0x16, .. }
        ));
    }
}
|
||||
249
crates/xenia-gpu/src/ucode/mod.rs
Normal file
249
crates/xenia-gpu/src/ucode/mod.rs
Normal file
@@ -0,0 +1,249 @@
|
||||
//! Xenos (ATI R500-family) shader microcode decoder.
|
||||
//!
|
||||
//! Ground truth: `xenia-canary/src/xenia/gpu/ucode.h`. We parse only what a
|
||||
//! shader *interpreter* (P3 uber-shader) needs: control-flow clauses, ALU
|
||||
//! instructions (vector + scalar pipes), and fetch instructions (vertex +
|
||||
//! texture). The uber-shader consumes this IR directly; when a WGSL-emitting
|
||||
//! translator comes online in P7, it reuses the same parser.
|
||||
//!
|
||||
//! ## Binary layout
|
||||
//!
|
||||
//! A compiled shader has two sections back-to-back:
|
||||
//!
|
||||
//! 1. **Control-flow block** — `cf_count` 96-bit clause pairs. Canary packs
|
||||
//! two clauses into three 32-bit words:
|
||||
//! ```text
|
||||
//! word0 word1 word2
|
||||
//! [-CF_A (48)-][-CF_B (48)-]
|
||||
//! ```
|
||||
//! Word 0 is the low 32 of CF_A; word 1's low 16 bits finish CF_A and
|
||||
//! its high 16 bits start CF_B; word 2 holds CF_B's remaining 32 bits.
|
||||
//!
|
||||
//! 2. **Instruction block** — variable-size array of 96-bit ALU / fetch
|
||||
//! instructions. Each control-flow clause of kind `Exec*` references a
|
||||
//!    contiguous range of these by `(address, count)`, both counted in 3-dword instruction units.
|
||||
//!
|
||||
//! We read big-endian dwords straight out of guest memory (the `raw`
|
||||
//! `&[u32]` slice is already host-endian-corrected by the PM4 executor that
|
||||
//! cached the shader blob). See `ucode.h:218-256` for the exec clause bit
|
||||
//! layout and `:700-877` for the fetch/ALU mix.
|
||||
|
||||
pub mod alu;
|
||||
pub mod control_flow;
|
||||
pub mod fetch;
|
||||
|
||||
use self::alu::AluInstruction;
|
||||
use self::control_flow::{AllocKind, ControlFlowInstruction, decode_cf_pair};
|
||||
use self::fetch::FetchInstruction;
|
||||
|
||||
/// Clause-kind codes written into the packed shader consumed by WGSL. Must
/// stay in sync with the `CF_KIND_*` constants in
/// `shaders/xenos_interp.wgsl`.
pub mod cf_kind {
    // Execution and export allocation.
    pub const EXEC: u32 = 0;
    pub const EXEC_END: u32 = 1;
    pub const ALLOC: u32 = 2;
    pub const EXIT: u32 = 3;

    // Loops.
    pub const LOOP_START: u32 = 4;
    pub const LOOP_END: u32 = 5;

    // Jumps and subroutines.
    pub const COND_JMP: u32 = 6;
    pub const COND_CALL: u32 = 7;
    pub const RETURN: u32 = 8;

    // Catch-all for clauses the decoder did not recognise.
    pub const UNKNOWN: u32 = 15;
}
|
||||
|
||||
/// Export-target codes emitted for `Alloc` clauses in the packed shader
/// (one code per `AllocKind` variant).
pub mod cf_alloc_kind {
    pub const POSITION: u32 = 0;
    pub const INTERPOLATORS: u32 = 1;
    pub const COLORS: u32 = 2;
    pub const MEMEXPORT: u32 = 3;
    // Any alloc target without a dedicated code.
    pub const OTHER: u32 = 4;
}
|
||||
|
||||
/// Pack a [`ParsedShader`] into the dense dword layout the WGSL runtime
|
||||
/// interpreter expects:
|
||||
///
|
||||
/// ```text
|
||||
/// [0] cf_count
|
||||
/// [1 .. 1 + cf_count*3] CF table: (kind, primary, aux) triples per clause
|
||||
/// [1 + cf_count*3 ..] raw 3-dword instruction stream (ALU/fetch)
|
||||
/// ```
|
||||
///
|
||||
/// The CF table lets WGSL walk clauses without reconstructing bit-packed
|
||||
/// layouts on the GPU. Semantics per `kind`:
|
||||
///
|
||||
/// | kind | primary | aux |
|
||||
/// |-------------|----------------------------|------------------------------|
|
||||
/// | EXEC/EXEC_END | address (in triples) | (sequence<<8) \| count |
|
||||
/// | ALLOC | alloc_kind (see cf_alloc_kind) | size |
|
||||
/// | EXIT | 0 | 0 |
|
||||
/// | LOOP_START | address | loop_id |
|
||||
/// | LOOP_END | address | loop_id |
|
||||
/// | COND_JMP | target | predicate flags |
|
||||
/// | COND_CALL | target | 0 |
|
||||
/// | RETURN | 0 | 0 |
|
||||
/// | UNKNOWN | opcode | 0 |
|
||||
pub fn pack_for_wgsl(parsed: &ParsedShader) -> Vec<u32> {
|
||||
let cf_count = parsed.cf.len() as u32;
|
||||
let mut out = Vec::with_capacity(1 + (cf_count as usize) * 3 + parsed.instructions.len());
|
||||
out.push(cf_count);
|
||||
for clause in &parsed.cf {
|
||||
let (kind, primary, aux) = encode_cf(*clause);
|
||||
out.push(kind);
|
||||
out.push(primary);
|
||||
out.push(aux);
|
||||
}
|
||||
out.extend_from_slice(&parsed.instructions);
|
||||
out
|
||||
}
|
||||
|
||||
fn encode_cf(c: ControlFlowInstruction) -> (u32, u32, u32) {
|
||||
use ControlFlowInstruction::*;
|
||||
match c {
|
||||
Exec {
|
||||
address,
|
||||
count,
|
||||
sequence,
|
||||
is_end,
|
||||
predicated,
|
||||
predicate_condition,
|
||||
} => {
|
||||
let pred_bits = (predicated as u32) | ((predicate_condition as u32) << 1);
|
||||
let kind = if is_end { cf_kind::EXEC_END } else { cf_kind::EXEC }
|
||||
| (pred_bits << 8);
|
||||
(kind, address, (sequence << 8) | count)
|
||||
}
|
||||
Alloc { size, kind } => {
|
||||
let akind = match kind {
|
||||
AllocKind::Position => cf_alloc_kind::POSITION,
|
||||
AllocKind::Interpolators => cf_alloc_kind::INTERPOLATORS,
|
||||
AllocKind::Colors => cf_alloc_kind::COLORS,
|
||||
AllocKind::Memexport => cf_alloc_kind::MEMEXPORT,
|
||||
AllocKind::Other => cf_alloc_kind::OTHER,
|
||||
};
|
||||
(cf_kind::ALLOC, akind, size)
|
||||
}
|
||||
Exit => (cf_kind::EXIT, 0, 0),
|
||||
LoopStart { address, loop_id } => (cf_kind::LOOP_START, address, loop_id),
|
||||
LoopEnd { address, loop_id } => (cf_kind::LOOP_END, address, loop_id),
|
||||
CondJmp {
|
||||
target,
|
||||
predicated,
|
||||
predicate_condition,
|
||||
} => {
|
||||
let pred_bits = (predicated as u32) | ((predicate_condition as u32) << 1);
|
||||
(cf_kind::COND_JMP, target, pred_bits)
|
||||
}
|
||||
CondCall { target } => (cf_kind::COND_CALL, target, 0),
|
||||
Return => (cf_kind::RETURN, 0, 0),
|
||||
Unknown { opcode } => (cf_kind::UNKNOWN, opcode as u32, 0),
|
||||
}
|
||||
}
|
||||
|
||||
/// One instruction word set from the instruction-block section. Xenos packs
|
||||
/// ALU and fetch instructions identically (96 bits each); the owning exec
|
||||
/// clause's "sequence" bitmap decides which is which.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
pub enum DecodedInstruction {
|
||||
/// ALU pipe (vector ALU + optional co-issued scalar ALU).
|
||||
Alu(AluInstruction),
|
||||
/// Vertex or texture fetch.
|
||||
Fetch(FetchInstruction),
|
||||
}
|
||||
|
||||
/// Parsed shader: the control-flow clause list + the raw 32-bit instruction
|
||||
/// words. The uber-shader / translator is expected to index into
|
||||
/// `instructions` based on `(clause.address * 3, clause.count * 3)`.
|
||||
#[derive(Debug, Clone, Default)]
|
||||
pub struct ParsedShader {
|
||||
pub cf: Vec<ControlFlowInstruction>,
|
||||
/// Raw instruction dwords. Each 3-dword triple is one ALU or fetch
|
||||
/// instruction; the owning `Exec` clause's `sequence` bitmap picks the
|
||||
/// kind.
|
||||
pub instructions: Vec<u32>,
|
||||
}
|
||||
|
||||
/// Decode a shader blob. `raw_dwords` is a host-endian slice of the entire
|
||||
/// microcode buffer (control flow + instructions). Heuristic: CF dword count
|
||||
/// is encoded in the first word's low 12 bits of the last exec clause —
|
||||
/// canary iterates until it hits a clause of kind `Exit`. We do the same.
|
||||
pub fn parse_shader(raw_dwords: &[u32]) -> ParsedShader {
|
||||
let mut cf = Vec::new();
|
||||
// CF clauses are 48-bit (word1 lo 16 + word0 = 48 or so per canary's
|
||||
// layout). Walk pairs of 3 dwords per pair of clauses.
|
||||
let mut i = 0usize;
|
||||
while i + 2 < raw_dwords.len() {
|
||||
let a = decode_cf_pair(raw_dwords[i], raw_dwords[i + 1], raw_dwords[i + 2]);
|
||||
let (first, second) = a;
|
||||
let seen_exit = matches!(
|
||||
first,
|
||||
ControlFlowInstruction::Exit | ControlFlowInstruction::Unknown { .. }
|
||||
) || matches!(
|
||||
second,
|
||||
ControlFlowInstruction::Exit | ControlFlowInstruction::Unknown { .. }
|
||||
);
|
||||
cf.push(first);
|
||||
cf.push(second);
|
||||
i += 3;
|
||||
if seen_exit {
|
||||
break;
|
||||
}
|
||||
}
|
||||
// Everything after `i` dwords is the instruction block.
|
||||
let instructions = raw_dwords[i..].to_vec();
|
||||
ParsedShader { cf, instructions }
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn empty_blob_parses_empty() {
        let p = parse_shader(&[]);
        assert!(p.cf.is_empty());
        assert!(p.instructions.is_empty());
    }

    #[test]
    fn pack_for_wgsl_layout_is_correct() {
        // Build a tiny ParsedShader by hand and verify the packed form.
        let parsed = ParsedShader {
            cf: vec![
                ControlFlowInstruction::Exec {
                    address: 0x10,
                    count: 3,
                    sequence: 0b1010,
                    is_end: false,
                    predicated: false,
                    predicate_condition: false,
                },
                ControlFlowInstruction::Exit,
            ],
            instructions: vec![0x1111, 0x2222, 0x3333],
        };
        let packed = pack_for_wgsl(&parsed);
        assert_eq!(packed[0], 2, "cf_count");
        // First clause: EXEC, address=0x10, aux = (sequence<<8)|count = 0x0A03.
        // The kind dword is masked because EXEC clauses carry predicate flags
        // in bits 8..=9.
        assert_eq!(packed[1] & 0xFF, cf_kind::EXEC);
        assert_eq!(packed[2], 0x10);
        assert_eq!(packed[3], (0b1010 << 8) | 3);
        // Second clause: EXIT
        assert_eq!(packed[4] & 0xFF, cf_kind::EXIT);
        // Instruction block starts at 1 + 2*3 = 7
        assert_eq!(packed[7..], [0x1111, 0x2222, 0x3333]);
    }

    #[test]
    fn trivial_exit_clause_stops_parsing() {
        // One CF row whose clause A is Exit. Clause A's payload is
        // word0 | (lo16(word1) << 32) with the opcode in payload bits 44..47,
        // i.e. bits 12..15 of word1's LOW half.
        let w0 = 0u32; // clause A body
        let w1 = 1u32 << 12; // lo16 = 0x1000 → opcode 1 (Exit) for clause A
        let w2 = 0u32; // clause B decodes as an all-zero Exec
        // Three more dwords follow. Without the early break they would form a
        // second complete CF row and be consumed as control flow.
        let p = parse_shader(&[w0, w1, w2, 0xDEAD_BEEF, 0, 0]);
        assert_eq!(p.cf.len(), 2);
        assert_eq!(p.cf[0], ControlFlowInstruction::Exit);
        // Exit detected → everything after the first row is instruction data.
        assert_eq!(p.instructions, vec![0xDEAD_BEEF, 0, 0]);
    }
}
|
||||
Reference in New Issue
Block a user