xenia-gpu: end-to-end Xenos pipeline (PM4, ucode, EDRAM, resolve)

First real GPU implementation. Ring/PM4 frontend (ring_view,
ring_drain, pm4) drains the command processor; gpu_system owns the
threaded backend (DrainFence RPC + parker/fence helpers from M1) and
the MMIO-mapped register block (mmio_region).

Xenos shader frontend: ucode/{alu,control_flow,fetch,mod}.rs decode
the Xbox 360 microcode, translator.rs lowers it onto the WGSL
xenos_interp interpreter shader (shaders/xenos_interp.wgsl).
shader_metrics.rs counts decode/translate work.

Render state: draw_state, primitive, render_target_cache,
texture_cache, tiled_address (Xenos's swizzled tiled-memory layout),
xenos_constants (register field constants), edram (the 10 MiB EDRAM
model with MSAA), and resolve.rs (TILE_FLUSH copy-out — clear-resolve
plus bitwise-equivalent 32 bpp + 64 bpp paths landed). handle.rs
owns the typed GPU-resource handles the kernel hands out.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
MechaCat02
2026-05-01 16:29:38 +02:00
parent 5f0d6487ea
commit 79eb52c378
24 changed files with 10984 additions and 18 deletions

View File

@@ -0,0 +1,206 @@
//! Xenos ALU (vector + scalar) instruction decoder.
//!
//! An ALU instruction is 96 bits = 3 dwords. The three dwords encode:
//! - word0: operand modifier flags + destination info
//! - word1: source register / swizzle fields
//! - word2: opcode + write mask + export target
//!
//! See `ucode.h:900-1400` for the full field map. This decoder captures the
//! minimal shape the uber-shader needs; flags we don't interpret yet are
//! retained as raw bits in `raw` for downstream inspection.
/// One decoded 96-bit Xenos ALU instruction (vector op plus co-issued
/// scalar op).
///
/// Every field is a raw bit slice of the instruction dwords exactly as
/// [`decode_alu`] extracts it; nothing is validated or range-checked.
///
/// NOTE(review): the bit positions documented on the fields are the ones
/// `decode_alu` reads today. `scalar_write_mask` (word2 bits 8..=11) lies
/// inside the `scalar_opcode` slice (word2 bits 6..=11), so at least one of
/// the two is presumably mis-positioned — confirm against the reference
/// layout before relying on either. `raw` is always trustworthy.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct AluInstruction {
    /// Vector ALU opcode: 6-bit slice, word2 bits 0..=5.
    pub vector_opcode: u8,
    /// Scalar ALU opcode: 6-bit slice, word2 bits 6..=11.
    pub scalar_opcode: u8,
    /// Destination register index for the vector result: 7-bit slice,
    /// word2 bits 16..=22.
    pub vector_dest: u8,
    /// Destination register index for the scalar result: 7-bit slice,
    /// word2 bits 24..=30.
    pub scalar_dest: u8,
    /// 4-bit write mask for the vector result (x/y/z/w): word2 bits 12..=15.
    pub vector_write_mask: u8,
    /// 4-bit write mask for the scalar result: word2 bits 8..=11.
    pub scalar_write_mask: u8,
    /// Set (word2 bit 23) when the result goes to the export bank
    /// (position, interpolators, color, etc.) rather than the general
    /// register file.
    pub vector_dest_is_export: bool,
    /// Set (word0 bit 26) when the scalar operand is `ps`, the previous
    /// scalar result.
    pub scalar_src_is_ps: bool,
    /// First source register index: word0 bits 0..=7.
    pub src_a: u8,
    /// Second source register index: word0 bits 8..=15.
    pub src_b: u8,
    /// Third source register index: word0 bits 16..=23.
    pub src_c: u8,
    /// Set (word0 bit 27) when the instruction is predicated; a predicated
    /// instruction is skipped if the predicate doesn't match
    /// `predicate_condition`.
    pub predicated: bool,
    /// Predicate value required for a predicated instruction to run
    /// (word0 bit 28).
    pub predicate_condition: bool,
    /// The three source dwords, untouched, so the translator / interpreter
    /// can reach fields this decoder does not parse (all of word1, for one).
    pub raw: [u32; 3],
}

/// Decode a 3-dword ALU triple into an [`AluInstruction`].
///
/// Pure bit slicing: never fails, never panics, and `words` is copied
/// verbatim into `raw`. Word 1 is not interpreted at all.
pub fn decode_alu(words: [u32; 3]) -> AluInstruction {
    /// `(word >> shift) & mask`, narrowed to a byte (every mask fits in 8 bits).
    fn field(word: u32, shift: u32, mask: u32) -> u8 {
        ((word >> shift) & mask) as u8
    }

    /// Test a single bit of `word`.
    fn flag(word: u32, bit: u32) -> bool {
        (word >> bit) & 1 != 0
    }

    // Word 0 carries the operands and predicate bits, word 2 the opcodes,
    // masks and destinations; word 1 is only preserved through `raw`.
    let [operands, _, ops] = words;

    AluInstruction {
        vector_opcode: field(ops, 0, 0x3F),
        scalar_opcode: field(ops, 6, 0x3F),
        vector_dest: field(ops, 16, 0x7F),
        scalar_dest: field(ops, 24, 0x7F),
        vector_write_mask: field(ops, 12, 0xF),
        // NOTE(review): overlaps the scalar opcode slice (bits 6..=11).
        scalar_write_mask: field(ops, 8, 0xF),
        vector_dest_is_export: flag(ops, 23),
        scalar_src_is_ps: flag(operands, 26),
        src_a: field(operands, 0, 0xFF),
        src_b: field(operands, 8, 0xFF),
        src_c: field(operands, 16, 0xFF),
        predicated: flag(operands, 27),
        predicate_condition: flag(operands, 28),
        raw: words,
    }
}
/// Vector ALU opcodes we reference by name. Values match canary's
/// `AluVectorOpcode` enum in `ucode.h:1354`.
///
/// These are plain `u8` constants (not an enum) so they can be compared
/// directly against `AluInstruction::vector_opcode`. The table is
/// contiguous from 0 through 29; the in-file regression test pins a subset
/// of the values.
pub mod vop {
pub const ADD: u8 = 0;
pub const MUL: u8 = 1;
pub const MAX: u8 = 2;
pub const MIN: u8 = 3;
pub const SEQ: u8 = 4;
pub const SGT: u8 = 5;
pub const SGE: u8 = 6;
pub const SNE: u8 = 7;
pub const FRC: u8 = 8;
pub const TRUNC: u8 = 9;
pub const FLOOR: u8 = 10;
pub const MAD: u8 = 11;
pub const CND_EQ: u8 = 12;
pub const CND_GE: u8 = 13;
pub const CND_GT: u8 = 14;
pub const DOT4: u8 = 15;
pub const DOT3: u8 = 16;
pub const DOT2_ADD: u8 = 17;
pub const CUBE: u8 = 18;
pub const MAX4: u8 = 19;
pub const SETP_EQ_PUSH: u8 = 20;
pub const SETP_NE_PUSH: u8 = 21;
pub const SETP_GT_PUSH: u8 = 22;
pub const SETP_GE_PUSH: u8 = 23;
pub const KILL_EQ: u8 = 24;
pub const KILL_GT: u8 = 25;
pub const KILL_GE: u8 = 26;
pub const KILL_NE: u8 = 27;
pub const DST: u8 = 28;
pub const MAX_A: u8 = 29;
}
/// Scalar ALU opcodes. Values match canary's `AluScalarOpcode` enum in
/// `ucode.h:1001`.
///
/// Plain `u8` constants so they can be compared directly against
/// `AluInstruction::scalar_opcode`. The numbering runs 0 through 50 with a
/// single gap: no constant is defined for 41.
pub mod sop {
pub const ADDS: u8 = 0;
pub const ADDS_PREV: u8 = 1;
pub const MULS: u8 = 2;
pub const MULS_PREV: u8 = 3;
pub const MULS_PREV2: u8 = 4;
pub const MAXS: u8 = 5;
pub const MINS: u8 = 6;
pub const SEQS: u8 = 7;
pub const SGTS: u8 = 8;
pub const SGES: u8 = 9;
pub const SNES: u8 = 10;
pub const FRCS: u8 = 11;
pub const TRUNCS: u8 = 12;
pub const FLOORS: u8 = 13;
pub const EXP: u8 = 14;
pub const LOGC: u8 = 15;
pub const LOG: u8 = 16;
pub const RCPC: u8 = 17;
pub const RCPF: u8 = 18;
pub const RCP: u8 = 19;
pub const RSQC: u8 = 20;
pub const RSQF: u8 = 21;
pub const RSQ: u8 = 22;
pub const MAXAS: u8 = 23;
pub const MAXASF: u8 = 24;
pub const SUBS: u8 = 25;
pub const SUBS_PREV: u8 = 26;
pub const SETP_EQ: u8 = 27;
pub const SETP_NE: u8 = 28;
pub const SETP_GT: u8 = 29;
pub const SETP_GE: u8 = 30;
pub const SETP_INV: u8 = 31;
pub const SETP_POP: u8 = 32;
pub const SETP_CLR: u8 = 33;
pub const SETP_RSTR: u8 = 34;
pub const KILLS_EQ: u8 = 35;
pub const KILLS_GT: u8 = 36;
pub const KILLS_GE: u8 = 37;
pub const KILLS_NE: u8 = 38;
pub const KILLS_ONE: u8 = 39;
pub const SQRT: u8 = 40;
// 41 is intentionally absent from this table.
// NOTE(review): presumably unused/reserved in the reference enum — confirm.
pub const MULSC0: u8 = 42;
pub const MULSC1: u8 = 43;
pub const ADDSC0: u8 = 44;
pub const ADDSC1: u8 = 45;
pub const SUBSC0: u8 = 46;
pub const SUBSC1: u8 = 47;
pub const SIN: u8 = 48;
pub const COS: u8 = 49;
pub const RETAIN_PREV: u8 = 50;
}
#[cfg(test)]
mod tests {
use super::*;
/// Regression: our table previously drifted from canary's values (e.g.
/// `MAXS=6` when canary says 5, shifting everything through SQRT). Pin
/// the most-often-used scalar + vector opcodes here.
#[test]
fn opcodes_match_canary_values() {
// Scalar.
assert_eq!(sop::MAXS, 5);
assert_eq!(sop::MINS, 6);
assert_eq!(sop::SEQS, 7);
assert_eq!(sop::EXP, 14);
assert_eq!(sop::LOG, 16);
assert_eq!(sop::RCP, 19);
assert_eq!(sop::RSQ, 22);
assert_eq!(sop::SUBS, 25);
assert_eq!(sop::SETP_EQ, 27);
assert_eq!(sop::KILLS_EQ, 35);
assert_eq!(sop::SQRT, 40);
assert_eq!(sop::SIN, 48);
assert_eq!(sop::RETAIN_PREV, 50);
// Vector.
assert_eq!(vop::SNE, 7);
assert_eq!(vop::CND_EQ, 12);
assert_eq!(vop::MAX4, 19);
assert_eq!(vop::KILL_EQ, 24);
assert_eq!(vop::DST, 28);
}
/// Round-trips a hand-built word2 through `decode_alu` and checks the
/// opcode, destination and vector-write-mask slices. The scalar write
/// mask is deliberately not asserted: its slice (bits 8..=11) overlaps the
/// scalar opcode placed at bit 6, so it is non-zero here.
#[test]
fn decode_extracts_opcodes_and_dests() {
// Build a minimal ALU word:
// vector_opcode = ADD (0), scalar_opcode = RCP (19),
// vector_dest = 3, scalar_dest = 7, vector_write_mask = 0xF
let w2 = (vop::ADD as u32)
| ((sop::RCP as u32) << 6)
| (0xF << 12) // vector_write_mask
| (3u32 << 16) // vector_dest
| (7u32 << 24); // scalar_dest
let alu = decode_alu([0, 0, w2]);
assert_eq!(alu.vector_opcode, vop::ADD);
assert_eq!(alu.scalar_opcode, sop::RCP);
assert_eq!(alu.vector_dest, 3);
assert_eq!(alu.scalar_dest, 7);
assert_eq!(alu.vector_write_mask, 0xF);
}
}

View File

@@ -0,0 +1,173 @@
//! Xenos control-flow clause decoder.
//!
//! A shader's CF block is a sequence of 48-bit clauses packed two-per-
//! three-dword row. Each clause encodes an opcode and type-specific fields
//! (exec addr/count, loop start/end, branch target, etc.).
//!
//! Spec at `xenia-canary/src/xenia/gpu/ucode.h:87-256`. We cover the subset
//! the uber-shader needs: `Exec*`, `Loop*`, `Alloc`, `Jmp`, `Call/Ret`,
//! `Exit`. Unknown opcodes are classified as `Unknown { opcode }` so the
//! translator can log + degrade.
/// Parsed representation of one CF clause.
///
/// Produced by `decode_cf_pair`, two at a time. All numeric fields are raw
/// bit slices of the 48-bit clause payload; none are validated.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ControlFlowInstruction {
/// `kExec` / `kExecEnd` — execute a range of ALU/fetch instructions.
Exec {
/// Where this clause's instructions start within the instruction
/// block, counted in **triples** (one instruction = 3 dwords), not in
/// raw dwords. Multiply by 3 to get a dword offset.
address: u32,
/// Number of triples to execute.
count: u32,
/// The ALU-vs-fetch sequence bitmap (2 bits per instruction).
sequence: u32,
/// True when this clause ends the shader.
is_end: bool,
/// True if predicated; skip when predicate != predicate_condition.
predicated: bool,
/// Predicate value required for a predicated clause to execute.
predicate_condition: bool,
},
/// `kLoopStart` — begin an `aL` loop referencing a loop constant.
LoopStart { address: u32, loop_id: u32 },
/// `kLoopEnd` — close the loop; `address` points at the matching start.
LoopEnd { address: u32, loop_id: u32 },
/// `kCondJmp` — conditional jump to another CF index.
CondJmp {
/// CF index to jump to.
target: u32,
/// True if the jump is predicated.
predicated: bool,
/// Predicate value required for a predicated jump to be taken.
predicate_condition: bool,
},
/// `kCondCall` — call into another CF subroutine.
CondCall { target: u32 },
/// `kReturn` — return from subroutine.
Return,
/// `kAlloc` — pre-allocate export registers (position, interpolators, colors).
Alloc { size: u32, kind: AllocKind },
/// Exit the shader (terminal).
Exit,
/// Unknown / unhandled opcode; carries the raw 4-bit opcode value.
Unknown { opcode: u8 },
}
/// What a `kAlloc` clause reserves export space for.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum AllocKind {
    Position,
    Interpolators,
    Colors,
    Memexport,
    /// Any 3-bit code above 3.
    Other,
}

impl AllocKind {
    /// Map the low three bits of `b` to an [`AllocKind`]; higher bits are
    /// ignored. Codes 0..=3 are named, 4..=7 collapse to `Other`.
    fn from_bits(b: u32) -> Self {
        // One entry per possible 3-bit code, so the index can never be
        // out of range.
        const BY_CODE: [AllocKind; 8] = [
            AllocKind::Position,
            AllocKind::Interpolators,
            AllocKind::Colors,
            AllocKind::Memexport,
            AllocKind::Other,
            AllocKind::Other,
            AllocKind::Other,
            AllocKind::Other,
        ];
        BY_CODE[(b & 0x7) as usize]
    }
}
/// Decode one CF row — three consecutive dwords — into its two clauses.
///
/// Word layout per canary (`ucode.h:218-256`):
/// - CF_A payload = `word0` (bits 0..=31) + low 16 bits of `word1`
///   (bits 32..=47)
/// - CF_B payload = high 16 bits of `word1` (bits 0..=15) + `word2`
///   (bits 16..=47)
///
/// Each 48-bit payload is then decoded independently; its opcode sits in the
/// top 4 bits (bits 44..=47). Returns `(CF_A, CF_B)`.
pub fn decode_cf_pair(word0: u32, word1: u32, word2: u32) -> (ControlFlowInstruction, ControlFlowInstruction) {
    // word1 straddles the two clauses: split it once, then assemble each
    // payload little-end first in a u64.
    let word1_low = u64::from(word1 & 0xFFFF);
    let word1_high = u64::from(word1 >> 16);

    let clause_a = u64::from(word0) | (word1_low << 32);
    let clause_b = word1_high | (u64::from(word2) << 16);

    (decode_single(clause_a), decode_single(clause_b))
}
/// Decode a single 48-bit CF clause payload (held in the low bits of a u64).
///
/// The opcode is the top nibble of the payload (bits 44..=47). Opcodes this
/// table does not list come back as `Unknown { opcode }` rather than
/// failing, so callers can log and degrade.
fn decode_single(payload: u64) -> ControlFlowInstruction {
    // `(payload >> shift) & mask`; every mask used below fits in 32 bits.
    let field = |shift: u32, mask: u64| ((payload >> shift) & mask) as u32;

    let opcode = field(44, 0xF) as u8;

    // Predicate enable and condition are read from bits 28 and 29 for every
    // opcode, but only exec and jump clauses carry them forward. This is a
    // rough extraction — good enough for the interpreter, which logs
    // unknowns.
    let predicated = field(28, 1) != 0;
    let predicate_condition = field(29, 1) != 0;

    match opcode {
        // Exec (0) and ExecEnd (2) share one layout and differ only in
        // whether the clause terminates the shader.
        0 | 2 => ControlFlowInstruction::Exec {
            address: field(0, 0xFFF),
            count: field(12, 0x7),
            sequence: field(16, 0xFFF),
            is_end: opcode == 2,
            predicated,
            predicate_condition,
        },
        1 => ControlFlowInstruction::Exit,
        // LoopStart (6) and LoopEnd (7) also share a layout.
        6 | 7 => {
            let address = field(0, 0x3FF);
            let loop_id = field(16, 0x1F);
            if opcode == 6 {
                ControlFlowInstruction::LoopStart { address, loop_id }
            } else {
                ControlFlowInstruction::LoopEnd { address, loop_id }
            }
        }
        8 => ControlFlowInstruction::CondCall {
            target: field(0, 0x3FF),
        },
        9 => ControlFlowInstruction::Return,
        10 => ControlFlowInstruction::CondJmp {
            target: field(0, 0x3FF),
            predicated,
            predicate_condition,
        },
        12 => ControlFlowInstruction::Alloc {
            size: field(0, 0x7),
            kind: AllocKind::from_bits(field(4, 0x7)),
        },
        unhandled => ControlFlowInstruction::Unknown { opcode: unhandled },
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Split a 48-bit clause-A payload into the `(word0, word1)` pair that
    /// `decode_cf_pair` expects: the low 32 bits go in word0 and the
    /// remaining 16 bits in the low half of word1.
    fn clause_a_words(payload: u64) -> (u32, u32) {
        let word0 = (payload & 0xFFFF_FFFF) as u32;
        let word1 = ((payload >> 32) & 0xFFFF) as u32;
        (word0, word1)
    }

    #[test]
    fn opcode_exit_decodes() {
        // Opcode 1 (Exit) in bits 44..47 of clause A's payload.
        let (word0, word1) = clause_a_words(1u64 << 44);
        let (first, _) = decode_cf_pair(word0, word1, 0);
        assert_eq!(first, ControlFlowInstruction::Exit);
    }

    #[test]
    fn opcode_exec_end_carries_address_count() {
        // Opcode 2 (ExecEnd) with address=4, count=2, sequence=0.
        let (word0, word1) = clause_a_words((2u64 << 44) | (2u64 << 12) | 4);
        let (first, _) = decode_cf_pair(word0, word1, 0);
        match first {
            ControlFlowInstruction::Exec {
                address,
                count,
                is_end,
                ..
            } => {
                assert_eq!(address, 4);
                assert_eq!(count, 2);
                assert!(is_end);
            }
            other => panic!("expected Exec, got {other:?}"),
        }
    }
}

View File

@@ -0,0 +1,117 @@
//! Xenos fetch (vertex + texture) instruction decoder.
//!
//! Like ALU instructions, fetches are 96 bits (3 dwords). The opcode lives
//! in the low 5 bits of word0. We split them into `VertexFetch` and
//! `TextureFetch` structurally because their operand layouts differ.
//!
//! Reference: `xenia-canary/src/xenia/gpu/ucode.h:690-877`.
/// A decoded 96-bit fetch instruction, classified by its opcode.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FetchInstruction {
    /// Opcode [`op::VERTEX_FETCH`].
    Vertex(VertexFetch),
    /// Opcode [`op::TEXTURE_FETCH`].
    Texture(TextureFetch),
    /// Every other opcode (including the named `GET_*` / `SET_*` ones in
    /// [`op`]): only the opcode and the raw dwords are kept.
    Unknown { opcode: u8, raw: [u32; 3] },
}

/// Operands of a vertex fetch, as sliced out by [`decode_fetch`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct VertexFetch {
    /// Vertex fetch constant index.
    ///
    /// NOTE(review): this was documented as `0..=95`, but `decode_fetch`
    /// fills it from a 5-bit slice (word0 bits 5..=9), so the value here is
    /// always in `0..=31` — confirm which field(s) should feed it.
    pub fetch_const: u8,
    /// Source register index (vertex index in r#): word0 bits 17..=23.
    pub src_register: u8,
    /// Destination register for the fetched value: word0 bits 10..=16.
    pub dest_register: u8,
    /// 4-bit write mask: word1 bits 23..=26.
    pub dest_write_mask: u8,
    /// The three source dwords, untouched.
    pub raw: [u32; 3],
}

/// Operands of a texture fetch, as sliced out by [`decode_fetch`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct TextureFetch {
    /// Texture fetch constant index (0..=31): word0 bits 5..=9.
    pub fetch_const: u8,
    /// Source register index: word0 bits 17..=23.
    pub src_register: u8,
    /// Destination register index: word0 bits 10..=16.
    pub dest_register: u8,
    /// 4-bit write mask: word1 bits 23..=26.
    pub dest_write_mask: u8,
    /// Dimension: 0=1D, 1=2D, 2=3D/stacked, 3=cube (word1 bits 29..=30).
    pub dimension: u8,
    /// The three source dwords, untouched.
    pub raw: [u32; 3],
}

/// Fetch opcodes (low 5 bits of word0). From `ucode.h`.
///
/// Only `VERTEX_FETCH` and `TEXTURE_FETCH` are decoded structurally; the
/// rest are named so callers can recognise them inside
/// `FetchInstruction::Unknown`.
pub mod op {
    pub const VERTEX_FETCH: u8 = 0x00;
    pub const TEXTURE_FETCH: u8 = 0x01;
    pub const GET_TEXTURE_BORDER_COLOR_FRAC: u8 = 0x16;
    pub const GET_TEXTURE_COMPUTED_LOD: u8 = 0x17;
    pub const GET_TEXTURE_WEIGHTS: u8 = 0x18;
    pub const GET_TEXTURE_GRADIENTS: u8 = 0x19;
    pub const SET_TEXTURE_LOD: u8 = 0x1A;
    pub const SET_TEXTURE_GRADIENTS_HORZ: u8 = 0x1B;
    pub const SET_TEXTURE_GRADIENTS_VERT: u8 = 0x1C;
}

/// Decode a 3-dword fetch triple.
///
/// The opcode is the low 5 bits of word 0. Vertex and texture fetches share
/// the same operand slices (texture adds `dimension`); any other opcode is
/// returned as `FetchInstruction::Unknown` with the raw dwords. Word 2 is
/// never interpreted. Never fails or panics.
pub fn decode_fetch(words: [u32; 3]) -> FetchInstruction {
    let [w0, w1, _] = words;
    let opcode = (w0 & 0x1F) as u8;

    let is_vertex = opcode == op::VERTEX_FETCH;
    if !is_vertex && opcode != op::TEXTURE_FETCH {
        return FetchInstruction::Unknown { opcode, raw: words };
    }

    // Operand slices common to both fetch kinds.
    let slice = |word: u32, shift: u32, mask: u32| ((word >> shift) & mask) as u8;
    let fetch_const = slice(w0, 5, 0x1F);
    let dest_register = slice(w0, 10, 0x7F);
    let src_register = slice(w0, 17, 0x7F);
    let dest_write_mask = slice(w1, 23, 0xF);

    if is_vertex {
        FetchInstruction::Vertex(VertexFetch {
            fetch_const,
            src_register,
            dest_register,
            dest_write_mask,
            raw: words,
        })
    } else {
        FetchInstruction::Texture(TextureFetch {
            fetch_const,
            src_register,
            dest_register,
            dest_write_mask,
            dimension: slice(w1, 29, 0x3),
            raw: words,
        })
    }
}
#[cfg(test)]
mod tests {
use super::*;
/// Vertex fetch: the word0 operand slices come back in the right fields.
/// (The write mask in word1 is left at zero and not asserted.)
#[test]
fn decode_vertex_fetch() {
// opcode=0 (vertex), fetch_const=5, src=2, dest=7.
let w0 = 0u32 | (5 << 5) | (7 << 10) | (2 << 17);
let v = decode_fetch([w0, 0, 0]);
match v {
FetchInstruction::Vertex(vf) => {
assert_eq!(vf.fetch_const, 5);
assert_eq!(vf.src_register, 2);
assert_eq!(vf.dest_register, 7);
}
other => panic!("expected Vertex, got {other:?}"),
}
}
/// Texture fetch: checks the fetch constant from word0 and the dimension
/// field from word1 bits 29..=30.
#[test]
fn decode_texture_fetch() {
// opcode=1 (texture), fetch_const=3, dest=4, src=1; dimension=2 in word1.
let w0 = 1u32 | (3 << 5) | (4 << 10) | (1 << 17);
let t = decode_fetch([w0, (2u32 << 29), 0]);
match t {
FetchInstruction::Texture(tf) => {
assert_eq!(tf.fetch_const, 3);
assert_eq!(tf.dimension, 2);
}
other => panic!("expected Texture, got {other:?}"),
}
}
/// Opcodes that are named in `op` but not modelled structurally still
/// decode to `Unknown`, carrying the opcode through.
#[test]
fn unknown_opcode_is_classified() {
let v = decode_fetch([0x16, 0, 0]); // GET_TEXTURE_BORDER_COLOR_FRAC
assert!(matches!(v, FetchInstruction::Unknown { opcode: 0x16, .. }));
}
}

View File

@@ -0,0 +1,249 @@
//! Xenos (ATI R500-family) shader microcode decoder.
//!
//! Ground truth: `xenia-canary/src/xenia/gpu/ucode.h`. We parse only what a
//! shader *interpreter* (P3 uber-shader) needs: control-flow clauses, ALU
//! instructions (vector + scalar pipes), and fetch instructions (vertex +
//! texture). The uber-shader consumes this IR directly; when a WGSL-emitting
//! translator comes online in P7, it reuses the same parser.
//!
//! ## Binary layout
//!
//! A compiled shader has two sections back-to-back:
//!
//! 1. **Control-flow block** — a sequence of clause pairs, 96 bits per
//! pair. Canary packs two 48-bit clauses into three 32-bit words:
//! ```text
//! word0 word1 word2
//! [-CF_A (48)-][-CF_B (48)-]
//! ```
//! Word 0 is the low 32 of CF_A; word 1's low 16 bits finish CF_A and
//! its high 16 bits start CF_B; word 2 holds CF_B's remaining 32 bits.
//!
//! 2. **Instruction block** — variable-size array of 96-bit ALU / fetch
//! instructions. Each control-flow clause of kind `Exec*` references a
//! contiguous range of these by `(address, count)`, both counted in whole
//! instructions (multiply by 3 to get a dword offset / length).
//!
//! Guest memory holds the microcode as big-endian dwords, but this module
//! never byte-swaps: the `&[u32]` slice it is handed has already been
//! converted to host endianness by the PM4 executor that cached the shader
//! blob. See `ucode.h:218-256` for the exec clause bit
//! layout and `:700-877` for the fetch/ALU mix.
pub mod alu;
pub mod control_flow;
pub mod fetch;
use self::alu::AluInstruction;
use self::control_flow::{AllocKind, ControlFlowInstruction, decode_cf_pair};
use self::fetch::FetchInstruction;
/// CF-clause kind codes encoded into the WGSL-facing packed shader. Kept
/// in sync with `shaders/xenos_interp.wgsl`'s `CF_KIND_*` constants.
///
/// These occupy the low bits of each clause's `kind` dword. For `EXEC` /
/// `EXEC_END`, `encode_cf` additionally ORs the predicate flags in starting
/// at bit 8, so consumers must mask (e.g. `& 0xFF`) before comparing.
/// Values 9..=14 are unassigned.
pub mod cf_kind {
pub const EXEC: u32 = 0;
pub const EXEC_END: u32 = 1;
pub const ALLOC: u32 = 2;
pub const EXIT: u32 = 3;
pub const LOOP_START: u32 = 4;
pub const LOOP_END: u32 = 5;
pub const COND_JMP: u32 = 6;
pub const COND_CALL: u32 = 7;
pub const RETURN: u32 = 8;
pub const UNKNOWN: u32 = 15;
}
/// Alloc-kind codes emitted for an `Alloc` clause in the packed shader.
///
/// One code per `AllocKind` variant; `encode_cf` places the code in the
/// clause's *primary* dword and the allocation size in its aux dword.
pub mod cf_alloc_kind {
pub const POSITION: u32 = 0;
pub const INTERPOLATORS: u32 = 1;
pub const COLORS: u32 = 2;
pub const MEMEXPORT: u32 = 3;
pub const OTHER: u32 = 4;
}
/// Flatten a [`ParsedShader`] into the dense dword stream consumed by the
/// WGSL runtime interpreter:
///
/// ```text
/// [0]                       cf_count
/// [1 .. 1 + cf_count*3]     CF table: (kind, primary, aux) triples per clause
/// [1 + cf_count*3 ..]       raw 3-dword instruction stream (ALU/fetch)
/// ```
///
/// Pre-decoding the CF table on the CPU means the shader never has to
/// unpack the 48-bit clause layout itself. Meaning of each triple:
///
/// | kind          | primary                        | aux                     |
/// |---------------|--------------------------------|-------------------------|
/// | EXEC/EXEC_END | address (in triples)           | (sequence<<8) \| count  |
/// | ALLOC         | alloc_kind (see cf_alloc_kind) | size                    |
/// | EXIT          | 0                              | 0                       |
/// | LOOP_START    | address                        | loop_id                 |
/// | LOOP_END      | address                        | loop_id                 |
/// | COND_JMP      | target                         | predicate flags         |
/// | COND_CALL     | target                         | 0                       |
/// | RETURN        | 0                              | 0                       |
/// | UNKNOWN       | opcode                         | 0                       |
///
/// For EXEC/EXEC_END the `kind` dword also carries the predicate flags in
/// bits 8..=9 (bit 8 = predicated, bit 9 = condition); the kind code itself
/// is in the low 8 bits.
pub fn pack_for_wgsl(parsed: &ParsedShader) -> Vec<u32> {
    let cf_count = parsed.cf.len() as u32;

    // Header dword + three dwords per clause + the instruction stream.
    let total_dwords = 1 + (cf_count as usize) * 3 + parsed.instructions.len();
    let mut packed = Vec::with_capacity(total_dwords);

    packed.push(cf_count);
    for &clause in parsed.cf.iter() {
        let (kind, primary, aux) = encode_cf(clause);
        packed.extend_from_slice(&[kind, primary, aux]);
    }
    packed.extend_from_slice(&parsed.instructions);

    packed
}
fn encode_cf(c: ControlFlowInstruction) -> (u32, u32, u32) {
use ControlFlowInstruction::*;
match c {
Exec {
address,
count,
sequence,
is_end,
predicated,
predicate_condition,
} => {
let pred_bits = (predicated as u32) | ((predicate_condition as u32) << 1);
let kind = if is_end { cf_kind::EXEC_END } else { cf_kind::EXEC }
| (pred_bits << 8);
(kind, address, (sequence << 8) | count)
}
Alloc { size, kind } => {
let akind = match kind {
AllocKind::Position => cf_alloc_kind::POSITION,
AllocKind::Interpolators => cf_alloc_kind::INTERPOLATORS,
AllocKind::Colors => cf_alloc_kind::COLORS,
AllocKind::Memexport => cf_alloc_kind::MEMEXPORT,
AllocKind::Other => cf_alloc_kind::OTHER,
};
(cf_kind::ALLOC, akind, size)
}
Exit => (cf_kind::EXIT, 0, 0),
LoopStart { address, loop_id } => (cf_kind::LOOP_START, address, loop_id),
LoopEnd { address, loop_id } => (cf_kind::LOOP_END, address, loop_id),
CondJmp {
target,
predicated,
predicate_condition,
} => {
let pred_bits = (predicated as u32) | ((predicate_condition as u32) << 1);
(cf_kind::COND_JMP, target, pred_bits)
}
CondCall { target } => (cf_kind::COND_CALL, target, 0),
Return => (cf_kind::RETURN, 0, 0),
Unknown { opcode } => (cf_kind::UNKNOWN, opcode as u32, 0),
}
}
/// One decoded 96-bit instruction from the instruction-block section. Xenos
/// packs ALU and fetch instructions identically (96 bits each); the owning
/// exec clause's "sequence" bitmap decides which is which.
///
/// NOTE(review): nothing in the visible part of this module constructs this
/// type — `ParsedShader` keeps the instruction stream as raw dwords.
/// Presumably it is used by the translator; confirm.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DecodedInstruction {
/// ALU pipe (vector ALU + optional co-issued scalar ALU).
Alu(AluInstruction),
/// Vertex or texture fetch.
Fetch(FetchInstruction),
}
/// Parsed shader: the decoded control-flow clause list plus the raw,
/// still-undecoded 32-bit instruction words.
///
/// `Exec` clause addresses and counts are in instruction (triple) units, so
/// the uber-shader / translator is expected to index into `instructions`
/// with `(clause.address * 3, clause.count * 3)`.
#[derive(Debug, Clone, Default)]
pub struct ParsedShader {
/// Clauses in stream order. `parse_shader` always appends them two at a
/// time (one pair per 3-dword row), so a clause after the terminating one
/// may be present.
pub cf: Vec<ControlFlowInstruction>,
/// Raw instruction dwords. Each 3-dword triple is one ALU or fetch
/// instruction; the owning `Exec` clause's `sequence` bitmap picks the
/// kind.
pub instructions: Vec<u32>,
}
/// Decode a shader blob. `raw_dwords` is a host-endian slice of the entire
/// microcode buffer (control flow + instructions). Heuristic: CF dword count
/// is encoded in the first word's low 12 bits of the last exec clause —
/// canary iterates until it hits a clause of kind `Exit`. We do the same.
pub fn parse_shader(raw_dwords: &[u32]) -> ParsedShader {
let mut cf = Vec::new();
// CF clauses are 48-bit (word1 lo 16 + word0 = 48 or so per canary's
// layout). Walk pairs of 3 dwords per pair of clauses.
let mut i = 0usize;
while i + 2 < raw_dwords.len() {
let a = decode_cf_pair(raw_dwords[i], raw_dwords[i + 1], raw_dwords[i + 2]);
let (first, second) = a;
let seen_exit = matches!(
first,
ControlFlowInstruction::Exit | ControlFlowInstruction::Unknown { .. }
) || matches!(
second,
ControlFlowInstruction::Exit | ControlFlowInstruction::Unknown { .. }
);
cf.push(first);
cf.push(second);
i += 3;
if seen_exit {
break;
}
}
// Everything after `i` dwords is the instruction block.
let instructions = raw_dwords[i..].to_vec();
ParsedShader { cf, instructions }
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn empty_blob_parses_empty() {
        let p = parse_shader(&[]);
        assert!(p.cf.is_empty());
        assert!(p.instructions.is_empty());
    }

    #[test]
    fn pack_for_wgsl_layout_is_correct() {
        // Build a tiny ParsedShader by hand and verify the packed form.
        let parsed = ParsedShader {
            cf: vec![
                ControlFlowInstruction::Exec {
                    address: 0x10,
                    count: 3,
                    sequence: 0b1010,
                    is_end: false,
                    predicated: false,
                    predicate_condition: false,
                },
                ControlFlowInstruction::Exit,
            ],
            instructions: vec![0x1111, 0x2222, 0x3333],
        };
        let packed = pack_for_wgsl(&parsed);
        assert_eq!(packed[0], 2, "cf_count");
        // First clause: EXEC, address=0x10, aux = (sequence<<8)|count = 0x0A03
        assert_eq!(packed[1] & 0xFF, cf_kind::EXEC);
        assert_eq!(packed[2], 0x10);
        assert_eq!(packed[3], (0b1010 << 8) | 3);
        // Second clause: EXIT
        assert_eq!(packed[4] & 0xFF, cf_kind::EXIT);
        // Instruction block starts at 1 + 2*3 = 7
        assert_eq!(packed[7..], [0x1111, 0x2222, 0x3333]);
    }

    #[test]
    fn trivial_exit_clause_stops_parsing() {
        // Clause A's payload is word0 plus the LOW 16 bits of word1, and its
        // opcode is payload bits 44..47 — i.e. word1 bits 12..15. Putting the
        // opcode in word1's high half (as this test used to) lands in clause
        // B's low bits and never produces an Exit.
        let w0 = 0u32;
        let w1 = 1u32 << 12; // clause A opcode = 1 (Exit)
        let w2 = 0u32; // clause B = opcode 0
        // A full second row follows, so the walk would keep going if the
        // Exit were not detected.
        let p = parse_shader(&[w0, w1, w2, 0xDEAD_BEEF, 0x1111_1111, 0x2222_2222]);
        assert_eq!(p.cf.len(), 2);
        assert_eq!(p.cf[0], ControlFlowInstruction::Exit);
        // Exit detected → every remaining dword is instruction data.
        assert_eq!(p.instructions, vec![0xDEAD_BEEF, 0x1111_1111, 0x2222_2222]);
    }

    #[test]
    fn trailing_partial_row_becomes_instructions() {
        // No terminating clause here (both clauses decode as opcode 0); the
        // walk stops only because fewer than three dwords remain, and the
        // leftover dword is handed back as instruction data.
        let p = parse_shader(&[0, 1u32 << 28, 0, 0xDEAD_BEEF]);
        assert_eq!(p.cf.len(), 2);
        assert!(p
            .cf
            .iter()
            .all(|c| matches!(c, ControlFlowInstruction::Exec { .. })));
        assert_eq!(p.instructions, vec![0xDEAD_BEEF]);
    }
}