//! Xenos (ATI R500-family) shader microcode decoder. //! //! Ground truth: `xenia-canary/src/xenia/gpu/ucode.h`. We parse only what a //! shader *interpreter* (P3 uber-shader) needs: control-flow clauses, ALU //! instructions (vector + scalar pipes), and fetch instructions (vertex + //! texture). The uber-shader consumes this IR directly; when a WGSL-emitting //! translator comes online in P7, it reuses the same parser. //! //! ## Binary layout //! //! A compiled shader has two sections back-to-back: //! //! 1. **Control-flow block** — `cf_count` 64-bit clause pairs. Canary packs //! two clauses into three 32-bit words: //! ```text //! word0 word1 word2 //! [-CF_A (48)-][-CF_B (48)-] //! ``` //! Word 0 is the low 32 of CF_A; word 1's low 16 bits finish CF_A and //! its high 16 bits start CF_B; word 2 holds CF_B's remaining 32 bits. //! //! 2. **Instruction block** — variable-size array of 96-bit ALU / fetch //! instructions. Each control-flow clause of kind `Exec*` references a //! contiguous range of these by `(address, count)` in dwords * 3. //! //! We read big-endian dwords straight out of guest memory (the `raw` //! `&[u32]` slice is already host-endian-corrected by the PM4 executor that //! cached the shader blob). See `ucode.h:218-256` for the exec clause bit //! layout and `:700-877` for the fetch/ALU mix. pub mod alu; pub mod control_flow; pub mod fetch; use self::alu::AluInstruction; use self::control_flow::{AllocKind, ControlFlowInstruction, decode_cf_pair}; use self::fetch::FetchInstruction; /// CF-clause kind codes encoded into the WGSL-facing packed shader. Kept /// in sync with `shaders/xenos_interp.wgsl`'s `CF_KIND_*` constants. pub mod cf_kind { pub const EXEC: u32 = 0; pub const EXEC_END: u32 = 1; pub const ALLOC: u32 = 2; pub const EXIT: u32 = 3; pub const LOOP_START: u32 = 4; pub const LOOP_END: u32 = 5; pub const COND_JMP: u32 = 6; pub const COND_CALL: u32 = 7; pub const RETURN: u32 = 8; /// Non-executing CF clause: `kNop` padding or `kMarkVsFetchDone` hint. /// The WGSL CF walker treats this as a no-op (advance, do not reject). pub const NOP: u32 = 9; pub const UNKNOWN: u32 = 15; } /// Alloc-kind codes, packed into the aux dword of an `Alloc` clause. pub mod cf_alloc_kind { pub const POSITION: u32 = 0; pub const INTERPOLATORS: u32 = 1; pub const COLORS: u32 = 2; pub const MEMEXPORT: u32 = 3; pub const OTHER: u32 = 4; } /// Pack a [`ParsedShader`] into the dense dword layout the WGSL runtime /// interpreter expects: /// /// ```text /// [0] cf_count /// [1 .. 1 + cf_count*3] CF table: (kind, primary, aux) triples per clause /// [1 + cf_count*3 ..] raw 3-dword instruction stream (ALU/fetch) /// ``` /// /// The CF table lets WGSL walk clauses without reconstructing bit-packed /// layouts on the GPU. Semantics per `kind`: /// /// | kind | primary | aux | /// |-------------|----------------------------|------------------------------| /// | EXEC/EXEC_END | address (in triples) | (sequence<<8) \| count | /// | ALLOC | alloc_kind (see cf_alloc_kind) | size | /// | EXIT | 0 | 0 | /// | LOOP_START | address | loop_id | /// | LOOP_END | address | loop_id | /// | COND_JMP | target | predicate flags | /// | COND_CALL | target | 0 | /// | RETURN | 0 | 0 | /// | UNKNOWN | opcode | 0 | pub fn pack_for_wgsl(parsed: &ParsedShader) -> Vec { let cf_count = parsed.cf.len() as u32; let mut out = Vec::with_capacity(1 + (cf_count as usize) * 3 + parsed.instructions.len()); out.push(cf_count); for clause in &parsed.cf { let (kind, primary, aux) = encode_cf(*clause); out.push(kind); out.push(primary); out.push(aux); } out.extend_from_slice(&parsed.instructions); out } fn encode_cf(c: ControlFlowInstruction) -> (u32, u32, u32) { use ControlFlowInstruction::*; match c { Exec { address, count, sequence, is_end, predicated, predicate_condition, } => { let pred_bits = (predicated as u32) | ((predicate_condition as u32) << 1); let kind = if is_end { cf_kind::EXEC_END } else { cf_kind::EXEC } | (pred_bits << 8); (kind, address, (sequence << 8) | count) } Alloc { size, kind } => { let akind = match kind { AllocKind::Position => cf_alloc_kind::POSITION, AllocKind::Interpolators => cf_alloc_kind::INTERPOLATORS, AllocKind::Colors => cf_alloc_kind::COLORS, AllocKind::Memexport => cf_alloc_kind::MEMEXPORT, AllocKind::Other => cf_alloc_kind::OTHER, }; (cf_kind::ALLOC, akind, size) } Exit => (cf_kind::EXIT, 0, 0), LoopStart { address, loop_id } => (cf_kind::LOOP_START, address, loop_id), LoopEnd { address, loop_id } => (cf_kind::LOOP_END, address, loop_id), CondJmp { target, predicated, predicate_condition, } => { let pred_bits = (predicated as u32) | ((predicate_condition as u32) << 1); (cf_kind::COND_JMP, target, pred_bits) } CondCall { target } => (cf_kind::COND_CALL, target, 0), Return => (cf_kind::RETURN, 0, 0), Nop | MarkVsFetchDone => (cf_kind::NOP, 0, 0), Unknown { opcode } => (cf_kind::UNKNOWN, opcode as u32, 0), } } /// One instruction word set from the instruction-block section. Xenos packs /// ALU and fetch instructions identically (96 bits each); the owning exec /// clause's "sequence" bitmap decides which is which. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum DecodedInstruction { /// ALU pipe (vector ALU + optional co-issued scalar ALU). Alu(AluInstruction), /// Vertex or texture fetch. Fetch(FetchInstruction), } /// Parsed shader: the control-flow clause list + the raw 32-bit instruction /// words. The uber-shader / translator is expected to index into /// `instructions` based on `(clause.address * 3, clause.count * 3)`. #[derive(Debug, Clone, Default)] pub struct ParsedShader { pub cf: Vec, /// Raw instruction dwords. Each 3-dword triple is one ALU or fetch /// instruction; the owning `Exec` clause's `sequence` bitmap picks the /// kind. pub instructions: Vec, } /// Decode a shader blob. `raw_dwords` is a host-endian slice of the entire /// microcode buffer (control flow + instructions). The CF block is implicitly /// bounded: we walk clause-pair rows until one terminates the shader (an /// `Exec`/`CondExec` clause with the END bit set, per Xenos). Everything after /// that row is the instruction block; exec/loop addresses are then rebased to /// be relative to it. pub fn parse_shader(raw_dwords: &[u32]) -> ParsedShader { let mut cf = Vec::new(); // CF clauses are 48-bit (word1 lo 16 + word0 = 48 or so per canary's // layout). Walk pairs of 3 dwords per pair of clauses. let mut i = 0usize; while i + 2 < raw_dwords.len() { let a = decode_cf_pair(raw_dwords[i], raw_dwords[i + 1], raw_dwords[i + 2]); let (first, second) = a; // The CF block ends after the clause that terminates the shader: an // `Exec` with the END bit set (Xenos `kExecEnd`/`kCondExec*End`), a // synthetic `Exit`, or an `Unknown` opcode (decode ran off the CF // block into instruction data — stop defensively). `Nop` padding // does NOT terminate. (Previously this stopped on the first `Exit`, // but with the corrected opcode table opcode 1 is `kExec`, not exit, // so real exec clauses kept the parse going as intended.) let terminates = |cf: &ControlFlowInstruction| { matches!( cf, ControlFlowInstruction::Exec { is_end: true, .. } | ControlFlowInstruction::Exit | ControlFlowInstruction::Unknown { .. } ) }; let seen_end = terminates(&first) || terminates(&second); cf.push(first); cf.push(second); i += 3; if seen_end { break; } } // Everything after `i` dwords is the instruction block. let instructions = raw_dwords[i..].to_vec(); // Xenos exec/loop `address` fields are absolute instruction-triple indices // counted from shader dword 0, but `instructions` here begins *after* the // CF block. Rebase those addresses to be relative to the instruction block // (subtract the CF triple count) so `address * 3` indexes `instructions` // directly. (Without this, every exec read 3 dwords too far per CF triple — // the publisher-logo `tfetch` triple was skipped → flat splash.) let cf_triples = (i / 3) as u32; for clause in cf.iter_mut() { match clause { ControlFlowInstruction::Exec { address, .. } => { *address = address.saturating_sub(cf_triples); } ControlFlowInstruction::LoopStart { address, .. } | ControlFlowInstruction::LoopEnd { address, .. } => { *address = address.saturating_sub(cf_triples); } _ => {} } } ParsedShader { cf, instructions } } #[cfg(test)] mod tests { use super::*; #[test] fn empty_blob_parses_empty() { let p = parse_shader(&[]); assert!(p.cf.is_empty()); assert!(p.instructions.is_empty()); } #[test] fn pack_for_wgsl_layout_is_correct() { // Build a tiny ParsedShader by hand and verify the packed form. let parsed = ParsedShader { cf: vec![ ControlFlowInstruction::Exec { address: 0x10, count: 3, sequence: 0b1010, is_end: false, predicated: false, predicate_condition: false, }, ControlFlowInstruction::Exit, ], instructions: vec![0x1111, 0x2222, 0x3333], }; let packed = pack_for_wgsl(&parsed); assert_eq!(packed[0], 2, "cf_count"); // First clause: EXEC, address=0x10, aux = (sequence<<8)|count = 0x0A03 assert_eq!(packed[1] & 0xFF, cf_kind::EXEC); assert_eq!(packed[2], 0x10); assert_eq!(packed[3], (0b1010 << 8) | 3); // Second clause: EXIT assert_eq!(packed[4] & 0xFF, cf_kind::EXIT); // Instruction block starts at 1 + 2*3 = 7 assert_eq!(packed[7..], [0x1111, 0x2222, 0x3333]); } #[test] fn exec_end_clause_stops_parsing() { // Row: clause B = kExecEnd (opcode 2) terminates the CF block. // 48-bit payload of B occupies hi16(word1) + word2; opcode lives in // bits 44..47 of that payload. Put opcode 2 there: payload bit 44 set // for the `2` → (2 << 44). In B's framing, bits 16..47 come from // word2, so word2 bit (44-16)=28 region holds the opcode nibble. let b_payload: u64 = 2u64 << 44; // kExecEnd // B = lo16 from hi16(word1), hi from word2. Reconstruct word1/word2. let word1 = ((b_payload & 0xFFFF) as u32) << 16; // B's low 16 bits → hi16(word1) let word2 = ((b_payload >> 16) & 0xFFFF_FFFF) as u32; let p = parse_shader(&[0, word1, word2, 0xDEAD_BEEF]); assert!(!p.cf.is_empty()); // ExecEnd detected in the first row → remaining dword is instruction data. assert_eq!(p.instructions, vec![0xDEAD_BEEF]); } }