From 6bb4355e3d50084af76ac2722eb580e98ddf36c9 Mon Sep 17 00:00:00 2001 From: MechaCat02 Date: Wed, 17 Jun 2026 21:53:35 +0200 Subject: [PATCH] [iterate-3M] Fix Xenos shader CF/fetch decode so the textured logo binds MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The publisher splash (title idx0) rendered FLAT in our emulator while canary samples a texture: our decoder never recognized the logo's textured pixel shader (E59B2B3D, a `tfetch2D` sprite) even though our guest IM_LOADs exactly the same microcode that canary does (verified byte-identical against the Wine oracle). The shader was misparsed as flat. Three coupled bugs in the ucode decoder, all deviations from canary `gpu/ucode.h`: 1. The CF opcode table was off-by-one (`control_flow.rs`): it mapped opcode 0→Exec and 1→Exit, but Xenos has 0=kNop, 1=kExec, 2=kExecEnd, 3..6/13..14 the cond-exec variants, 7/8 loop, 9/10 call/return, 11 condjmp, 12 alloc, 15 mark-vs-fetch-done. So a real `kExec` clause was read as a terminal `Exit`, truncating the CF block and dropping every instruction (incl. the `tfetch`) after it. Added Nop/MarkVsFetchDone variants; the parse now ends on an END-bit exec clause. 2. The exec/loop `address` is an absolute instruction-triple index from shader dword 0, but we used it to index our post-CF `instructions` slice directly (`ucode/mod.rs`). Rebase addresses by the CF triple count so `address*3` lands on the right instruction. 3. Fetch instruction bitfields were wrong (`ucode/fetch.rs`): `const_index` was read from bit 5 (actually `src_reg`) instead of bit 20, and texture `dimension` from dword1 instead of dword2 bits 14..15. The logo's `tfetch ..,tf0` was read as `tf1`, whose empty fetch-constant failed to decode → no texture. Also, the `sequence` fetch/ALU bit is bit[0] of each pair, not bit[1] (`shader_metrics.rs`, `translator.rs`, `xenos_interp.wgsl`). 
Result (--gpu-inline, deterministic 2x): the active PS's `tfetch_slots` now resolves slot 0, the tf0 fetch-constant decodes (fmt K8888), and `gpu.texture.decode` fires (137x at -n 50M; texture_cache_entries 0→1, the only golden field that changed — all draw/swap counts unchanged). The same fixes correct the WGSL uber-shader's fetch/CF walk for the threaded/--ui path. Added a regression test that parses the real E59B2B3D microcode and asserts a tfetch slot is found. Golden re-baselined (texture_cache_entries 0→1). Co-Authored-By: Claude Opus 4.8 (1M context) --- .../xenia-app/tests/golden/sylpheed_n50m.json | 2 +- crates/xenia-gpu/src/shader_metrics.rs | 22 ++-- .../xenia-gpu/src/shaders/xenos_interp.wgsl | 30 ++--- crates/xenia-gpu/src/translator.rs | 8 +- crates/xenia-gpu/src/ucode/control_flow.rs | 110 +++++++++++++----- crates/xenia-gpu/src/ucode/fetch.rs | 41 ++++--- crates/xenia-gpu/src/ucode/mod.rs | 76 +++++++++--- 7 files changed, 205 insertions(+), 84 deletions(-) diff --git a/crates/xenia-app/tests/golden/sylpheed_n50m.json b/crates/xenia-app/tests/golden/sylpheed_n50m.json index 457bac7..9687b99 100644 --- a/crates/xenia-app/tests/golden/sylpheed_n50m.json +++ b/crates/xenia-app/tests/golden/sylpheed_n50m.json @@ -6,5 +6,5 @@ "swaps": 147, "unique_render_targets": 2, "shader_blobs_live": 6, - "texture_cache_entries": 0 + "texture_cache_entries": 1 } diff --git a/crates/xenia-gpu/src/shader_metrics.rs b/crates/xenia-gpu/src/shader_metrics.rs index c5d866b..5448a11 100644 --- a/crates/xenia-gpu/src/shader_metrics.rs +++ b/crates/xenia-gpu/src/shader_metrics.rs @@ -45,8 +45,9 @@ pub fn emit_for(parsed: &ParsedShader, stage: &'static str) { parsed.instructions[base + 1], parsed.instructions[base + 2], ]; - // sequence bit layout: 2 bits per triple, hi bit = is-fetch. - let is_fetch = ((sequence >> (i * 2 + 1)) & 1) != 0; + // sequence: 2 bits per instruction — bit[0]=fetch(1)/ALU(0), + // bit[1]=serialize (Xenos `ucode.h:226`). 
+ let is_fetch = ((sequence >> (i * 2)) & 1) != 0; if is_fetch { match decode_fetch(words) { FetchInstruction::Vertex(_) => vfetch_count += 1, @@ -196,8 +197,9 @@ pub fn tfetch_slots(parsed: &ParsedShader) -> Vec { if base + 2 >= parsed.instructions.len() { break; } - // sequence bit layout: 2 bits per triple, hi bit = is-fetch. - let is_fetch = ((sequence >> (i * 2 + 1)) & 1) != 0; + // sequence: 2 bits per instruction — bit[0]=fetch(1)/ALU(0), + // bit[1]=serialize (Xenos `ucode.h:226`). + let is_fetch = ((sequence >> (i * 2)) & 1) != 0; if !is_fetch { continue; } @@ -345,17 +347,17 @@ mod tests { /// fetch (and dedup), and return empty for a flat ALU-only shader. #[test] fn tfetch_slots_extracts_texture_fetch_constants() { - // word0: opcode TEXTURE_FETCH (0x01) in low 5 bits, fetch_const=3 in - // bits[9:5] → 0x01 | (3 << 5) = 0x61. - let tfetch_w0: u32 = 0x01 | (3u32 << 5); + // word0: opcode TEXTURE_FETCH (0x01) in low 5 bits, const_index=3 in + // bits[24:20] (Xenos `ucode.h:844`) → 0x01 | (3 << 20). + let tfetch_w0: u32 = 0x01 | (3u32 << 20); let shader = ParsedShader { cf: vec![ ControlFlowInstruction::Exec { address: 0, count: 2, - // triple 0 is a fetch (hi bit of its 2-bit field set), - // triple 1 is ALU. is_fetch = (sequence >> (i*2+1)) & 1. - sequence: 0b00_10, + // instruction 0 is a fetch (bit[0] of its 2-bit field set), + // instruction 1 is ALU. is_fetch = (sequence >> (i*2)) & 1. 
+ sequence: 0b00_01, is_end: false, predicated: false, predicate_condition: false, diff --git a/crates/xenia-gpu/src/shaders/xenos_interp.wgsl b/crates/xenia-gpu/src/shaders/xenos_interp.wgsl index 6e97d86..7b1d969 100644 --- a/crates/xenia-gpu/src/shaders/xenos_interp.wgsl +++ b/crates/xenia-gpu/src/shaders/xenos_interp.wgsl @@ -56,6 +56,7 @@ const CF_KIND_LOOP_END: u32 = 5u; const CF_KIND_COND_JMP: u32 = 6u; const CF_KIND_COND_CALL: u32 = 7u; const CF_KIND_RETURN: u32 = 8u; +const CF_KIND_NOP: u32 = 9u; const CF_KIND_UNKNOWN: u32 = 15u; // ── Alloc-kind codes (mirrors `xenia_gpu::ucode::cf_alloc_kind`). ────── @@ -628,8 +629,8 @@ const VFMT_32_32_32_FLOAT: u32 = 57u; // layout in `ucode.h:690`): // w0 [4:0] opcode // w0 [10:5] src_reg[5:0] -// w0 [17:11] dst_reg[6:0] + must-be-one -// w0 [21:17] const_index[4:0], [23:22] const_index_sel[1:0] +// w0 [17:12] dst_reg[5:0] +// w0 [24:20] const_index[4:0], [26:25] const_index_sel[1:0] // w1 [21:16] format[5:0] // w2 [7:0] stride (in dwords) // w2 [30:8] offset (signed, in dwords) @@ -641,9 +642,9 @@ fn interpret_vertex_fetch(t: u32) { let w0 = vs_instr_dword(t, 0u); let w1 = vs_instr_dword(t, 1u); let w2 = vs_instr_dword(t, 2u); - let fetch_const = (w0 >> 5u) & 0x1Fu; - let dst_reg = (w0 >> 10u) & 0x7Fu; - let src_reg = (w0 >> 17u) & 0x7Fu; + let fetch_const = (w0 >> 20u) & 0x1Fu; + let dst_reg = (w0 >> 12u) & 0x3Fu; + let src_reg = (w0 >> 5u) & 0x3Fu; let format = (w1 >> 16u) & 0x3Fu; let stride = w2 & 0xFFu; @@ -773,20 +774,20 @@ fn interpret_texture_fetch(t: u32, is_vertex: bool) { } else { w0 = ps_instr_dword(t, 0u); } - let dst_reg = (w0 >> 10u) & 0x7Fu; - let src_reg = (w0 >> 17u) & 0x7Fu; - let uv = registers[src_reg & 0x7Fu].xy; + let dst_reg = (w0 >> 12u) & 0x3Fu; + let src_reg = (w0 >> 5u) & 0x3Fu; + let uv = registers[src_reg & 0x3Fu].xy; let sample = textureSampleLevel(xenos_tex, xenos_samp, uv, 0.0); - registers[dst_reg & 0x7Fu] = sample; + registers[dst_reg & 0x3Fu] = sample; } // Walk an Exec clause's 
instruction triples. -// sequence: 2-bit-per-triple bitmap. Bit 0 of a pair = serialize flag -// (we ignore in MVP); bit 1 = is-fetch. +// sequence: 2-bit-per-instruction bitmap. Bit 0 of a pair = fetch(1)/ALU(0); +// bit 1 = serialize (ignored). (Xenos `ucode.h:226`.) fn exec_vs(address: u32, count: u32, sequence: u32) { for (var i: u32 = 0u; i < count; i = i + 1u) { let t = address + i; - let is_fetch = ((sequence >> (i * 2u + 1u)) & 1u) != 0u; + let is_fetch = ((sequence >> (i * 2u)) & 1u) != 0u; if is_fetch { let opcode = vs_instr_dword(t, 0u) & 0x1Fu; // 0x00 = vertex fetch, 0x01 = texture fetch. @@ -803,7 +804,7 @@ fn exec_vs(address: u32, count: u32, sequence: u32) { fn exec_ps(address: u32, count: u32, sequence: u32) { for (var i: u32 = 0u; i < count; i = i + 1u) { let t = address + i; - let is_fetch = ((sequence >> (i * 2u + 1u)) & 1u) != 0u; + let is_fetch = ((sequence >> (i * 2u)) & 1u) != 0u; if is_fetch { interpret_texture_fetch(t, false); } else { @@ -962,6 +963,9 @@ fn walk_cf_vs() { // No call stack — mark and continue. reject_mask |= REJECT_CF_CALL; } + case CF_KIND_NOP: { + // kNop padding / kMarkVsFetchDone hint — no-op, just advance. + } default: { reject_mask |= REJECT_CF_JUMP; } } if stop { break; } diff --git a/crates/xenia-gpu/src/translator.rs b/crates/xenia-gpu/src/translator.rs index 1655100..afcbc20 100644 --- a/crates/xenia-gpu/src/translator.rs +++ b/crates/xenia-gpu/src/translator.rs @@ -237,6 +237,10 @@ impl EmitCtx { current_alloc = *kind; } ControlFlowInstruction::Exit => break, + // Non-executing CF clauses: padding (`kNop`) and the + // vertex-fetch-done hint (`kMarkVsFetchDone`). Skip them. + ControlFlowInstruction::Nop + | ControlFlowInstruction::MarkVsFetchDone => {} ControlFlowInstruction::LoopStart { .. } | ControlFlowInstruction::LoopEnd { .. } => return Err(reject::CF_LOOP), ControlFlowInstruction::CondJmp { .. 
} => return Err(reject::CF_COND), @@ -284,7 +288,9 @@ impl EmitCtx { parsed.instructions[base + 1], parsed.instructions[base + 2], ]; - let is_fetch = ((sequence >> (i * 2 + 1)) & 1) != 0; + // sequence: 2 bits per instruction — bit[0]=fetch(1)/ALU(0), + // bit[1]=serialize (Xenos `ucode.h:226`). + let is_fetch = ((sequence >> (i * 2)) & 1) != 0; if is_fetch { match decode_fetch(words) { FetchInstruction::Vertex(vf) => self.emit_vfetch(&vf)?, diff --git a/crates/xenia-gpu/src/ucode/control_flow.rs b/crates/xenia-gpu/src/ucode/control_flow.rs index 941a49d..f8e60dd 100644 --- a/crates/xenia-gpu/src/ucode/control_flow.rs +++ b/crates/xenia-gpu/src/ucode/control_flow.rs @@ -43,7 +43,15 @@ pub enum ControlFlowInstruction { Return, /// `kAlloc` — pre-allocate export registers (position, interpolators, colors). Alloc { size: u32, kind: AllocKind }, - /// Exit the shader (terminal). + /// `kNop` — fills space in the CF block; executes nothing, does not end + /// the shader. (Xenos opcode 0.) + Nop, + /// `kMarkVsFetchDone` — hint that no more vertex fetches will be performed. + /// (Xenos opcode 15.) Non-terminating. + MarkVsFetchDone, + /// Exit the shader (terminal). Synthesized — Xenos has no dedicated exit + /// opcode; the shader ends after an `Exec`/`CondExec` clause with the + /// END bit set (`is_end`). Retained for callers/tests that reference it. Exit, /// Unknown / unhandled opcode. Unknown { opcode: u8 }, @@ -93,37 +101,45 @@ fn decode_single(payload: u64) -> ControlFlowInstruction { let predicated = ((payload >> 28) & 1) != 0; let predicate_condition = ((payload >> 29) & 1) != 0; + // Xenos `ControlFlowOpcode` (canary `ucode.h:86-160`): + // 0 kNop, 1 kExec, 2 kExecEnd, 3 kCondExec, 4 kCondExecEnd, + // 5 kCondExecPred, 6 kCondExecPredEnd, 7 kLoopStart, 8 kLoopEnd, + // 9 kCondCall, 10 kReturn, 11 kCondJmp, 12 kAlloc, + // 13 kCondExecPredClean, 14 kCondExecPredCleanEnd, 15 kMarkVsFetchDone. 
+ // All exec variants share the address(12)/count(3)/sequence(12) layout + // of `ControlFlowExecInstruction`; the `*End` variants terminate the + // shader. (Prior table was off-by-one — it mapped 0→Exec and 1→Exit, + // so a real `kExec` clause was misread as a terminal `Exit`, truncating + // the CF block and dropping every `tfetch` in it.) + let exec = |is_end: bool| ControlFlowInstruction::Exec { + address: (payload & 0xFFF) as u32, + count: ((payload >> 12) & 0x7) as u32, + sequence: ((payload >> 16) & 0xFFF) as u32, + is_end, + predicated, + predicate_condition, + }; match opcode { - 0 => ControlFlowInstruction::Exec { - address: (payload & 0xFFF) as u32, - count: ((payload >> 12) & 0x7) as u32, - sequence: ((payload >> 16) & 0xFFF) as u32, - is_end: false, - predicated, - predicate_condition, - }, - 1 => ControlFlowInstruction::Exit, - 2 => ControlFlowInstruction::Exec { - address: (payload & 0xFFF) as u32, - count: ((payload >> 12) & 0x7) as u32, - sequence: ((payload >> 16) & 0xFFF) as u32, - is_end: true, - predicated, - predicate_condition, - }, - 6 => ControlFlowInstruction::LoopStart { + 0 => ControlFlowInstruction::Nop, + 1 => exec(false), + 2 => exec(true), + 3 => exec(false), + 4 => exec(true), + 5 => exec(false), + 6 => exec(true), + 7 => ControlFlowInstruction::LoopStart { address: (payload & 0x3FF) as u32, loop_id: ((payload >> 16) & 0x1F) as u32, }, - 7 => ControlFlowInstruction::LoopEnd { + 8 => ControlFlowInstruction::LoopEnd { address: (payload & 0x3FF) as u32, loop_id: ((payload >> 16) & 0x1F) as u32, }, - 8 => ControlFlowInstruction::CondCall { + 9 => ControlFlowInstruction::CondCall { target: (payload & 0x3FF) as u32, }, - 9 => ControlFlowInstruction::Return, - 10 => ControlFlowInstruction::CondJmp { + 10 => ControlFlowInstruction::Return, + 11 => ControlFlowInstruction::CondJmp { target: (payload & 0x3FF) as u32, predicated, predicate_condition, @@ -132,6 +148,9 @@ fn decode_single(payload: u64) -> ControlFlowInstruction { size: 
(payload & 0x7) as u32, kind: AllocKind::from_bits(((payload >> 4) & 0x7) as u32), }, + 13 => exec(false), + 14 => exec(true), + 15 => ControlFlowInstruction::MarkVsFetchDone, other => ControlFlowInstruction::Unknown { opcode: other }, } } @@ -141,12 +160,49 @@ mod tests { use super::*; #[test] - fn opcode_exit_decodes() { - // opcode 1 (Exit) in bits 44..47 of A's 48-bit payload. + fn opcode_nop_and_exec_decode() { + // Xenos opcode 0 = kNop (non-terminating padding). + let payload: u64 = 0u64 << 44; + let (hi, lo) = ((payload & 0xFFFF_FFFF) as u32, ((payload >> 32) & 0xFFFF) as u32); + assert_eq!(decode_cf_pair(hi, lo, 0).0, ControlFlowInstruction::Nop); + // Xenos opcode 1 = kExec (executes instructions; NOT a terminal exit). let payload: u64 = 1u64 << 44; let (hi, lo) = ((payload & 0xFFFF_FFFF) as u32, ((payload >> 32) & 0xFFFF) as u32); - let cf = decode_cf_pair(hi, lo, 0).0; - assert_eq!(cf, ControlFlowInstruction::Exit); + match decode_cf_pair(hi, lo, 0).0 { + ControlFlowInstruction::Exec { is_end, .. } => assert!(!is_end), + other => panic!("opcode 1 should be non-end Exec, got {other:?}"), + } + // Xenos opcode 15 = kMarkVsFetchDone (non-terminating hint). + let payload: u64 = 15u64 << 44; + let (hi, lo) = ((payload & 0xFFFF_FFFF) as u32, ((payload >> 32) & 0xFFFF) as u32); + assert_eq!( + decode_cf_pair(hi, lo, 0).0, + ControlFlowInstruction::MarkVsFetchDone + ); + } + + #[test] + fn real_logo_shader_has_tfetch_clauses() { + // The publisher-logo pixel shader E59B2B3DA4AA9008 (captured from the + // canary oracle, byte-identical to the microcode our guest IM_LOADs). + // Regression for iterate-3M: the old off-by-one opcode table decoded + // its leading `kExec` (opcode 1) as a terminal `Exit`, truncating the + // CF block so the `tfetch2D` never appeared → flat splash. 
+ let ucode: [u32; 24] = [ + 0x00011002, 0x00001200, 0xC4000000, 0x00004003, 0x00002200, 0x00000000, + 0x10082021, 0x1F1FF688, 0x00004000, 0xC8080001, 0x001B1B00, 0xC1020000, + 0xC8070000, 0x00C0C000, 0xC1020000, 0xC8070001, 0x00C01B00, 0xC1000100, + 0xC80F8000, 0x00000000, 0xC2010100, 0x00000000, 0x00000000, 0x00000000, + ]; + let p = crate::ucode::parse_shader(&ucode); + let exec_clauses = p + .cf + .iter() + .filter(|c| matches!(c, ControlFlowInstruction::Exec { .. })) + .count(); + assert!(exec_clauses >= 1, "expected >=1 Exec clause, cf={:?}", p.cf); + let slots = crate::shader_metrics::tfetch_slots(&p); + assert!(!slots.is_empty(), "expected tfetch slots, got none; cf={:?}", p.cf); } #[test] diff --git a/crates/xenia-gpu/src/ucode/fetch.rs b/crates/xenia-gpu/src/ucode/fetch.rs index 85d1bba..47dbb74 100644 --- a/crates/xenia-gpu/src/ucode/fetch.rs +++ b/crates/xenia-gpu/src/ucode/fetch.rs @@ -54,23 +54,32 @@ pub mod op { } pub fn decode_fetch(words: [u32; 3]) -> FetchInstruction { + // Fetch dword0 bitfields (Xenos `ucode.h:740-749` vfetch / `844-845` + // tfetch): opcode_value:5, src_reg:6, src_reg_am:1, dst_reg:6, + // dst_reg_am:1, (fetch_valid_only|must_be_one):1, const_index:5 @ bit20, + // ... The prior decoder read `const_index` from bit 5 (which is actually + // `src_reg`), so every fetch reported the wrong fetch-constant slot — the + // logo `tfetch2D ..., tf0` was read as `tf1`, and slot 1's empty constant + // failed to decode → no texture. The texture-fetch `dimension` lives in + // dword2 bits 14..15, not dword1. 
let w0 = words[0]; let w1 = words[1]; + let w2 = words[2]; let opcode = (w0 & 0x1F) as u8; match opcode { op::VERTEX_FETCH => FetchInstruction::Vertex(VertexFetch { - fetch_const: ((w0 >> 5) & 0x1F) as u8, - src_register: ((w0 >> 17) & 0x7F) as u8, - dest_register: ((w0 >> 10) & 0x7F) as u8, - dest_write_mask: ((w1 >> 23) & 0xF) as u8, + fetch_const: ((w0 >> 20) & 0x1F) as u8, + src_register: ((w0 >> 5) & 0x3F) as u8, + dest_register: ((w0 >> 12) & 0x3F) as u8, + dest_write_mask: (w1 & 0xF) as u8, raw: words, }), op::TEXTURE_FETCH => FetchInstruction::Texture(TextureFetch { - fetch_const: ((w0 >> 5) & 0x1F) as u8, - src_register: ((w0 >> 17) & 0x7F) as u8, - dest_register: ((w0 >> 10) & 0x7F) as u8, - dest_write_mask: ((w1 >> 23) & 0xF) as u8, - dimension: ((w1 >> 29) & 0x3) as u8, + fetch_const: ((w0 >> 20) & 0x1F) as u8, + src_register: ((w0 >> 5) & 0x3F) as u8, + dest_register: ((w0 >> 12) & 0x3F) as u8, + dest_write_mask: (w1 & 0xF) as u8, + dimension: ((w2 >> 14) & 0x3) as u8, raw: words, }), _ => FetchInstruction::Unknown { opcode, raw: words }, @@ -83,8 +92,9 @@ mod tests { #[test] fn decode_vertex_fetch() { - // opcode=0 (vertex), fetch_const=5, src=2, dest=7. - let w0 = 0u32 | (5 << 5) | (7 << 10) | (2 << 17); + // opcode=0 (vertex). Xenos dword0: src_reg@bit5, dst_reg@bit12, + // const_index@bit20. fetch_const=5, src=2, dest=7. + let w0 = 0u32 | (2 << 5) | (7 << 12) | (5 << 20); let v = decode_fetch([w0, 0, 0]); match v { FetchInstruction::Vertex(vf) => { @@ -98,11 +108,16 @@ mod tests { #[test] fn decode_texture_fetch() { - let w0 = 1u32 | (3 << 5) | (4 << 10) | (1 << 17); - let t = decode_fetch([w0, (2u32 << 29), 0]); + // opcode=1 (texture). const_index@bit20=3, src@bit5=1, dst@bit12=4. + // dimension lives in dword2 bits 14..15. 
+ let w0 = 1u32 | (1 << 5) | (4 << 12) | (3 << 20); + let w2 = 2u32 << 14; + let t = decode_fetch([w0, 0, w2]); match t { FetchInstruction::Texture(tf) => { assert_eq!(tf.fetch_const, 3); + assert_eq!(tf.src_register, 1); + assert_eq!(tf.dest_register, 4); assert_eq!(tf.dimension, 2); } other => panic!("expected Texture, got {other:?}"), diff --git a/crates/xenia-gpu/src/ucode/mod.rs b/crates/xenia-gpu/src/ucode/mod.rs index 70d2349..008e731 100644 --- a/crates/xenia-gpu/src/ucode/mod.rs +++ b/crates/xenia-gpu/src/ucode/mod.rs @@ -48,6 +48,9 @@ pub mod cf_kind { pub const COND_JMP: u32 = 6; pub const COND_CALL: u32 = 7; pub const RETURN: u32 = 8; + /// Non-executing CF clause: `kNop` padding or `kMarkVsFetchDone` hint. + /// The WGSL CF walker treats this as a no-op (advance, do not reject). + pub const NOP: u32 = 9; pub const UNKNOWN: u32 = 15; } @@ -136,6 +139,7 @@ fn encode_cf(c: ControlFlowInstruction) -> (u32, u32, u32) { } CondCall { target } => (cf_kind::COND_CALL, target, 0), Return => (cf_kind::RETURN, 0, 0), + Nop | MarkVsFetchDone => (cf_kind::NOP, 0, 0), Unknown { opcode } => (cf_kind::UNKNOWN, opcode as u32, 0), } } @@ -164,9 +168,11 @@ pub struct ParsedShader { } /// Decode a shader blob. `raw_dwords` is a host-endian slice of the entire -/// microcode buffer (control flow + instructions). Heuristic: CF dword count -/// is encoded in the first word's low 12 bits of the last exec clause — -/// canary iterates until it hits a clause of kind `Exit`. We do the same. +/// microcode buffer (control flow + instructions). The CF block is implicitly +/// bounded: we walk clause-pair rows until one terminates the shader (an +/// `Exec`/`CondExec` clause with the END bit set, per Xenos). Everything after +/// that row is the instruction block; exec/loop addresses are then rebased to +/// be relative to it. 
pub fn parse_shader(raw_dwords: &[u32]) -> ParsedShader { let mut cf = Vec::new(); // CF clauses are 48-bit (word1 lo 16 + word0 = 48 or so per canary's @@ -175,22 +181,50 @@ pub fn parse_shader(raw_dwords: &[u32]) -> ParsedShader { while i + 2 < raw_dwords.len() { let a = decode_cf_pair(raw_dwords[i], raw_dwords[i + 1], raw_dwords[i + 2]); let (first, second) = a; - let seen_exit = matches!( - first, - ControlFlowInstruction::Exit | ControlFlowInstruction::Unknown { .. } - ) || matches!( - second, - ControlFlowInstruction::Exit | ControlFlowInstruction::Unknown { .. } - ); + // The CF block ends after the clause that terminates the shader: an + // `Exec` with the END bit set (Xenos `kExecEnd`/`kCondExec*End`), a + // synthetic `Exit`, or an `Unknown` opcode (decode ran off the CF + // block into instruction data — stop defensively). `Nop` padding + // does NOT terminate. (Previously this stopped on the first `Exit`, + // but with the corrected opcode table opcode 1 is `kExec`, not exit, + // so real exec clauses kept the parse going as intended.) + let terminates = |cf: &ControlFlowInstruction| { + matches!( + cf, + ControlFlowInstruction::Exec { is_end: true, .. } + | ControlFlowInstruction::Exit + | ControlFlowInstruction::Unknown { .. } + ) + }; + let seen_end = terminates(&first) || terminates(&second); cf.push(first); cf.push(second); i += 3; - if seen_exit { + if seen_end { break; } } // Everything after `i` dwords is the instruction block. let instructions = raw_dwords[i..].to_vec(); + // Xenos exec/loop `address` fields are absolute instruction-triple indices + // counted from shader dword 0, but `instructions` here begins *after* the + // CF block. Rebase those addresses to be relative to the instruction block + // (subtract the CF triple count) so `address * 3` indexes `instructions` + // directly. (Without this, every exec read 3 dwords too far per CF triple — + // the publisher-logo `tfetch` triple was skipped → flat splash.) 
+ let cf_triples = (i / 3) as u32; + for clause in cf.iter_mut() { + match clause { + ControlFlowInstruction::Exec { address, .. } => { + *address = address.saturating_sub(cf_triples); + } + ControlFlowInstruction::LoopStart { address, .. } + | ControlFlowInstruction::LoopEnd { address, .. } => { + *address = address.saturating_sub(cf_triples); + } + _ => {} + } + } ParsedShader { cf, instructions } } @@ -235,15 +269,19 @@ mod tests { } #[test] - fn trivial_exit_clause_stops_parsing() { - // Two clauses: [NOP (kind=0), EXIT (kind=1)] encoded per canary. - // Exit clause is opcode 1 in the top 4 bits of the upper 16 bits. - let w0 = 0u32; // clause A body - let w1 = (1u32 << 12) << 16; // upper 16 bits = 0x1000 → opcode=1 (EXIT) for clause A - let w2 = 0u32; - let p = parse_shader(&[w0, w1, w2, 0xDEAD_BEEF]); + fn exec_end_clause_stops_parsing() { + // Row: clause B = kExecEnd (opcode 2) terminates the CF block. + // 48-bit payload of B occupies hi16(word1) + word2; opcode lives in + // bits 44..47 of that payload. Put opcode 2 there: (2 << 44), which + // sets payload bit 45. In B's framing, bits 16..47 come from + // word2, so word2 bits 28..31 (payload bits 44..47) hold the opcode nibble. + let b_payload: u64 = 2u64 << 44; // kExecEnd + // B = lo16 from hi16(word1), hi from word2. Reconstruct word1/word2. + let word1 = ((b_payload & 0xFFFF) as u32) << 16; // B's low 16 bits → hi16(word1) + let word2 = ((b_payload >> 16) & 0xFFFF_FFFF) as u32; + let p = parse_shader(&[0, word1, word2, 0xDEAD_BEEF]); assert!(!p.cf.is_empty()); - // Exit detected → remaining dword is instruction data. + // ExecEnd detected in the first row → remaining dword is instruction data. assert_eq!(p.instructions, vec![0xDEAD_BEEF]); } }