Files
xenia-rs/crates/xenia-gpu/src/ucode/control_flow.rs
MechaCat02 6ff184694d [iterate-3P] Real splash geometry in --ui: fix CF predication decode + translator op coverage
Stage 1 of the iterate-3O resume plan: make the P7 translator actually
compile the splash's real VS/PS so real per-vertex POSITIONS render via the
host wgpu pipeline, instead of every draw falling to the interpreter (which
emits a placeholder triangle). Two coupled fixes, both faithful (Route A):

1. ucode/control_flow.rs (GPUBUG-103): clause-level predication was decoded
   from payload bits 28/29, which fall inside the exec clause's `sequence_`/
   `vc_hi_` fields, NOT the predicate flag. That stamped `predicated=true`
   on plain `kExec` clauses, so the translator rejected EVERY splash VS as
   `cf_cond`. Per canary ucode.h, clause predication is determined by the
   *opcode* (only kCondExecPred* = 5/6/13/14 are predicate-register-gated;
   their `condition_` is at word1 bit 9 = payload bit 41). kExec/kExecEnd
   (1/2) run unconditionally; kCondExec (3/4) is bool-constant-gated (not
   modeled). Diagnosed live in --ui: reject reason cf_cond on all 7 splash
   shader pairs → after fix, predicated=false and CF passes.

2. translator.rs: with CF passing, the next reject was `scl_op_unsupported`
   for scalar opcodes 4 (kMulsPrev2 / LIT emul) and 8 (kSgts), plus thin
   vector coverage. Expanded vector_expr + scalar_expr to mirror the runtime
   interpreter's op set (which mirrors canary AluVectorOpcode/AluScalarOpcode):
   CND_EQ/GE/GT, TRUNC, MAX4, DST for vectors; the full SEQS/SGTS/SGES/SNES,
   MULS_PREV2 (with the -FLT_MAX / non-finite / b<=0 guard), SUBS(_PREV),
   EXP/LOG/RCP/RSQ/SQRT/SIN/COS, FRCS/TRUNCS/FLOORS for scalars. Side-effecting
   ops (setp*/kills*/maxas*) still reject → interpreter fallback (honest).

Result (--ui, measured): xlated-pipelines 0→6, all draws served by the
translator (served_interp=0) — real VS/PS now run on the host GPU. The
splash is still not visibly correct because the captured guest vertex
windows read all-zero: the vertex-buffer base VA (~0x0adf_xxxx) is UNMAPPED
in guest memory (mem.translate()==None). That is a CPU/kernel memory-mapping
gap, not a GPU-render gap — the next stage.

Determinism: both files are in xenia-gpu core but the CF `predicated` field
only feeds the UI translator + a metric tag, never deterministic state.
Verified: `check -n50000000 --gpu-inline --stable-digest` matches the golden
byte-for-byte (exit 0); 679 tests green.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-18 15:07:06 +02:00

246 lines
10 KiB
Rust

//! Xenos control-flow clause decoder.
//!
//! A shader's CF block is a sequence of 48-bit clauses packed two-per-
//! three-dword row. Each clause encodes an opcode and type-specific fields
//! (exec addr/count, loop start/end, branch target, etc.).
//!
//! Spec at `xenia-canary/src/xenia/gpu/ucode.h:87-256`. We cover the subset
//! the uber-shader needs: `Exec*`, `Loop*`, `Alloc`, `Jmp`, `Call/Ret`,
//! `Exit`. Unknown opcodes are classified as `Unknown { opcode }` so the
//! translator can log + degrade.
/// One decoded control-flow (CF) clause.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ControlFlowInstruction {
    /// Exec-family clause (`kExec`, `kCondExec`, `kCondExecPred` and their
    /// `End` / `Clean` variants): run a range of ALU/fetch instructions.
    Exec {
        /// Where the clause's instructions begin inside the instruction
        /// block, counted in **triples** (one instruction = 3 dwords).
        address: u32,
        /// How many triples the clause executes.
        count: u32,
        /// ALU-vs-fetch sequence bitmap, 2 bits per instruction.
        sequence: u32,
        /// Set when the shader terminates after this clause.
        is_end: bool,
        /// Set when the clause is predicated; a predicated clause is skipped
        /// when the predicate differs from `predicate_condition`.
        predicated: bool,
        /// Predicate value a predicated clause requires in order to run.
        predicate_condition: bool,
    },
    /// `kLoopStart` — opens an `aL` loop that references a loop constant.
    LoopStart { address: u32, loop_id: u32 },
    /// `kLoopEnd` — closes the loop; `address` refers back to the matching
    /// start.
    LoopEnd { address: u32, loop_id: u32 },
    /// `kCondJmp` — conditional jump to another CF index.
    CondJmp {
        target: u32,
        predicated: bool,
        predicate_condition: bool,
    },
    /// `kCondCall` — call into another CF subroutine.
    CondCall { target: u32 },
    /// `kReturn` — return from a subroutine.
    Return,
    /// `kAlloc` — pre-allocate export registers (position, interpolators,
    /// colors).
    Alloc { size: u32, kind: AllocKind },
    /// `kNop` (Xenos opcode 0) — padding inside the CF block. Executes
    /// nothing and does not end the shader.
    Nop,
    /// `kMarkVsFetchDone` (Xenos opcode 15) — hint that no further vertex
    /// fetches will be performed. Non-terminating.
    MarkVsFetchDone,
    /// Terminal exit. Synthesized: Xenos has no dedicated exit opcode, the
    /// shader ends after an `Exec`/`CondExec` clause whose END bit is set
    /// (`is_end`). Kept for callers/tests that still reference it.
    Exit,
    /// Opcode without a dedicated variant.
    Unknown { opcode: u8 },
}
/// Which export target a `kAlloc` clause reserves.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum AllocKind {
    /// Position export.
    Position,
    /// Interpolator exports.
    Interpolators,
    /// Color exports.
    Colors,
    /// Memory export.
    Memexport,
    /// Any selector value that has no dedicated variant.
    Other,
}
impl AllocKind {
    /// Translate a raw selector into an [`AllocKind`].
    ///
    /// Only the low three bits of `b` are inspected. Values 0..=3 select the
    /// named targets in declaration order; 4..=7 collapse to
    /// [`AllocKind::Other`].
    fn from_bits(b: u32) -> Self {
        // Index == masked selector value.
        const NAMED: [AllocKind; 4] = [
            AllocKind::Position,
            AllocKind::Interpolators,
            AllocKind::Colors,
            AllocKind::Memexport,
        ];
        let selector = (b & 0x7) as usize;
        NAMED.get(selector).copied().unwrap_or(AllocKind::Other)
    }
}
/// Split one CF row (three consecutive dwords) into its two clauses and
/// decode both.
///
/// Packing per canary (`ucode.h:218-256`):
/// - clause A = `word0` followed by the low 16 bits of `word1`
/// - clause B = the high 16 bits of `word1` followed by `word2`
///
/// Each clause is handed to the single-clause decoder as a 48-bit value in
/// the low bits of a `u64`; its opcode is the top nibble (bits 44..47).
///
/// Returns `(clause A, clause B)`.
pub fn decode_cf_pair(
    word0: u32,
    word1: u32,
    word2: u32,
) -> (ControlFlowInstruction, ControlFlowInstruction) {
    let (w0, w1, w2) = (u64::from(word0), u64::from(word1), u64::from(word2));

    // Clause A: word0 in bits 0..31, lo16(word1) in bits 32..47.
    let first = w0 | ((w1 & 0xFFFF) << 32);
    // Clause B: hi16(word1) in bits 0..15, word2 in bits 16..47.
    let second = (w1 >> 16) | (w2 << 16);

    (decode_single(first), decode_single(second))
}
/// Decode a single 48-bit CF clause held in the low 48 bits of `payload`.
///
/// Payload layout: word0 occupies bits 0..31 and the 16 bits of word1 occupy
/// bits 32..47, so "word1 bit N" is payload bit `32 + N`. The opcode is the
/// top nibble (bits 44..47).
///
/// Predication is derived from the clause *type*, never from free bits
/// (GPUBUG-103: bits 28/29 belong to `sequence_`/`vc_hi_`; reading them as a
/// predicate flag stamped `predicated=true` on plain `kExec` clauses):
/// * `kExec`/`kExecEnd` (1/2) run unconditionally.
/// * `kCondExec`/`kCondExecEnd` (3/4) are gated by a *bool constant*, which
///   is not modelled here; they decode as unpredicated.
/// * `kCondExecPred*` (5/6/13/14) are gated by the predicate register and run
///   when it equals `condition_`.
/// * `kCondJmp` (11) is predicate-gated only when `is_predicated_` is set and
///   `is_unconditional_` is clear; otherwise it is unconditional or
///   bool-constant-gated and decodes as unpredicated.
///
/// Per canary `ucode.h` (and the matching freedreno a2xx `instr_cf_*`
/// layouts) `condition_` is word1 bit 10 = payload bit 42 in every
/// conditional clause. Word1 bit 9 (payload bit 41) is `is_predicate_clean_`
/// on the predicated exec clauses and must not be read as the condition.
///
/// `predicate_condition` is reported as `false` whenever `predicated` is
/// `false`.
fn decode_single(payload: u64) -> ControlFlowInstruction {
    // `condition_`: word1 bit 10.
    const CONDITION_BIT: u32 = 42;
    // kCondJmp `is_unconditional_`: word0 bit 13 (ignores predicate/bool).
    const JMP_UNCONDITIONAL_BIT: u32 = 13;
    // kCondJmp `is_predicated_`: word0 bit 14.
    const JMP_PREDICATED_BIT: u32 = 14;

    let bit = |n: u32| (payload >> n) & 1 != 0;

    // Top 4 bits of the 48-bit payload.
    let opcode = ((payload >> 44) & 0xF) as u8;
    let condition = bit(CONDITION_BIT);

    // All exec variants share the address(12)/count(3)/sequence(12) layout of
    // `ControlFlowExecInstruction`; only END-ness and predicate gating differ.
    let exec = |is_end: bool, predicated: bool| ControlFlowInstruction::Exec {
        address: (payload & 0xFFF) as u32,
        count: ((payload >> 12) & 0x7) as u32,
        sequence: ((payload >> 16) & 0xFFF) as u32,
        is_end,
        predicated,
        predicate_condition: predicated && condition,
    };

    // Fields shared by the loop / call / jump clause layouts.
    let cf_address = (payload & 0x3FF) as u32;
    let loop_id = ((payload >> 16) & 0x1F) as u32;

    // Xenos `ControlFlowOpcode` (canary `ucode.h:86-160`):
    //   0 kNop, 1 kExec, 2 kExecEnd, 3 kCondExec, 4 kCondExecEnd,
    //   5 kCondExecPred, 6 kCondExecPredEnd, 7 kLoopStart, 8 kLoopEnd,
    //   9 kCondCall, 10 kReturn, 11 kCondJmp, 12 kAlloc,
    //   13 kCondExecPredClean, 14 kCondExecPredCleanEnd, 15 kMarkVsFetchDone.
    // Opcode 1 is a real, non-terminating exec clause; misreading it as a
    // terminal exit truncates the CF block and drops every `tfetch` in it.
    match opcode {
        0 => ControlFlowInstruction::Nop,
        // Plain and bool-constant-gated exec: not predicate-gated.
        1 | 3 => exec(false, false),
        2 | 4 => exec(true, false),
        // Predicate-register-gated exec (regular and "clean" variants).
        5 | 13 => exec(false, true),
        6 | 14 => exec(true, true),
        7 => ControlFlowInstruction::LoopStart {
            address: cf_address,
            loop_id,
        },
        8 => ControlFlowInstruction::LoopEnd {
            address: cf_address,
            loop_id,
        },
        9 => ControlFlowInstruction::CondCall { target: cf_address },
        10 => ControlFlowInstruction::Return,
        11 => {
            // The jump carries its own predication flag; it must not inherit
            // the exec-opcode rule, which is always false for opcode 11 and
            // would silently turn predicated jumps into unpredicated ones.
            let predicated = bit(JMP_PREDICATED_BIT) && !bit(JMP_UNCONDITIONAL_BIT);
            ControlFlowInstruction::CondJmp {
                target: cf_address,
                predicated,
                predicate_condition: predicated && condition,
            }
        }
        12 => ControlFlowInstruction::Alloc {
            size: (payload & 0x7) as u32,
            // NOTE(review): canary places `alloc_type_` at word1 bits 9..10
            // (payload bits 41..42) with 0 = none, 1 = position,
            // 2 = interpolators/colors, 3 = memory. The logo-shader alloc
            // clause in this file's tests (0xC400_0000_0000) has bits 4..6
            // clear and bit 42 set, so this read looks wrong. Left unchanged
            // because the correct mapping depends on the shader type and on
            // `AllocKind::from_bits` — confirm against consumers first.
            kind: AllocKind::from_bits(((payload >> 4) & 0x7) as u32),
        },
        15 => ControlFlowInstruction::MarkVsFetchDone,
        // Not reachable for a 4-bit opcode; present because `u8` patterns
        // must be exhaustive.
        other => ControlFlowInstruction::Unknown { opcode: other },
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Decode `payload` (a 48-bit clause) as clause A of a row whose clause B
    /// is all zero.
    ///
    /// Clause A is `word0` plus the low 16 bits of `word1`, so the payload's
    /// low 32 bits become `word0` and bits 32..47 become `word1`.
    fn decode_a(payload: u64) -> ControlFlowInstruction {
        let word0 = (payload & 0xFFFF_FFFF) as u32;
        let word1_lo16 = ((payload >> 32) & 0xFFFF) as u32;
        decode_cf_pair(word0, word1_lo16, 0).0
    }

    #[test]
    fn opcode_nop_and_exec_decode() {
        // Xenos opcode 0 = kNop (non-terminating padding).
        assert_eq!(decode_a(0), ControlFlowInstruction::Nop);

        // Xenos opcode 1 = kExec (executes instructions; NOT a terminal exit).
        match decode_a(1u64 << 44) {
            ControlFlowInstruction::Exec { is_end, .. } => assert!(!is_end),
            other => panic!("opcode 1 should be non-end Exec, got {other:?}"),
        }

        // Xenos opcode 15 = kMarkVsFetchDone (non-terminating hint).
        assert_eq!(
            decode_a(15u64 << 44),
            ControlFlowInstruction::MarkVsFetchDone
        );
    }

    #[test]
    fn real_logo_shader_has_tfetch_clauses() {
        // The publisher-logo pixel shader E59B2B3DA4AA9008 (captured from the
        // canary oracle, byte-identical to the microcode our guest IM_LOADs).
        // Regression for iterate-3M: the old off-by-one opcode table decoded
        // its leading `kExec` (opcode 1) as a terminal `Exit`, truncating the
        // CF block so the `tfetch2D` never appeared → flat splash.
        let ucode: [u32; 24] = [
            0x00011002, 0x00001200, 0xC4000000, 0x00004003, 0x00002200, 0x00000000,
            0x10082021, 0x1F1FF688, 0x00004000, 0xC8080001, 0x001B1B00, 0xC1020000,
            0xC8070000, 0x00C0C000, 0xC1020000, 0xC8070001, 0x00C01B00, 0xC1000100,
            0xC80F8000, 0x00000000, 0xC2010100, 0x00000000, 0x00000000, 0x00000000,
        ];
        let p = crate::ucode::parse_shader(&ucode);
        let exec_clauses = p
            .cf
            .iter()
            .filter(|c| matches!(c, ControlFlowInstruction::Exec { .. }))
            .count();
        assert!(exec_clauses >= 1, "expected >=1 Exec clause, cf={:?}", p.cf);
        let slots = crate::shader_metrics::tfetch_slots(&p);
        assert!(!slots.is_empty(), "expected tfetch slots, got none; cf={:?}", p.cf);
    }

    #[test]
    fn opcode_exec_end_carries_address_count() {
        // opcode 2 (ExecEnd), address=4, count=2, sequence=0.
        match decode_a((2u64 << 44) | (2u64 << 12) | 4) {
            ControlFlowInstruction::Exec {
                address,
                count,
                is_end,
                ..
            } => {
                assert_eq!(address, 4);
                assert_eq!(count, 2);
                assert!(is_end);
            }
            other => panic!("expected Exec, got {other:?}"),
        }
    }

    #[test]
    fn plain_exec_is_not_predicated_by_vc_bits() {
        // GPUBUG-103 regression: payload bits 28/29 are not a predicate flag,
        // so setting them on a plain kExec/kExecEnd must not mark the clause
        // as predicated.
        for opcode in [1u64, 2] {
            match decode_a((opcode << 44) | (0b11u64 << 28)) {
                ControlFlowInstruction::Exec { predicated, .. } => {
                    assert!(!predicated, "opcode {opcode} must not be predicated");
                }
                other => panic!("expected Exec, got {other:?}"),
            }
        }
    }

    #[test]
    fn pred_exec_opcodes_are_predicated() {
        // kCondExecPred / ...End / ...Clean / ...CleanEnd.
        for (opcode, expect_end) in [(5u64, false), (6, true), (13, false), (14, true)] {
            match decode_a(opcode << 44) {
                ControlFlowInstruction::Exec {
                    is_end, predicated, ..
                } => {
                    assert!(predicated, "opcode {opcode} must be predicate-gated");
                    assert_eq!(is_end, expect_end, "opcode {opcode} END flag");
                }
                other => panic!("expected Exec for opcode {opcode}, got {other:?}"),
            }
        }
    }

    #[test]
    fn second_clause_comes_from_upper_half_of_row() {
        // Clause B = hi16(word1) followed by word2, so its opcode nibble is
        // the top nibble of word2 and its low 16 bits come from hi16(word1).
        let (a, b) = decode_cf_pair(0, 0, 15u32 << 28);
        assert_eq!(a, ControlFlowInstruction::Nop);
        assert_eq!(b, ControlFlowInstruction::MarkVsFetchDone);

        match decode_cf_pair(0, 0x0004_0000, 2u32 << 28).1 {
            ControlFlowInstruction::Exec {
                address, is_end, ..
            } => {
                assert_eq!(address, 4);
                assert!(is_end);
            }
            other => panic!("expected Exec in clause B, got {other:?}"),
        }
    }
}