From 6ff184694d853c5457996196f226a80222013a97 Mon Sep 17 00:00:00 2001 From: MechaCat02 Date: Thu, 18 Jun 2026 15:07:06 +0200 Subject: [PATCH] [iterate-3P] Real splash geometry in --ui: fix CF predication decode + translator op coverage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Stage 1 of the iterate-3O resume plan: make the P7 translator actually compile the splash's real VS/PS so real per-vertex POSITIONS render via the host wgpu pipeline, instead of every draw falling to the interpreter (which emits a placeholder triangle). Two coupled fixes, both faithful (Route A): 1. ucode/control_flow.rs (GPUBUG-103): clause-level predication was decoded from payload bits 28/29, which fall inside the exec clause's `sequence_`/ `vc_hi_` fields, NOT the predicate flag. That stamped `predicated=true` on plain `kExec` clauses, so the translator rejected EVERY splash VS as `cf_cond`. Per canary ucode.h, clause predication is determined by the *opcode* (only kCondExecPred* = 5/6/13/14 are predicate-register-gated; their `condition_` is at word1 bit 9 = payload bit 41). kExec/kExecEnd (1/2) run unconditionally; kCondExec (3/4) is bool-constant-gated (not modeled). Diagnosed live in --ui: reject reason cf_cond on all 7 splash shader pairs → after fix, predicated=false and CF passes. 2. translator.rs: with CF passing, the next reject was `scl_op_unsupported` for scalar opcodes 4 (kMulsPrev2 / LIT emul) and 8 (kSgts), plus thin vector coverage. Expanded vector_expr + scalar_expr to mirror the runtime interpreter's op set (which mirrors canary AluVectorOpcode/AluScalarOpcode): CND_EQ/GE/GT, TRUNC, MAX4, DST for vectors; the full SEQS/SGTS/SGES/SNES, MULS_PREV2 (with the -FLT_MAX / non-finite / b<=0 guard), SUBS(_PREV), EXP/LOG/RCP/RSQ/SQRT/SIN/COS, FRCS/TRUNCS/FLOORS for scalars. Side-effecting ops (setp*/kills*/maxas*) still reject → interpreter fallback (honest). 
Result (--ui, measured): xlated-pipelines 0→6, all draws served by the translator (served_interp=0) — real VS/PS now run on the host GPU. The splash is still not visibly correct because the captured guest vertex windows read all-zero: the vertex-buffer base VA (~0x0adf_xxxx) is UNMAPPED in guest memory (mem.translate()==None). That is a CPU/kernel memory-mapping gap, not a GPU-render gap — the next stage. Determinism: both files are in xenia-gpu core but the CF `predicated` field only feeds the UI translator + a metric tag, never deterministic state. Verified: `check -n50000000 --gpu-inline --stable-digest` matches the golden byte-for-byte (exit 0); 679 tests green. Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/xenia-gpu/src/translator.rs | 74 +++++++++++++++++----- crates/xenia-gpu/src/ucode/control_flow.rs | 24 +++++-- 2 files changed, 78 insertions(+), 20 deletions(-) diff --git a/crates/xenia-gpu/src/translator.rs b/crates/xenia-gpu/src/translator.rs index 843c219..1bb9902 100644 --- a/crates/xenia-gpu/src/translator.rs +++ b/crates/xenia-gpu/src/translator.rs @@ -485,6 +485,22 @@ fn src_operand(src_byte: u8, is_temp: bool, swizzle: u8, negate: bool) -> String } fn vector_expr(op: u8, a: &str, b: &str, c: &str) -> Option { + // Semantics mirror the runtime interpreter's `exec_vector_op` + // (`shaders/xenos_interp.wgsl`), which in turn mirrors canary's + // `AluVectorOpcode` (ucode.h:1001+). Side-effecting ops (kill*, setp_push) + // need per-invocation state the AOT emitter doesn't track yet → still + // `None` (interpreter fallback). + let cmp4 = |op: &str| { + format!( + "vec4(select(0.0,1.0,{a}.x{op}{b}.x), select(0.0,1.0,{a}.y{op}{b}.y), select(0.0,1.0,{a}.z{op}{b}.z), select(0.0,1.0,{a}.w{op}{b}.w))" + ) + }; + // CND* : per-lane select(c, b, a <op> 0). 
+ let cnd4 = |op: &str| { + format!( + "vec4(select({c}.x,{b}.x,{a}.x{op}0.0), select({c}.y,{b}.y,{a}.y{op}0.0), select({c}.z,{b}.z,{a}.z{op}0.0), select({c}.w,{b}.w,{a}.w{op}0.0))" + ) + }; let s = match op { vop::ADD => format!("({a} + {b})"), vop::MUL => format!("({a} * {b})"), @@ -493,37 +509,63 @@ fn vector_expr(op: u8, a: &str, b: &str, c: &str) -> Option { vop::MAD => format!("({a} * {b} + {c})"), vop::DOT4 => format!("vec4(dot({a}, {b}))"), vop::DOT3 => format!("vec4(dot({a}.xyz, {b}.xyz))"), - vop::DOT2_ADD => format!( - "vec4({a}.x * {b}.x + {a}.y * {b}.y + {c}.x)" - ), - vop::SEQ => format!( - "vec4(select(0.0,1.0,{a}.x=={b}.x), select(0.0,1.0,{a}.y=={b}.y), select(0.0,1.0,{a}.z=={b}.z), select(0.0,1.0,{a}.w=={b}.w))" - ), - vop::SGT => format!( - "vec4(select(0.0,1.0,{a}.x>{b}.x), select(0.0,1.0,{a}.y>{b}.y), select(0.0,1.0,{a}.z>{b}.z), select(0.0,1.0,{a}.w>{b}.w))" - ), - vop::SGE => format!( - "vec4(select(0.0,1.0,{a}.x>={b}.x), select(0.0,1.0,{a}.y>={b}.y), select(0.0,1.0,{a}.z>={b}.z), select(0.0,1.0,{a}.w>={b}.w))" - ), - vop::SNE => format!( - "vec4(select(0.0,1.0,{a}.x!={b}.x), select(0.0,1.0,{a}.y!={b}.y), select(0.0,1.0,{a}.z!={b}.z), select(0.0,1.0,{a}.w!={b}.w))" - ), + vop::DOT2_ADD => format!("vec4({a}.x * {b}.x + {a}.y * {b}.y + {c}.x)"), + vop::SEQ => cmp4("=="), + vop::SGT => cmp4(">"), + vop::SGE => cmp4(">="), + vop::SNE => cmp4("!="), + vop::CND_EQ => cnd4("=="), + vop::CND_GE => cnd4(">="), + vop::CND_GT => cnd4(">"), vop::FRC => format!("fract({a})"), + vop::TRUNC => format!("trunc({a})"), vop::FLOOR => format!("floor({a})"), + vop::MAX4 => format!("vec4(max(max({a}.x,{a}.y), max({a}.z,{a}.w)))"), + // dst = (1, src0.y*src1.y, src0.z, src1.w) (canary kDst) + vop::DST => format!("vec4(1.0, {a}.y * {b}.y, {a}.z, {b}.w)"), _ => return None, }; Some(s) } fn scalar_expr(op: u8, a: &str, b: &str, prev: &str) -> Option { + // Semantics mirror the runtime interpreter's `exec_scalar_op` + // (`shaders/xenos_interp.wgsl`) / canary's 
`AluScalarOpcode` + // (ucode.h:1001+). Side-effecting ops (setp*, kills*, maxas*) need + // per-invocation predicate/kill/address state the AOT emitter doesn't + // track yet → still `None` (interpreter fallback). let s = match op { sop::ADDS => format!("({a} + {b})"), sop::ADDS_PREV => format!("({a} + {prev})"), sop::MULS => format!("({a} * {b})"), sop::MULS_PREV => format!("({a} * {prev})"), + // muls_prev2 / LIT emulation (canary kMulsPrev2): guard against + // -FLT_MAX / non-finite ps & b, and b <= 0. + sop::MULS_PREV2 => format!( + "select({a} * {prev}, -3.4028235e38, {prev} == -3.4028235e38 || !(\ + {prev} == {prev}) || abs({prev}) > 3.4028235e38 || !({b} == {b}) || \ + abs({b}) > 3.4028235e38 || {b} <= 0.0)" + ), sop::MAXS => format!("max({a}, {b})"), sop::MINS => format!("min({a}, {b})"), - sop::RCP => format!("xe_rcp({a})"), + sop::SEQS => format!("select(0.0, 1.0, {a} == 0.0)"), + sop::SGTS => format!("select(0.0, 1.0, {a} > 0.0)"), + sop::SGES => format!("select(0.0, 1.0, {a} >= 0.0)"), + sop::SNES => format!("select(0.0, 1.0, {a} != 0.0)"), + sop::FRCS => format!("fract({a})"), + sop::TRUNCS => format!("trunc({a})"), + sop::FLOORS => format!("floor({a})"), + sop::SUBS => format!("({a} - {b})"), + sop::SUBS_PREV => format!("({a} - {prev})"), + sop::EXP => format!("exp2({a})"), + sop::LOG | sop::LOGC => format!("select(log2({a}), 0.0, {a} == 1.0)"), + sop::RCP | sop::RCPC | sop::RCPF => format!("xe_rcp({a})"), + sop::RSQ | sop::RSQC | sop::RSQF => { + format!("select(0.0, inverseSqrt({a}), {a} > 0.0)") + } + sop::SQRT => format!("select(0.0, sqrt({a}), {a} >= 0.0)"), + sop::SIN => format!("sin({a})"), + sop::COS => format!("cos({a})"), sop::RETAIN_PREV => prev.to_string(), _ => return None, }; diff --git a/crates/xenia-gpu/src/ucode/control_flow.rs b/crates/xenia-gpu/src/ucode/control_flow.rs index f8e60dd..966609a 100644 --- a/crates/xenia-gpu/src/ucode/control_flow.rs +++ b/crates/xenia-gpu/src/ucode/control_flow.rs @@ -96,10 +96,26 @@ pub fn 
decode_cf_pair(word0: u32, word1: u32, word2: u32) -> (ControlFlowInstruc fn decode_single(payload: u64) -> ControlFlowInstruction { // Top 4 bits of the 48-bit payload. let opcode = ((payload >> 44) & 0xF) as u8; - // Predicate bit + condition live at the 28..30 range for exec/jmp. Rough - // extraction — good enough for the interpreter, which logs unknowns. - let predicated = ((payload >> 28) & 1) != 0; - let predicate_condition = ((payload >> 29) & 1) != 0; + + // GPUBUG-103 (iterate-3P): clause-level predication is determined by the + // *opcode*, not by free bits. The 48-bit CF payload is word0 = bits 0..31, + // word1 = bits 32..47. Per canary `ucode.h`: + // * `ControlFlowExecInstruction` (kExec/kExecEnd, opcodes 1/2): NOT + // predicate-gated — it runs unconditionally. + // * `ControlFlowCondExecInstruction` (kCondExec/kCondExecEnd, 3/4): gated + // by a *bool constant*, `condition_` at word1 bit 10 = payload bit 42. + // We don't model bool-constant gating in the WGSL paths (the bool is + // virtually always set for these), so treat as unconditional. + // * `ControlFlowCondExecPredInstruction` (kCondExecPred/...End/Clean..., + // 5/6/13/14): gated by the *predicate register*; `condition_` at word1 + // bit 9 = payload bit 41. + // The prior code read bits 28/29 (which fall inside `sequence_`/`vc_hi_`) + // and stamped `predicated=true` on plenty of plain `kExec` clauses — which + // made the P7 translator reject EVERY splash VS as `cf_cond`, forcing the + // interpreter (placeholder geometry) for all draws. + let is_pred_gated = matches!(opcode, 5 | 6 | 13 | 14); + let predicated = is_pred_gated; + let predicate_condition = is_pred_gated && ((payload >> 41) & 1) != 0; // Xenos `ControlFlowOpcode` (canary `ucode.h:86-160`): // 0 kNop, 1 kExec, 2 kExecEnd, 3 kCondExec, 4 kCondExecEnd,