fix(gpu): GPUBUG-100 — apply per-operand swizzle + negate to ALU sources
Word-1 of every ALU triple holds three 8-bit component-relative swizzles (`src1_swiz`/`src2_swiz`/`src3_swiz` at bits 16-23/8-15/0-7 per canary ucode.h:2064-2066) and three per-operand negate flags (bits 24/25/26; in the decode, bit 24 pairs with the swizzle at bits 0-7, bit 25 with bits 8-15, and bit 26 with bits 16-23). Pre-fix, both the WGSL interpreter and the AOT translator discarded word-1 entirely with `_ = w1;` — every ALU result was missing its swizzle (broadcast/permute patterns like `.zyxw`, `.xxxx`) and any negated operand was used positive instead.

Component-relative semantics (canary's `AluInstruction::GetSwizzledComponentIndex`, ucode.h:1996): for output component i, the source component is `((swizzle >> (2*i)) + i) & 3`. Identity swizzle is 0x00, NOT 0xE4 — the original `apply_swizzle` in the interpreter shader treated the swizzle as absolute, which was also incorrect.

Mechanical changes:
- crates/xenia-gpu/src/ucode/alu.rs: extend AluInstruction with src_X_swiz (u8) and src_X_negate (bool) fields. decode_alu unpacks them from word 1.
- crates/xenia-gpu/src/shaders/xenos_interp.wgsl: apply_swizzle uses component-relative semantics. interpret_alu decodes the modifiers and applies via apply_swizzle + apply_modifiers (with abs=false).
- crates/xenia-gpu/src/translator.rs: src_operand emits the precomputed swizzle inline as `vec4<f32>(base.x, base.y, ...)`, then wraps in `(-…)` when negated. Identity swizzle (0x00) emits a bare base expression so it round-trips with the trivial-shader fixture.

Abs is omitted in this commit — the abs flag is dual-meaning (for temps it lives at bit 7 of the src byte; for constants at word-2 bit 7 `abs_constants`). Wiring it up correctly requires more careful case-split logic; deferred to Phase G.

Verification at -n 100M lockstep:
  swaps: 2 → 2 (gated by Phase E for draws)
  draws: 0 → 0
  packets: ~58M (within noise)

Tests: 554 → 555 (+1 swizzle/negate test, no count change otherwise because identity swizzle test merged into D1's parameterised test). WGSL still validates via naga (combined_module_parses_as_wgsl).

Closes GPUBUG-100 (P0).
Abs deferred to Phase G.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -129,11 +129,15 @@ fn ps_instr_dword(t: u32, which: u32) -> u32 {
|
||||
// ── Operand helpers: M3 scaffold; M4 wires them into interpret_alu.
|
||||
|
||||
fn apply_swizzle(v: vec4<f32>, swizzle: u32) -> vec4<f32> {
|
||||
// Swizzle is 8 bits: 2 bits per output lane, xyzw order.
|
||||
let sx = (swizzle >> 0u) & 3u;
|
||||
let sy = (swizzle >> 2u) & 3u;
|
||||
let sz = (swizzle >> 4u) & 3u;
|
||||
let sw = (swizzle >> 6u) & 3u;
|
||||
// Per canary `xenia-canary/src/xenia/gpu/ucode.h:1996`
|
||||
// (`AluInstruction::GetSwizzledComponentIndex`): the 8-bit swizzle is
|
||||
// **component-relative**, not absolute. For output component i, the
|
||||
// selected source component is `((swizzle >> (2*i)) + i) & 3`. Identity
|
||||
// swizzle is therefore 0x00 (all bits zero), NOT 0xE4. GPUBUG-100.
|
||||
let sx = ((swizzle >> 0u) + 0u) & 3u;
|
||||
let sy = ((swizzle >> 2u) + 1u) & 3u;
|
||||
let sz = ((swizzle >> 4u) + 2u) & 3u;
|
||||
let sw = ((swizzle >> 6u) + 3u) & 3u;
|
||||
return vec4<f32>(v[sx], v[sy], v[sz], v[sw]);
|
||||
}
|
||||
|
||||
@@ -542,9 +546,17 @@ fn interpret_alu(t: u32, is_vertex: bool) {
|
||||
let predicated = ((w0 >> 27u) & 1u) != 0u;
|
||||
let predicate_condition = ((w0 >> 28u) & 1u) != 0u;
|
||||
let scalar_src_is_ps = ((w0 >> 26u) & 1u) != 0u;
|
||||
// `w1` holds per-operand swizzle + negate/abs flags. Phase D2 decodes
|
||||
// them; Phase D1 only resolved the temp/constant selector.
|
||||
_ = w1;
|
||||
// GPUBUG-100: word-1 holds per-operand swizzle (8 bits each, low to
|
||||
// high = src3/src2/src1 in canary's numbering) + per-operand negate
|
||||
// (bits 24/25/26). Pre-fix `_ = w1;` discarded these → every ALU
|
||||
// result was missing its swizzle (e.g. `.zyxw`, `.xxxx` broadcast)
|
||||
// and any negated operand was used positive instead.
|
||||
let src_a_swiz = w1 & 0xFFu;
|
||||
let src_b_swiz = (w1 >> 8u) & 0xFFu;
|
||||
let src_c_swiz = (w1 >> 16u) & 0xFFu;
|
||||
let src_a_neg = ((w1 >> 24u) & 1u) != 0u;
|
||||
let src_b_neg = ((w1 >> 25u) & 1u) != 0u;
|
||||
let src_c_neg = ((w1 >> 26u) & 1u) != 0u;
|
||||
|
||||
// Honor per-instruction predicate: skip when predicated and the
|
||||
// predicate doesn't match the required condition.
|
||||
@@ -552,10 +564,15 @@ fn interpret_alu(t: u32, is_vertex: bool) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Vector pipe.
|
||||
let a = read_src(src_a, src_a_is_temp);
|
||||
let b = read_src(src_b, src_b_is_temp);
|
||||
let c = read_src(src_c, src_c_is_temp);
|
||||
// Vector pipe — apply swizzle then negate per operand. Abs is
|
||||
// omitted (the abs flag is dual-meaning and lives at different
|
||||
// bits depending on temp-vs-constant; left for a follow-up).
|
||||
let a_raw = read_src(src_a, src_a_is_temp);
|
||||
let b_raw = read_src(src_b, src_b_is_temp);
|
||||
let c_raw = read_src(src_c, src_c_is_temp);
|
||||
let a = apply_modifiers(apply_swizzle(a_raw, src_a_swiz), src_a_neg, false);
|
||||
let b = apply_modifiers(apply_swizzle(b_raw, src_b_swiz), src_b_neg, false);
|
||||
let c = apply_modifiers(apply_swizzle(c_raw, src_c_swiz), src_c_neg, false);
|
||||
let vec_result = exec_vector_op(vec_op, a, b, c);
|
||||
if vec_wm != 0u {
|
||||
write_reg_masked(vec_dst, vec_wm, vec_result);
|
||||
|
||||
Reference in New Issue
Block a user