fix(gpu): GPUBUG-100 — apply per-operand swizzle + negate to ALU sources
Word-1 of every ALU triple holds three 8-bit component-relative swizzles (`src1_swiz`/`src2_swiz`/`src3_swiz` at bits 16-23/8-15/0-7 per canary ucode.h:2064-2066) and three per-operand negate flags (bits 24/25/26). Pre-fix, both the WGSL interpreter and the AOT translator discarded word-1 entirely with `_ = w1;` — every ALU result was missing its swizzle (broadcast/permute patterns like `.zyxw`, `.xxxx`) and any negated operand was used positive instead.

Component-relative semantics (canary's `AluInstruction::GetSwizzledComponentIndex`, ucode.h:1996): for output component i, the source component is `((swizzle >> (2*i)) + i) & 3`. Identity swizzle is 0x00, NOT 0xE4 — the original `apply_swizzle` in the interpreter shader treated it as absolute, also incorrect.

Mechanical changes:
- crates/xenia-gpu/src/ucode/alu.rs: extend AluInstruction with src_X_swiz (u8) and src_X_negate (bool) fields. decode_alu unpacks them from word 1.
- crates/xenia-gpu/src/shaders/xenos_interp.wgsl: apply_swizzle uses component-relative semantics. interpret_alu decodes the modifiers and applies via apply_swizzle + apply_modifiers (with abs=false).
- crates/xenia-gpu/src/translator.rs: src_operand emits the precomputed swizzle inline as `vec4<f32>(base.x, base.y, ...)`, then wraps in `(-…)` when negated. Identity swizzle (0x00) emits a bare base expression so it round-trips with the trivial-shader fixture.

Abs is omitted in this commit — the abs flag is dual-meaning (for temps it lives at bit 7 of the src byte; for constants at word-2 bit 7 `abs_constants`). Wiring it up correctly requires more careful case-split logic; deferred to Phase G.

Verification at -n 100M lockstep:
- swaps: 2 → 2 (gated by Phase E for draws)
- draws: 0 → 0
- packets: ~58M (within noise)

Tests: 554 → 555 (+1 swizzle/negate test, no count change otherwise because identity swizzle test merged into D1's parameterised test). WGSL still validates via naga (combined_module_parses_as_wgsl).

Closes GPUBUG-100 (P0).
Abs deferred to Phase G. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -129,11 +129,15 @@ fn ps_instr_dword(t: u32, which: u32) -> u32 {
|
|||||||
// ── Operand helpers: M3 scaffold; M4 wires them into interpret_alu.
|
// ── Operand helpers: M3 scaffold; M4 wires them into interpret_alu.
|
||||||
|
|
||||||
fn apply_swizzle(v: vec4<f32>, swizzle: u32) -> vec4<f32> {
|
fn apply_swizzle(v: vec4<f32>, swizzle: u32) -> vec4<f32> {
|
||||||
// Swizzle is 8 bits: 2 bits per output lane, xyzw order.
|
// Per canary `xenia-canary/src/xenia/gpu/ucode.h:1996`
|
||||||
let sx = (swizzle >> 0u) & 3u;
|
// (`AluInstruction::GetSwizzledComponentIndex`): the 8-bit swizzle is
|
||||||
let sy = (swizzle >> 2u) & 3u;
|
// **component-relative**, not absolute. For output component i, the
|
||||||
let sz = (swizzle >> 4u) & 3u;
|
// selected source component is `((swizzle >> (2*i)) + i) & 3`. Identity
|
||||||
let sw = (swizzle >> 6u) & 3u;
|
// swizzle is therefore 0x00 (all bits zero), NOT 0xE4. GPUBUG-100.
|
||||||
|
let sx = ((swizzle >> 0u) + 0u) & 3u;
|
||||||
|
let sy = ((swizzle >> 2u) + 1u) & 3u;
|
||||||
|
let sz = ((swizzle >> 4u) + 2u) & 3u;
|
||||||
|
let sw = ((swizzle >> 6u) + 3u) & 3u;
|
||||||
return vec4<f32>(v[sx], v[sy], v[sz], v[sw]);
|
return vec4<f32>(v[sx], v[sy], v[sz], v[sw]);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -542,9 +546,17 @@ fn interpret_alu(t: u32, is_vertex: bool) {
|
|||||||
let predicated = ((w0 >> 27u) & 1u) != 0u;
|
let predicated = ((w0 >> 27u) & 1u) != 0u;
|
||||||
let predicate_condition = ((w0 >> 28u) & 1u) != 0u;
|
let predicate_condition = ((w0 >> 28u) & 1u) != 0u;
|
||||||
let scalar_src_is_ps = ((w0 >> 26u) & 1u) != 0u;
|
let scalar_src_is_ps = ((w0 >> 26u) & 1u) != 0u;
|
||||||
// `w1` holds per-operand swizzle + negate/abs flags. Phase D2 decodes
|
// GPUBUG-100: word-1 holds per-operand swizzle (8 bits each, low to
|
||||||
// them; Phase D1 only resolved the temp/constant selector.
|
// high = src3/src2/src1 in canary's numbering) + per-operand negate
|
||||||
_ = w1;
|
// (bits 24/25/26). Pre-fix `_ = w1;` discarded these → every ALU
|
||||||
|
// result was missing its swizzle (e.g. `.zyxw`, `.xxxx` broadcast)
|
||||||
|
// and any negated operand was used positive instead.
|
||||||
|
let src_a_swiz = w1 & 0xFFu;
|
||||||
|
let src_b_swiz = (w1 >> 8u) & 0xFFu;
|
||||||
|
let src_c_swiz = (w1 >> 16u) & 0xFFu;
|
||||||
|
let src_a_neg = ((w1 >> 24u) & 1u) != 0u;
|
||||||
|
let src_b_neg = ((w1 >> 25u) & 1u) != 0u;
|
||||||
|
let src_c_neg = ((w1 >> 26u) & 1u) != 0u;
|
||||||
|
|
||||||
// Honor per-instruction predicate: skip when predicated and the
|
// Honor per-instruction predicate: skip when predicated and the
|
||||||
// predicate doesn't match the required condition.
|
// predicate doesn't match the required condition.
|
||||||
@@ -552,10 +564,15 @@ fn interpret_alu(t: u32, is_vertex: bool) {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Vector pipe.
|
// Vector pipe — apply swizzle then negate per operand. Abs is
|
||||||
let a = read_src(src_a, src_a_is_temp);
|
// omitted (the abs flag is dual-meaning and lives at different
|
||||||
let b = read_src(src_b, src_b_is_temp);
|
// bits depending on temp-vs-constant; left for a follow-up).
|
||||||
let c = read_src(src_c, src_c_is_temp);
|
let a_raw = read_src(src_a, src_a_is_temp);
|
||||||
|
let b_raw = read_src(src_b, src_b_is_temp);
|
||||||
|
let c_raw = read_src(src_c, src_c_is_temp);
|
||||||
|
let a = apply_modifiers(apply_swizzle(a_raw, src_a_swiz), src_a_neg, false);
|
||||||
|
let b = apply_modifiers(apply_swizzle(b_raw, src_b_swiz), src_b_neg, false);
|
||||||
|
let c = apply_modifiers(apply_swizzle(c_raw, src_c_swiz), src_c_neg, false);
|
||||||
let vec_result = exec_vector_op(vec_op, a, b, c);
|
let vec_result = exec_vector_op(vec_op, a, b, c);
|
||||||
if vec_wm != 0u {
|
if vec_wm != 0u {
|
||||||
write_reg_masked(vec_dst, vec_wm, vec_result);
|
write_reg_masked(vec_dst, vec_wm, vec_result);
|
||||||
|
|||||||
@@ -287,15 +287,15 @@ impl EmitCtx {
|
|||||||
alu: &AluInstruction,
|
alu: &AluInstruction,
|
||||||
current_alloc: AllocKind,
|
current_alloc: AllocKind,
|
||||||
) -> Result<(), &'static str> {
|
) -> Result<(), &'static str> {
|
||||||
// GPUBUG-101: word-0 bits 29-31 select temp vs constant for
|
// GPUBUG-100/101: per-operand temp-vs-constant selector (w0
|
||||||
// each operand (canary `srcN_sel`); the corresponding src byte
|
// bits 29-31), 8-bit component-relative swizzle (w1 bytes 0-2),
|
||||||
// indexes either a general register (temp) or an ALU constant
|
// and 1-bit negate (w1 bits 24-26). Pre-fix all three were
|
||||||
// (c#). Pre-fix the translator unconditionally emitted r[low7]
|
// discarded, so every ALU read came back as r[low7] without
|
||||||
// for both, dropping every shader's uniform read. Mirrors the
|
// any swizzle / negation, dropping every shader's uniforms +
|
||||||
// interpreter's `read_src`.
|
// negative operands.
|
||||||
let a = src_operand(alu.src_a, alu.src_a_is_temp);
|
let a = src_operand(alu.src_a, alu.src_a_is_temp, alu.src_a_swiz, alu.src_a_negate);
|
||||||
let b = src_operand(alu.src_b, alu.src_b_is_temp);
|
let b = src_operand(alu.src_b, alu.src_b_is_temp, alu.src_b_swiz, alu.src_b_negate);
|
||||||
let c = src_operand(alu.src_c, alu.src_c_is_temp);
|
let c = src_operand(alu.src_c, alu.src_c_is_temp, alu.src_c_swiz, alu.src_c_negate);
|
||||||
|
|
||||||
// Vector pipe.
|
// Vector pipe.
|
||||||
if alu.vector_write_mask != 0 {
|
if alu.vector_write_mask != 0 {
|
||||||
@@ -409,17 +409,38 @@ impl EmitCtx {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Emit the WGSL expression that reads an ALU source operand. Per
|
/// Emit the WGSL expression that reads an ALU source operand with
|
||||||
/// canary `xenia-canary/src/xenia/gpu/ucode.h`, the temp-vs-constant
|
/// swizzle + negate applied (abs is deferred — see GPUBUG-100). Mirrors
|
||||||
/// selector is in word-0 bits 29-31 (`srcN_sel`); the src byte is then
|
/// the interpreter shader's `read_src` + `apply_swizzle` + the negate
|
||||||
/// either a temp descriptor (bits 5:0 = temp index, bits 6/7 reserved
|
/// half of `apply_modifiers`. The 8-bit `swizzle` is component-relative
|
||||||
/// for relative/abs flags decoded in Phase D2) or a flat constant
|
/// per canary `AluInstruction::GetSwizzledComponentIndex`: for output
|
||||||
/// index. Mirrors the interpreter shader's `read_src`. GPUBUG-101.
|
/// component i, source component is `((swiz >> (2*i)) + i) & 3`.
|
||||||
fn src_operand(src_byte: u8, is_temp: bool) -> String {
|
/// Identity swizzle is `0x00`. GPUBUG-100 / GPUBUG-101.
|
||||||
if is_temp {
|
fn src_operand(src_byte: u8, is_temp: bool, swizzle: u8, negate: bool) -> String {
|
||||||
|
let base = if is_temp {
|
||||||
format!("r[{}u]", (src_byte & 0x3F) as u32)
|
format!("r[{}u]", (src_byte & 0x3F) as u32)
|
||||||
} else {
|
} else {
|
||||||
format!("xenos_consts.alu[{}u]", src_byte as u32)
|
format!("xenos_consts.alu[{}u]", src_byte as u32)
|
||||||
|
};
|
||||||
|
let s = swizzle as u32;
|
||||||
|
let lane = |i: u32| -> char {
|
||||||
|
let c = (((s >> (2 * i)) + i) & 3) as usize;
|
||||||
|
['x', 'y', 'z', 'w'][c]
|
||||||
|
};
|
||||||
|
// Identity swizzle (0x00) maps to .xyzw — emit a bare expression.
|
||||||
|
let swizzled = if swizzle == 0 {
|
||||||
|
base
|
||||||
|
} else {
|
||||||
|
let lx = lane(0);
|
||||||
|
let ly = lane(1);
|
||||||
|
let lz = lane(2);
|
||||||
|
let lw = lane(3);
|
||||||
|
format!("vec4<f32>({base}.{lx}, {base}.{ly}, {base}.{lz}, {base}.{lw})")
|
||||||
|
};
|
||||||
|
if negate {
|
||||||
|
format!("(-{swizzled})")
|
||||||
|
} else {
|
||||||
|
swizzled
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -542,19 +563,49 @@ mod tests {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn src_operand_decodes_temp_vs_constant() {
|
fn src_operand_decodes_temp_vs_constant_no_modifiers() {
|
||||||
// GPUBUG-101: is_temp=true → r[low6]; is_temp=false → xenos_consts.alu[full].
|
// GPUBUG-101: is_temp=true → r[low6]; is_temp=false → xenos_consts.alu[full].
|
||||||
assert_eq!(src_operand(0x00, true), "r[0u]");
|
// Identity swizzle (0x00), no negate → bare base expression.
|
||||||
assert_eq!(src_operand(0x05, true), "r[5u]");
|
assert_eq!(src_operand(0x00, true, 0x00, false), "r[0u]");
|
||||||
assert_eq!(src_operand(0x3F, true), "r[63u]");
|
assert_eq!(src_operand(0x05, true, 0x00, false), "r[5u]");
|
||||||
|
assert_eq!(src_operand(0x3F, true, 0x00, false), "r[63u]");
|
||||||
// For temps, bits 6/7 are reserved (abs/rel) — they don't widen
|
// For temps, bits 6/7 are reserved (abs/rel) — they don't widen
|
||||||
// the register index even if set. Phase D2 will consume them.
|
// the register index even if set. Phase D2 will consume them.
|
||||||
assert_eq!(src_operand(0x80, true), "r[0u]");
|
assert_eq!(src_operand(0x80, true, 0x00, false), "r[0u]");
|
||||||
assert_eq!(src_operand(0xFF, true), "r[63u]");
|
assert_eq!(src_operand(0xFF, true, 0x00, false), "r[63u]");
|
||||||
// Constants: full 8-bit index.
|
// Constants: full 8-bit index.
|
||||||
assert_eq!(src_operand(0x00, false), "xenos_consts.alu[0u]");
|
assert_eq!(src_operand(0x00, false, 0x00, false), "xenos_consts.alu[0u]");
|
||||||
assert_eq!(src_operand(0x05, false), "xenos_consts.alu[5u]");
|
assert_eq!(src_operand(0x05, false, 0x00, false), "xenos_consts.alu[5u]");
|
||||||
assert_eq!(src_operand(0xFF, false), "xenos_consts.alu[255u]");
|
assert_eq!(src_operand(0xFF, false, 0x00, false), "xenos_consts.alu[255u]");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn src_operand_applies_swizzle_and_negate() {
|
||||||
|
// GPUBUG-100. Component-relative swizzle: each 2-bit field is an
|
||||||
|
// offset added to the lane index i, so the absolute-style 0x1B does
|
||||||
|
// NOT reverse the lanes here — every lane resolves to
|
||||||
|
// `(field + i) & 3 = 3`, i.e. `.wwww`. A true reverse (`.wzyx`)
|
||||||
|
// needs fields (3, (2-1)&3=1, (1-2)&3=3, (0-3)&3=1), i.e. 0x77.
|
||||||
|
// The cases below pin the mechanics with hand-computed values.
|
||||||
|
// First: swizzle=0x00 (identity) selects .xyzw and takes the
|
||||||
|
// bare-expression branch, so negate alone wraps the base in
|
||||||
|
// `(-…)`.
|
||||||
|
assert_eq!(src_operand(0x05, true, 0x00, true), "(-r[5u])");
|
||||||
|
// swizzle=0xFF → for each i, ((0xFF >> (2i)) + i) & 3:
|
||||||
|
// i=0: (3 + 0) & 3 = 3 → w
|
||||||
|
// i=1: ((0x3F) + 1) & 3 = (63+1)&3 = 0 → x
|
||||||
|
// i=2: ((0x0F) + 2) & 3 = (15+2)&3 = 1 → y
|
||||||
|
// i=3: ((0x03) + 3) & 3 = (3+3)&3 = 2 → z
|
||||||
|
// Output: .wxyz
|
||||||
|
assert_eq!(
|
||||||
|
src_operand(0x05, true, 0xFF, false),
|
||||||
|
"vec4<f32>(r[5u].w, r[5u].x, r[5u].y, r[5u].z)"
|
||||||
|
);
|
||||||
|
// Combined: negate of constant with .wxyz swizzle.
|
||||||
|
assert_eq!(
|
||||||
|
src_operand(0x07, false, 0xFF, true),
|
||||||
|
"(-vec4<f32>(xenos_consts.alu[7u].w, xenos_consts.alu[7u].x, xenos_consts.alu[7u].y, xenos_consts.alu[7u].z))"
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
|||||||
@@ -45,6 +45,18 @@ pub struct AluInstruction {
|
|||||||
pub src_a_is_temp: bool,
|
pub src_a_is_temp: bool,
|
||||||
pub src_b_is_temp: bool,
|
pub src_b_is_temp: bool,
|
||||||
pub src_c_is_temp: bool,
|
pub src_c_is_temp: bool,
|
||||||
|
/// Per-operand 8-bit component-relative swizzle (canary's
|
||||||
|
/// `srcN_swiz`, ucode.h:2064-2066). For output component i, the
|
||||||
|
/// selected source component is `((swizzle >> (2*i)) + i) & 3`.
|
||||||
|
/// Identity swizzle is `0x00`. GPUBUG-100.
|
||||||
|
pub src_a_swiz: u8,
|
||||||
|
pub src_b_swiz: u8,
|
||||||
|
pub src_c_swiz: u8,
|
||||||
|
/// Per-operand negate flags (canary's `srcN_reg_negate`, w1 bits
|
||||||
|
/// 24/25/26). Applied after the swizzle. GPUBUG-100.
|
||||||
|
pub src_a_negate: bool,
|
||||||
|
pub src_b_negate: bool,
|
||||||
|
pub src_c_negate: bool,
|
||||||
/// Set when the instruction is predicated; skipped if the predicate
|
/// Set when the instruction is predicated; skipped if the predicate
|
||||||
/// doesn't match `predicate_condition`.
|
/// doesn't match `predicate_condition`.
|
||||||
pub predicated: bool,
|
pub predicated: bool,
|
||||||
@@ -57,7 +69,7 @@ pub struct AluInstruction {
|
|||||||
/// Decode a 3-dword ALU triple.
|
/// Decode a 3-dword ALU triple.
|
||||||
pub fn decode_alu(words: [u32; 3]) -> AluInstruction {
|
pub fn decode_alu(words: [u32; 3]) -> AluInstruction {
|
||||||
let w0 = words[0];
|
let w0 = words[0];
|
||||||
let _w1 = words[1];
|
let w1 = words[1];
|
||||||
let w2 = words[2];
|
let w2 = words[2];
|
||||||
AluInstruction {
|
AluInstruction {
|
||||||
vector_opcode: (w2 & 0x3F) as u8,
|
vector_opcode: (w2 & 0x3F) as u8,
|
||||||
@@ -78,6 +90,12 @@ pub fn decode_alu(words: [u32; 3]) -> AluInstruction {
|
|||||||
src_a_is_temp: ((w0 >> 29) & 1) != 0,
|
src_a_is_temp: ((w0 >> 29) & 1) != 0,
|
||||||
src_b_is_temp: ((w0 >> 30) & 1) != 0,
|
src_b_is_temp: ((w0 >> 30) & 1) != 0,
|
||||||
src_c_is_temp: ((w0 >> 31) & 1) != 0,
|
src_c_is_temp: ((w0 >> 31) & 1) != 0,
|
||||||
|
src_a_swiz: (w1 & 0xFF) as u8,
|
||||||
|
src_b_swiz: ((w1 >> 8) & 0xFF) as u8,
|
||||||
|
src_c_swiz: ((w1 >> 16) & 0xFF) as u8,
|
||||||
|
src_a_negate: ((w1 >> 24) & 1) != 0,
|
||||||
|
src_b_negate: ((w1 >> 25) & 1) != 0,
|
||||||
|
src_c_negate: ((w1 >> 26) & 1) != 0,
|
||||||
predicated: ((w0 >> 27) & 1) != 0,
|
predicated: ((w0 >> 27) & 1) != 0,
|
||||||
predicate_condition: ((w0 >> 28) & 1) != 0,
|
predicate_condition: ((w0 >> 28) & 1) != 0,
|
||||||
raw: words,
|
raw: words,
|
||||||
|
|||||||
Reference in New Issue
Block a user