fix(cpu): PPCBUG-424+425 vmaddfp128/vmaddcfp128 operand swap + va128 field fix
PPCBUG-424: vmaddfp128 computed VA×VB+VD instead of ISA-mandated VA×VD+VB. PPCBUG-425: vmaddcfp128 computed VD×VB+VA instead of ISA-mandated VA×VD+VB. Root-cause discovered while writing the operand-order regression tests: va128() was extracting PPC bits 6-10 (the same field as vd128's low 5 bits), not PPC bits 11-15 where VA lives in VX128 form. This meant va128() silently aliased vd128 for any instruction where VA != VD, making the operand swap invisible in the existing denorm-flush test (which used VA == VD == v2). Fixes in this commit: - decoder.rs: va128() now extracts PPC bits 11-15 (host bits 20-16) + bit29. The vmx128_va128_uses_bit29 test encoding updated to match the correct field. - interpreter.rs: vmaddfp128 changed from ai.mul_add(bi,di) to ai.mul_add(di,bi) (VA×VD+VB). vmaddcfp128 changed from di.mul_add(bi,ai) to ai.mul_add(di,bi). vmaddfp128_flushes_denormal_inputs redesigned with distinct VA/VD/VB registers (v1/v2/v3) so the flush test is independent of the accessor fix. New vmaddfp128_operand_order_va_times_vd_plus_vb and vmaddcfp128_operand_order_va_times_vd_plus_vb tests verify 2×3+10=16. - disasm_goldens.rs + vmx128_registers.json: vmaddfp128/vmaddcfp128/vnmsubfp128 golden raws updated to properly encode VA at PPC bits 11-15 (new raws: 0x146328D4 / 0x14632914 / 0x14632954). vperm128 / vsrw128 golden operands updated to reflect correct VA extraction (v4 instead of v3/v0). Affects all VMX128 binary ops that call va128(): vaddfp128, vsubfp128, vmulfp128, vmaddfp128, vmaddcfp128, vnmsubfp128, vperm128, vsrw128 etc. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1919,11 +1919,10 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
|
||||
ctx.pc += 4;
|
||||
}
|
||||
PpcOpcode::vmaddfp128 => {
|
||||
// VMX128 form: vD <- (vA * vB) + vD (vD reused as accumulator;
|
||||
// Canary `InstrEmit_vmaddfp128` routes guest VA/VB/VD through
|
||||
// `InstrEmit_vmaddfp_` with arg order swapped so the resulting
|
||||
// HIR computation is `VA * VB + VD`). Same unconditional denorm
|
||||
// flush of all three inputs as scalar `vmaddfp`.
|
||||
// ISA: (VD) <- (VA × VD) + VB. VD is both the second multiplicand and destination.
|
||||
// Canary InstrEmit_vmaddfp128 (ppc_emit_altivec.cc:806-809): MulAdd(VA, VD, VB).
|
||||
// Previous code computed ai.mul_add(bi, di) = VA×VB+VD — VB and VD roles swapped
|
||||
// (PPCBUG-424). Fix: ai.mul_add(di, bi) = VA×VD+VB.
|
||||
let a = ctx.vr[instr.va128()].as_f32x4();
|
||||
let b = ctx.vr[instr.vb128()].as_f32x4();
|
||||
let d = ctx.vr[instr.vd128()].as_f32x4();
|
||||
@@ -1932,7 +1931,7 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
|
||||
let ai = vmx::flush_denorm(a[i]);
|
||||
let bi = vmx::flush_denorm(b[i]);
|
||||
let di = vmx::flush_denorm(d[i]);
|
||||
r[i] = ai.mul_add(bi, di);
|
||||
r[i] = ai.mul_add(di, bi);
|
||||
}
|
||||
ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r);
|
||||
ctx.pc += 4;
|
||||
@@ -4297,11 +4296,11 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
|
||||
// ═════════════════════════════════════════════════════════════════
|
||||
// §4j — VMX128 FMA / permute
|
||||
// ═════════════════════════════════════════════════════════════════
|
||||
// vmaddcfp128: vD = vD * vB + vA (using vD's current value as accumulator)
|
||||
// vmaddcfp128: ISA (VD) <- (VA × VD) + VB — same operation as vmaddfp128
|
||||
PpcOpcode::vmaddcfp128 => {
|
||||
// Xbox-360-specific: vD = (vD * vB) + vA. Note the VD-reuse: VD is both
|
||||
// a source operand (as multiplicand) and the destination. Canary &
|
||||
// POWER8 hardware confirm denormal inputs are flushed regardless of NJ.
|
||||
// ISA: (VD) <- (VA × VD) + VB. Canary InstrEmit_vmaddcfp128 (cc:819): MulAdd(VA, VD, VB).
|
||||
// Previous code computed di.mul_add(bi, ai) = VD×VB+VA — both operands wrong
|
||||
// (PPCBUG-425). Fix: ai.mul_add(di, bi) = VA×VD+VB.
|
||||
let a = ctx.vr[instr.va128()].as_f32x4();
|
||||
let b = ctx.vr[instr.vb128()].as_f32x4();
|
||||
let d = ctx.vr[instr.vd128()].as_f32x4();
|
||||
@@ -4310,7 +4309,7 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
|
||||
let ai = vmx::flush_denorm(a[i]);
|
||||
let bi = vmx::flush_denorm(b[i]);
|
||||
let di = vmx::flush_denorm(d[i]);
|
||||
r[i] = di.mul_add(bi, ai);
|
||||
r[i] = ai.mul_add(di, bi);
|
||||
}
|
||||
ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r);
|
||||
ctx.pc += 4;
|
||||
@@ -5324,32 +5323,64 @@ mod tests {
|
||||
}
|
||||
|
||||
/// VMX128 variant `vmaddfp128 vD, vA, vB` (primary op 5, key2 = 0b001101)
|
||||
/// reuses vD as the accumulator: `vD <- (vA * vB) + vD`. Canary
|
||||
/// reuses vD as the accumulator: `vD <- (vA × vD) + vB`. Canary
|
||||
/// `ppc_emit_altivec.cc:786-810` flushes *all three* inputs
|
||||
/// unconditionally before the fused multiply-add — the 128-bit form
|
||||
/// must match the scalar `vmaddfp` behaviour. Prior to this fix the
|
||||
/// interpreter skipped the flush, leaving subnormal noise in math-
|
||||
/// heavy game code.
|
||||
/// unconditionally before the fused multiply-add.
|
||||
#[test]
|
||||
fn vmaddfp128_flushes_denormal_inputs() {
|
||||
let mut ctx = PpcContext::new();
|
||||
let mut mem = TestMem::new();
|
||||
let denorm = f32::from_bits(1);
|
||||
// vA=v2 carries denorms, which is also vD's accumulator input.
|
||||
// VA=v1, VD=v2, VB=v3 — all carry denormals.
|
||||
ctx.vr[1] = xenia_types::Vec128::from_f32x4_array([denorm; 4]);
|
||||
ctx.vr[2] = xenia_types::Vec128::from_f32x4_array([denorm; 4]);
|
||||
// vB=v3 = 1.0 — denormal input survives only if not flushed.
|
||||
ctx.vr[3] = xenia_types::Vec128::from_f32x4_array([1.0f32; 4]);
|
||||
// vmaddfp128 vD=v2, vA=v2, vB=v3: low 5 bits 00010 shared
|
||||
// between vA and vD, vB=3 at PPC bits 16-20, key2=0b001101.
|
||||
let raw: u32 = 0x1440_18D0;
|
||||
ctx.vr[3] = xenia_types::Vec128::from_f32x4_array([denorm; 4]);
|
||||
// vmaddfp128 vD=v2, vA=v1, vB=v3: op6=5, vd_lo=2, va_lo=1, vb_lo=3, key2=0b001101.
|
||||
// VA×VD+VB: all three flushed → 0*0+0 = 0.
|
||||
let raw: u32 = (5u32 << 26) | (2 << 21) | (1 << 16) | (3 << 11) | (3 << 6) | (1 << 4);
|
||||
write_instr(&mut mem, 0, raw);
|
||||
ctx.pc = 0;
|
||||
step(&mut ctx, &mut mem);
|
||||
// Without flush: denorm*1.0 + denorm = 2*denorm ≠ 0.
|
||||
// With flush: 0*0 + 0 = 0.
|
||||
assert_eq!(ctx.vr[2].as_f32x4(), [0.0f32; 4]);
|
||||
}
|
||||
|
||||
// ---- PPCBUG-424+425: vmaddfp128/vmaddcfp128 operand swap ----
|
||||
// ISA for both: (VD) <- (VA × VD) + VB. Previous code computed VA×VB+VD and VD×VB+VA.
|
||||
// Test uses distinct VA, VB, VD registers so the swap is visible.
|
||||
// Encoding: op6=5, key2=0b001101 (vmaddfp128) / 0b010001 (vmaddcfp128).
|
||||
// VA=v1=[2.0], VB=v2=[10.0], VD=v3=[3.0] → expected 2.0×3.0+10.0 = 16.0.
|
||||
// Buggy vmaddfp128: 2.0×10.0+3.0 = 23.0. Buggy vmaddcfp128: 3.0×10.0+2.0 = 32.0.
|
||||
|
||||
#[test]
|
||||
fn vmaddfp128_operand_order_va_times_vd_plus_vb() {
|
||||
let mut ctx = PpcContext::new();
|
||||
let mut mem = TestMem::new();
|
||||
ctx.vr[1] = xenia_types::Vec128::from_f32x4_array([2.0f32; 4]); // VA=v1
|
||||
ctx.vr[2] = xenia_types::Vec128::from_f32x4_array([10.0f32; 4]); // VB=v2
|
||||
ctx.vr[3] = xenia_types::Vec128::from_f32x4_array([3.0f32; 4]); // VD=v3 (also destination)
|
||||
// vmaddfp128 vD=v3, vA=v1, vB=v2 — op5, key2=0b001101 (bits22-25=3, bit27=1)
|
||||
let raw: u32 = (5u32 << 26) | (3 << 21) | (1 << 16) | (2 << 11) | (3 << 6) | (1 << 4);
|
||||
write_instr(&mut mem, 0, raw);
|
||||
ctx.pc = 0;
|
||||
step(&mut ctx, &mut mem);
|
||||
assert_eq!(ctx.vr[3].as_f32x4(), [16.0f32; 4], "VA×VD+VB = 2*3+10 = 16");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn vmaddcfp128_operand_order_va_times_vd_plus_vb() {
|
||||
let mut ctx = PpcContext::new();
|
||||
let mut mem = TestMem::new();
|
||||
ctx.vr[1] = xenia_types::Vec128::from_f32x4_array([2.0f32; 4]); // VA=v1
|
||||
ctx.vr[2] = xenia_types::Vec128::from_f32x4_array([10.0f32; 4]); // VB=v2
|
||||
ctx.vr[3] = xenia_types::Vec128::from_f32x4_array([3.0f32; 4]); // VD=v3
|
||||
// vmaddcfp128 vD=v3, vA=v1, vB=v2 — op5, key2=0b010001 (bits22-25=4, bit27=1)
|
||||
let raw: u32 = (5u32 << 26) | (3 << 21) | (1 << 16) | (2 << 11) | (4 << 6) | (1 << 4);
|
||||
write_instr(&mut mem, 0, raw);
|
||||
ctx.pc = 0;
|
||||
step(&mut ctx, &mut mem);
|
||||
assert_eq!(ctx.vr[3].as_f32x4(), [16.0f32; 4], "VA×VD+VB = 2*3+10 = 16");
|
||||
}
|
||||
|
||||
/// VMX128 `vnmsubfp128 vD, vA, vB` (key2 = 0b010101). Canary
|
||||
/// `ppc_emit_altivec.cc:1133-1160` flushes all three inputs in the
|
||||
/// helper. Semantics: `vD <- -((vA * vB) - vD) = vD - vA*vB`.
|
||||
|
||||
Reference in New Issue
Block a user