diff --git a/crates/xenia-cpu/src/decoder.rs b/crates/xenia-cpu/src/decoder.rs index ef28270..d4d8ac1 100644 --- a/crates/xenia-cpu/src/decoder.rs +++ b/crates/xenia-cpu/src/decoder.rs @@ -135,9 +135,9 @@ impl DecodedInstr { // VMX128 field extractors - /// VA128 (bits 6-10, plus bit from 29) + /// VA128 (bits 11-15, plus bit from 29) #[inline] pub fn va128(&self) -> usize { - (extract_bits(self.raw, 6, 10) | (extract_bits(self.raw, 29, 29) << 5)) as usize + (extract_bits(self.raw, 11, 15) | (extract_bits(self.raw, 29, 29) << 5)) as usize } /// VB128 (bits 16-20, plus bits from 28, 30) @@ -1105,8 +1105,8 @@ mod tests { #[test] fn vmx128_va128_uses_bit29() { - // va128 = bits 6-10 + bit 29. va_lo = 7, bit 29 = 1 → va128 = 7 | 32 = 39. - let raw = (7u32 << (31 - 10)) | (1u32 << (31 - 29)); + // va128 = bits 11-15 + bit 29. va_lo = 7, bit 29 = 1 → va128 = 7 | 32 = 39. + let raw = (7u32 << (31 - 15)) | (1u32 << (31 - 29)); let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 }; assert_eq!(d.va128(), 39); } diff --git a/crates/xenia-cpu/src/interpreter.rs b/crates/xenia-cpu/src/interpreter.rs index a205dcf..eff89de 100644 --- a/crates/xenia-cpu/src/interpreter.rs +++ b/crates/xenia-cpu/src/interpreter.rs @@ -1919,11 +1919,10 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) - ctx.pc += 4; } PpcOpcode::vmaddfp128 => { - // VMX128 form: vD <- (vA * vB) + vD (vD reused as accumulator; - // Canary `InstrEmit_vmaddfp128` routes guest VA/VB/VD through - // `InstrEmit_vmaddfp_` with arg order swapped so the resulting - // HIR computation is `VA * VB + VD`). Same unconditional denorm - // flush of all three inputs as scalar `vmaddfp`. + // ISA: (VD) <- (VA × VD) + VB. VD is both the second multiplicand and destination. + // Canary InstrEmit_vmaddfp128 (ppc_emit_altivec.cc:806-809): MulAdd(VA, VD, VB). + // Previous code computed ai.mul_add(bi, di) = VA×VB+VD — VB and VD roles swapped + // (PPCBUG-424). Fix: ai.mul_add(di, bi) = VA×VD+VB. let a = ctx.vr[instr.va128()].as_f32x4(); let b = ctx.vr[instr.vb128()].as_f32x4(); let d = ctx.vr[instr.vd128()].as_f32x4(); @@ -1932,7 +1931,7 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) - let ai = vmx::flush_denorm(a[i]); let bi = vmx::flush_denorm(b[i]); let di = vmx::flush_denorm(d[i]); - r[i] = ai.mul_add(bi, di); + r[i] = ai.mul_add(di, bi); } ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r); ctx.pc += 4; @@ -4297,11 +4296,11 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) - // ═════════════════════════════════════════════════════════════════ // §4j — VMX128 FMA / permute // ═════════════════════════════════════════════════════════════════ - // vmaddcfp128: vD = vD * vB + vA (using vD's current value as accumulator) + // vmaddcfp128: ISA (VD) <- (VA × VD) + VB — same operation as vmaddfp128 PpcOpcode::vmaddcfp128 => { - // Xbox-360-specific: vD = (vD * vB) + vA. Note the VD-reuse: VD is both - // a source operand (as multiplicand) and the destination. Canary & - // POWER8 hardware confirm denormal inputs are flushed regardless of NJ. + // ISA: (VD) <- (VA × VD) + VB. Canary InstrEmit_vmaddcfp128 (cc:819): MulAdd(VA, VD, VB). + // Previous code computed di.mul_add(bi, ai) = VD×VB+VA — both operands wrong + // (PPCBUG-425). Fix: ai.mul_add(di, bi) = VA×VD+VB. let a = ctx.vr[instr.va128()].as_f32x4(); let b = ctx.vr[instr.vb128()].as_f32x4(); let d = ctx.vr[instr.vd128()].as_f32x4(); @@ -4310,7 +4309,7 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) - let ai = vmx::flush_denorm(a[i]); let bi = vmx::flush_denorm(b[i]); let di = vmx::flush_denorm(d[i]); - r[i] = di.mul_add(bi, ai); + r[i] = ai.mul_add(di, bi); } ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r); ctx.pc += 4; @@ -5324,32 +5323,64 @@ mod tests { } /// VMX128 variant `vmaddfp128 vD, vA, vB` (primary op 5, key2 = 0b001101) - /// reuses vD as the accumulator: `vD <- (vA * vB) + vD`. Canary + /// reuses vD as the accumulator: `vD <- (vA × vD) + vB`. Canary /// `ppc_emit_altivec.cc:786-810` flushes *all three* inputs - /// unconditionally before the fused multiply-add — the 128-bit form - /// must match the scalar `vmaddfp` behaviour. Prior to this fix the - /// interpreter skipped the flush, leaving subnormal noise in math- - /// heavy game code. + /// unconditionally before the fused multiply-add. #[test] fn vmaddfp128_flushes_denormal_inputs() { let mut ctx = PpcContext::new(); let mut mem = TestMem::new(); let denorm = f32::from_bits(1); - // vA=v2 carries denorms, which is also vD's accumulator input. + // VA=v1, VD=v2, VB=v3 — all carry denormals. + ctx.vr[1] = xenia_types::Vec128::from_f32x4_array([denorm; 4]); ctx.vr[2] = xenia_types::Vec128::from_f32x4_array([denorm; 4]); - // vB=v3 = 1.0 — denormal input survives only if not flushed. - ctx.vr[3] = xenia_types::Vec128::from_f32x4_array([1.0f32; 4]); - // vmaddfp128 vD=v2, vA=v2, vB=v3: low 5 bits 00010 shared - // between vA and vD, vB=3 at PPC bits 16-20, key2=0b001101. - let raw: u32 = 0x1440_18D0; + ctx.vr[3] = xenia_types::Vec128::from_f32x4_array([denorm; 4]); + // vmaddfp128 vD=v2, vA=v1, vB=v3: op6=5, vd_lo=2, va_lo=1, vb_lo=3, key2=0b001101. + // VA×VD+VB: all three flushed → 0*0+0 = 0. + let raw: u32 = (5u32 << 26) | (2 << 21) | (1 << 16) | (3 << 11) | (3 << 6) | (1 << 4); write_instr(&mut mem, 0, raw); ctx.pc = 0; step(&mut ctx, &mut mem); - // Without flush: denorm*1.0 + denorm = 2*denorm ≠ 0. - // With flush: 0*0 + 0 = 0. assert_eq!(ctx.vr[2].as_f32x4(), [0.0f32; 4]); } + // ---- PPCBUG-424+425: vmaddfp128/vmaddcfp128 operand swap ---- + // ISA for both: (VD) <- (VA × VD) + VB. Previous code computed VA×VB+VD and VD×VB+VA. + // Test uses distinct VA, VB, VD registers so the swap is visible. + // Encoding: op6=5, key2=0b001101 (vmaddfp128) / 0b010001 (vmaddcfp128). + // VA=v1=[2.0], VB=v2=[10.0], VD=v3=[3.0] → expected 2.0×3.0+10.0 = 16.0. + // Buggy vmaddfp128: 2.0×10.0+3.0 = 23.0. Buggy vmaddcfp128: 3.0×10.0+2.0 = 32.0. + + #[test] + fn vmaddfp128_operand_order_va_times_vd_plus_vb() { + let mut ctx = PpcContext::new(); + let mut mem = TestMem::new(); + ctx.vr[1] = xenia_types::Vec128::from_f32x4_array([2.0f32; 4]); // VA=v1 + ctx.vr[2] = xenia_types::Vec128::from_f32x4_array([10.0f32; 4]); // VB=v2 + ctx.vr[3] = xenia_types::Vec128::from_f32x4_array([3.0f32; 4]); // VD=v3 (also destination) + // vmaddfp128 vD=v3, vA=v1, vB=v2 — op5, key2=0b001101 (bits22-25=3, bit27=1) + let raw: u32 = (5u32 << 26) | (3 << 21) | (1 << 16) | (2 << 11) | (3 << 6) | (1 << 4); + write_instr(&mut mem, 0, raw); + ctx.pc = 0; + step(&mut ctx, &mut mem); + assert_eq!(ctx.vr[3].as_f32x4(), [16.0f32; 4], "VA×VD+VB = 2*3+10 = 16"); + } + + #[test] + fn vmaddcfp128_operand_order_va_times_vd_plus_vb() { + let mut ctx = PpcContext::new(); + let mut mem = TestMem::new(); + ctx.vr[1] = xenia_types::Vec128::from_f32x4_array([2.0f32; 4]); // VA=v1 + ctx.vr[2] = xenia_types::Vec128::from_f32x4_array([10.0f32; 4]); // VB=v2 + ctx.vr[3] = xenia_types::Vec128::from_f32x4_array([3.0f32; 4]); // VD=v3 + // vmaddcfp128 vD=v3, vA=v1, vB=v2 — op5, key2=0b010001 (bits22-25=4, bit27=1) + let raw: u32 = (5u32 << 26) | (3 << 21) | (1 << 16) | (2 << 11) | (4 << 6) | (1 << 4); + write_instr(&mut mem, 0, raw); + ctx.pc = 0; + step(&mut ctx, &mut mem); + assert_eq!(ctx.vr[3].as_f32x4(), [16.0f32; 4], "VA×VD+VB = 2*3+10 = 16"); + } + /// VMX128 `vnmsubfp128 vD, vA, vB` (key2 = 0b010101). Canary /// `ppc_emit_altivec.cc:1133-1160` flushes all three inputs in the /// helper. Semantics: `vD <- -((vA * vB) - vD) = vD - vA*vB`. diff --git a/crates/xenia-cpu/tests/disasm_goldens.rs b/crates/xenia-cpu/tests/disasm_goldens.rs index 97b3825..2c301a5 100644 --- a/crates/xenia-cpu/tests/disasm_goldens.rs +++ b/crates/xenia-cpu/tests/disasm_goldens.rs @@ -514,12 +514,12 @@ fn vmx128_registers() { // vmaddcfp128 VD, VA, VD, VB → "v3, v35, v3, v5" // vnmsubfp128 VD, VA, VD, VB → "v3, v35, v3, v5" let vmx128_4op = [ - // vmaddfp128: bits 24=1, 25=1, 27=1, bit 29=1 (VA high), VB=5 - (0x146028D4u32, 0x82000000, "vmaddfp128 v3, v35, v5, v3"), - // vmaddcfp128: bits 23=1, 27=1, bit 29=1, VB=5 - (0x14602914u32, 0x82000000, "vmaddcfp128 v3, v35, v3, v5"), - // vnmsubfp128: bits 23=1, 25=1, 27=1, bit 29=1, VB=5 - (0x14602954u32, 0x82000000, "vnmsubfp128 v3, v35, v3, v5"), + // vmaddfp128: vd=3(bits 6-10), va=35(bits 11-15=3 + bit29=1), vb=5(bits 16-20), key2=0b001101 + (0x146328D4u32, 0x82000000, "vmaddfp128 v3, v35, v5, v3"), + // vmaddcfp128: same vd/va/vb layout, key2=0b010001 + (0x14632914u32, 0x82000000, "vmaddcfp128 v3, v35, v3, v5"), + // vnmsubfp128: same vd/va/vb layout, key2=0b010101 + (0x14632954u32, 0x82000000, "vnmsubfp128 v3, v35, v3, v5"), ]; let mut all = Vec::new(); diff --git a/crates/xenia-cpu/tests/golden/vmx128_registers.json b/crates/xenia-cpu/tests/golden/vmx128_registers.json index 1d072cd..6d4f8b0 100644 --- a/crates/xenia-cpu/tests/golden/vmx128_registers.json +++ b/crates/xenia-cpu/tests/golden/vmx128_registers.json @@ -75,21 +75,21 @@ "raw": "0x14642801", "addr": "0x82000000", "mnemonic": "vperm128", - "operands": "v3, v3, v5, 0" + "operands": "v3, v4, v5, 0" }, { "label": "encoding vd_hi=00: actually vsrw128", "raw": "0x180461D0", "addr": "0x82000000", "mnemonic": "vsrw128", - "operands": "v0, v0, v12" + "operands": "v0, v4, v12" }, { "label": "encoding vd_hi=10: actually vsrw128 v32", "raw": "0x180465D0", "addr": "0x82000000", "mnemonic": "vsrw128", - "operands": "v32, v0, v12" + "operands": "v32, v4, v12" }, { "label": "encoding vd_hi=01: actually vpermwi128", @@ -114,21 +114,21 @@ }, { "label": "vmaddfp128 v3, v35, v5, v3", - "raw": "0x146028D4", + "raw": "0x146328D4", "addr": "0x82000000", "mnemonic": "vmaddfp128", "operands": "v3, v35, v5, v3" }, { "label": "vmaddcfp128 v3, v35, v3, v5", - "raw": "0x14602914", + "raw": "0x14632914", "addr": "0x82000000", "mnemonic": "vmaddcfp128", "operands": "v3, v35, v3, v5" }, { "label": "vnmsubfp128 v3, v35, v3, v5", - "raw": "0x14602954", + "raw": "0x14632954", "addr": "0x82000000", "mnemonic": "vnmsubfp128", "operands": "v3, v35, v3, v5"