diff --git a/crates/xenia-cpu/src/decoder.rs b/crates/xenia-cpu/src/decoder.rs
index ef28270..d4d8ac1 100644
--- a/crates/xenia-cpu/src/decoder.rs
+++ b/crates/xenia-cpu/src/decoder.rs
@@ -135,9 +135,9 @@ impl DecodedInstr {
 
     // VMX128 field extractors
 
-    /// VA128 (bits 6-10, plus bit from 29)
+    /// VA128 (bits 11-15, plus bit from 29)
     #[inline] pub fn va128(&self) -> usize {
-        (extract_bits(self.raw, 6, 10) | (extract_bits(self.raw, 29, 29) << 5)) as usize
+        (extract_bits(self.raw, 11, 15) | (extract_bits(self.raw, 29, 29) << 5)) as usize
     }
 
     /// VB128 (bits 16-20, plus bits from 28, 30)
@@ -1105,8 +1105,8 @@ mod tests {
 
     #[test]
     fn vmx128_va128_uses_bit29() {
-        // va128 = bits 6-10 + bit 29. va_lo = 7, bit 29 = 1 → va128 = 7 | 32 = 39.
-        let raw = (7u32 << (31 - 10)) | (1u32 << (31 - 29));
+        // va128 = bits 11-15 + bit 29. va_lo = 7, bit 29 = 1 → va128 = 7 | 32 = 39.
+        let raw = (7u32 << (31 - 15)) | (1u32 << (31 - 29));
         let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 };
         assert_eq!(d.va128(), 39);
     }
diff --git a/crates/xenia-cpu/src/interpreter.rs b/crates/xenia-cpu/src/interpreter.rs
index a205dcf..eff89de 100644
--- a/crates/xenia-cpu/src/interpreter.rs
+++ b/crates/xenia-cpu/src/interpreter.rs
@@ -1919,11 +1919,10 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
             ctx.pc += 4;
         }
         PpcOpcode::vmaddfp128 => {
-            // VMX128 form: vD <- (vA * vB) + vD (vD reused as accumulator;
-            // Canary `InstrEmit_vmaddfp128` routes guest VA/VB/VD through
-            // `InstrEmit_vmaddfp_` with arg order swapped so the resulting
-            // HIR computation is `VA * VB + VD`). Same unconditional denorm
-            // flush of all three inputs as scalar `vmaddfp`.
+            // ISA: (VD) <- (VA × VD) + VB. VD is both the second multiplicand and destination.
+            // Canary InstrEmit_vmaddfp128 (ppc_emit_altivec.cc:806-809): MulAdd(VA, VD, VB).
+            // Previous code computed ai.mul_add(bi, di) = VA×VB+VD — VB and VD roles swapped
+            // (PPCBUG-424). Fix: ai.mul_add(di, bi) = VA×VD+VB.
             let a = ctx.vr[instr.va128()].as_f32x4();
             let b = ctx.vr[instr.vb128()].as_f32x4();
             let d = ctx.vr[instr.vd128()].as_f32x4();
@@ -1932,7 +1931,7 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
                 let ai = vmx::flush_denorm(a[i]);
                 let bi = vmx::flush_denorm(b[i]);
                 let di = vmx::flush_denorm(d[i]);
-                r[i] = ai.mul_add(bi, di);
+                r[i] = ai.mul_add(di, bi);
             }
             ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r);
             ctx.pc += 4;
@@ -4297,11 +4296,11 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
         // ═════════════════════════════════════════════════════════════════
         // §4j — VMX128 FMA / permute
         // ═════════════════════════════════════════════════════════════════
-        // vmaddcfp128: vD = vD * vB + vA (using vD's current value as accumulator)
+        // vmaddcfp128: ISA (VD) <- (VA × VD) + VB — same operation as vmaddfp128
         PpcOpcode::vmaddcfp128 => {
-            // Xbox-360-specific: vD = (vD * vB) + vA. Note the VD-reuse: VD is both
-            // a source operand (as multiplicand) and the destination. Canary &
-            // POWER8 hardware confirm denormal inputs are flushed regardless of NJ.
+            // ISA: (VD) <- (VA × VD) + VB. Canary InstrEmit_vmaddcfp128 (cc:819): MulAdd(VA, VD, VB).
+            // Previous code computed di.mul_add(bi, ai) = VD×VB+VA — both operands wrong
+            // (PPCBUG-425). Fix: ai.mul_add(di, bi) = VA×VD+VB.
             let a = ctx.vr[instr.va128()].as_f32x4();
             let b = ctx.vr[instr.vb128()].as_f32x4();
             let d = ctx.vr[instr.vd128()].as_f32x4();
@@ -4310,7 +4309,7 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
                 let ai = vmx::flush_denorm(a[i]);
                 let bi = vmx::flush_denorm(b[i]);
                 let di = vmx::flush_denorm(d[i]);
-                r[i] = di.mul_add(bi, ai);
+                r[i] = ai.mul_add(di, bi);
             }
             ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r);
             ctx.pc += 4;
@@ -5324,32 +5323,64 @@ mod tests {
     }
 
     /// VMX128 variant `vmaddfp128 vD, vA, vB` (primary op 5, key2 = 0b001101)
-    /// reuses vD as the accumulator: `vD <- (vA * vB) + vD`. Canary
+    /// reuses vD as the accumulator: `vD <- (vA × vD) + vB`. Canary
     /// `ppc_emit_altivec.cc:786-810` flushes *all three* inputs
-    /// unconditionally before the fused multiply-add — the 128-bit form
-    /// must match the scalar `vmaddfp` behaviour. Prior to this fix the
-    /// interpreter skipped the flush, leaving subnormal noise in math-
-    /// heavy game code.
+    /// unconditionally before the fused multiply-add.
     #[test]
     fn vmaddfp128_flushes_denormal_inputs() {
         let mut ctx = PpcContext::new();
         let mut mem = TestMem::new();
         let denorm = f32::from_bits(1);
-        // vA=v2 carries denorms, which is also vD's accumulator input.
+        // VA=v1, VD=v2, VB=v3 — all carry denormals.
+        ctx.vr[1] = xenia_types::Vec128::from_f32x4_array([denorm; 4]);
         ctx.vr[2] = xenia_types::Vec128::from_f32x4_array([denorm; 4]);
-        // vB=v3 = 1.0 — denormal input survives only if not flushed.
-        ctx.vr[3] = xenia_types::Vec128::from_f32x4_array([1.0f32; 4]);
-        // vmaddfp128 vD=v2, vA=v2, vB=v3: low 5 bits 00010 shared
-        // between vA and vD, vB=3 at PPC bits 16-20, key2=0b001101.
-        let raw: u32 = 0x1440_18D0;
+        ctx.vr[3] = xenia_types::Vec128::from_f32x4_array([denorm; 4]);
+        // vmaddfp128 vD=v2, vA=v1, vB=v3: op6=5, vd_lo=2, va_lo=1, vb_lo=3, key2=0b001101.
+        // VA×VD+VB: all three flushed → 0*0+0 = 0.
+        let raw: u32 = (5u32 << 26) | (2 << 21) | (1 << 16) | (3 << 11) | (3 << 6) | (1 << 4);
         write_instr(&mut mem, 0, raw);
         ctx.pc = 0;
         step(&mut ctx, &mut mem);
-        // Without flush: denorm*1.0 + denorm = 2*denorm ≠ 0.
-        // With flush:    0*0 + 0 = 0.
         assert_eq!(ctx.vr[2].as_f32x4(), [0.0f32; 4]);
     }
 
+    // ---- PPCBUG-424+425: vmaddfp128/vmaddcfp128 operand swap ----
+    // ISA for both: (VD) <- (VA × VD) + VB. Previous code computed VA×VB+VD and VD×VB+VA.
+    // Test uses distinct VA, VB, VD registers so the swap is visible.
+    // Encoding: op6=5, key2=0b001101 (vmaddfp128) / 0b010001 (vmaddcfp128).
+    // VA=v1=[2.0], VB=v2=[10.0], VD=v3=[3.0] → expected 2.0×3.0+10.0 = 16.0.
+    // Buggy vmaddfp128: 2.0×10.0+3.0 = 23.0. Buggy vmaddcfp128: 3.0×10.0+2.0 = 32.0.
+
+    #[test]
+    fn vmaddfp128_operand_order_va_times_vd_plus_vb() {
+        let mut ctx = PpcContext::new();
+        let mut mem = TestMem::new();
+        ctx.vr[1] = xenia_types::Vec128::from_f32x4_array([2.0f32; 4]);  // VA=v1
+        ctx.vr[2] = xenia_types::Vec128::from_f32x4_array([10.0f32; 4]); // VB=v2
+        ctx.vr[3] = xenia_types::Vec128::from_f32x4_array([3.0f32; 4]);  // VD=v3 (also destination)
+        // vmaddfp128 vD=v3, vA=v1, vB=v2 — op5, key2=0b001101 (bits22-25=3, bit27=1)
+        let raw: u32 = (5u32 << 26) | (3 << 21) | (1 << 16) | (2 << 11) | (3 << 6) | (1 << 4);
+        write_instr(&mut mem, 0, raw);
+        ctx.pc = 0;
+        step(&mut ctx, &mut mem);
+        assert_eq!(ctx.vr[3].as_f32x4(), [16.0f32; 4], "VA×VD+VB = 2*3+10 = 16");
+    }
+
+    #[test]
+    fn vmaddcfp128_operand_order_va_times_vd_plus_vb() {
+        let mut ctx = PpcContext::new();
+        let mut mem = TestMem::new();
+        ctx.vr[1] = xenia_types::Vec128::from_f32x4_array([2.0f32; 4]);  // VA=v1
+        ctx.vr[2] = xenia_types::Vec128::from_f32x4_array([10.0f32; 4]); // VB=v2
+        ctx.vr[3] = xenia_types::Vec128::from_f32x4_array([3.0f32; 4]);  // VD=v3
+        // vmaddcfp128 vD=v3, vA=v1, vB=v2 — op5, key2=0b010001 (bits22-25=4, bit27=1)
+        let raw: u32 = (5u32 << 26) | (3 << 21) | (1 << 16) | (2 << 11) | (4 << 6) | (1 << 4);
+        write_instr(&mut mem, 0, raw);
+        ctx.pc = 0;
+        step(&mut ctx, &mut mem);
+        assert_eq!(ctx.vr[3].as_f32x4(), [16.0f32; 4], "VA×VD+VB = 2*3+10 = 16");
+    }
+
     /// VMX128 `vnmsubfp128 vD, vA, vB` (key2 = 0b010101). Canary
     /// `ppc_emit_altivec.cc:1133-1160` flushes all three inputs in the
     /// helper. Semantics: `vD <- -((vA * vB) - vD) = vD - vA*vB`.
diff --git a/crates/xenia-cpu/tests/disasm_goldens.rs b/crates/xenia-cpu/tests/disasm_goldens.rs
index 97b3825..2c301a5 100644
--- a/crates/xenia-cpu/tests/disasm_goldens.rs
+++ b/crates/xenia-cpu/tests/disasm_goldens.rs
@@ -514,12 +514,12 @@ fn vmx128_registers() {
     //     vmaddcfp128  VD, VA, VD, VB  → "v3, v35, v3, v5"
     //     vnmsubfp128  VD, VA, VD, VB  → "v3, v35, v3, v5"
     let vmx128_4op = [
-        // vmaddfp128: bits 24=1, 25=1, 27=1, bit 29=1 (VA high), VB=5
-        (0x146028D4u32, 0x82000000, "vmaddfp128 v3, v35, v5, v3"),
-        // vmaddcfp128: bits 23=1, 27=1, bit 29=1, VB=5
-        (0x14602914u32, 0x82000000, "vmaddcfp128 v3, v35, v3, v5"),
-        // vnmsubfp128: bits 23=1, 25=1, 27=1, bit 29=1, VB=5
-        (0x14602954u32, 0x82000000, "vnmsubfp128 v3, v35, v3, v5"),
+        // vmaddfp128: vd=3(bits 6-10), va=35(bits 11-15=3 + bit29=1), vb=5(bits 16-20), key2=0b001101
+        (0x146328D4u32, 0x82000000, "vmaddfp128 v3, v35, v5, v3"),
+        // vmaddcfp128: same vd/va/vb layout, key2=0b010001
+        (0x14632914u32, 0x82000000, "vmaddcfp128 v3, v35, v3, v5"),
+        // vnmsubfp128: same vd/va/vb layout, key2=0b010101
+        (0x14632954u32, 0x82000000, "vnmsubfp128 v3, v35, v3, v5"),
     ];
 
     let mut all = Vec::new();
diff --git a/crates/xenia-cpu/tests/golden/vmx128_registers.json b/crates/xenia-cpu/tests/golden/vmx128_registers.json
index 1d072cd..6d4f8b0 100644
--- a/crates/xenia-cpu/tests/golden/vmx128_registers.json
+++ b/crates/xenia-cpu/tests/golden/vmx128_registers.json
@@ -75,21 +75,21 @@
       "raw": "0x14642801",
       "addr": "0x82000000",
       "mnemonic": "vperm128",
-      "operands": "v3, v3, v5, 0"
+      "operands": "v3, v4, v5, 0"
     },
     {
       "label": "encoding vd_hi=00: actually vsrw128",
       "raw": "0x180461D0",
       "addr": "0x82000000",
       "mnemonic": "vsrw128",
-      "operands": "v0, v0, v12"
+      "operands": "v0, v4, v12"
     },
     {
       "label": "encoding vd_hi=10: actually vsrw128 v32",
       "raw": "0x180465D0",
       "addr": "0x82000000",
       "mnemonic": "vsrw128",
-      "operands": "v32, v0, v12"
+      "operands": "v32, v4, v12"
     },
     {
       "label": "encoding vd_hi=01: actually vpermwi128",
@@ -114,21 +114,21 @@
     },
     {
       "label": "vmaddfp128 v3, v35, v5, v3",
-      "raw": "0x146028D4",
+      "raw": "0x146328D4",
       "addr": "0x82000000",
       "mnemonic": "vmaddfp128",
       "operands": "v3, v35, v5, v3"
     },
     {
       "label": "vmaddcfp128 v3, v35, v3, v5",
-      "raw": "0x14602914",
+      "raw": "0x14632914",
       "addr": "0x82000000",
       "mnemonic": "vmaddcfp128",
       "operands": "v3, v35, v3, v5"
     },
     {
       "label": "vnmsubfp128 v3, v35, v3, v5",
-      "raw": "0x14602954",
+      "raw": "0x14632954",
       "addr": "0x82000000",
       "mnemonic": "vnmsubfp128",
       "operands": "v3, v35, v3, v5"