From cedee3c385b4e5e169d9b79095572d8071a126f6 Mon Sep 17 00:00:00 2001
From: MechaCat02 <fabian@diekaulbachs.de>
Date: Sat, 2 May 2026 10:05:37 +0200
Subject: [PATCH 1/6] fix(cpu): PPCBUG-510 stvewx128 writes 16 bytes instead of
 4

stvewx128 was aligning EA to 16 bytes and writing all 16 bytes of the
vector, corrupting 12 adjacent bytes on every call. ISA semantics:
word-align EA, extract word lane (EA & 0xF) >> 2, write 4 bytes only.

The non-128 stvewx was already correct; stvewx128 was never updated.
Mirror the stvewx body with instr.vs128() substituted for instr.rs().
The invalidate_for_write call from P1 now covers the correct word-aligned
EA rather than the over-wide 16-byte range.

interpreter.rs: stvewx128 arm (~line 2984)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 crates/xenia-cpu/src/interpreter.rs | 60 ++++++++++++++++++++++++++---
 1 file changed, 54 insertions(+), 6 deletions(-)

diff --git a/crates/xenia-cpu/src/interpreter.rs b/crates/xenia-cpu/src/interpreter.rs
index 3bb3d56..a205dcf 100644
--- a/crates/xenia-cpu/src/interpreter.rs
+++ b/crates/xenia-cpu/src/interpreter.rs
@@ -2982,16 +2982,21 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
             ctx.pc += 4;
         }
         PpcOpcode::stvewx128 => {
-            let ea = ea_indexed(ctx, instr) & !0xF;
-            // TODO PPCBUG-510: stvewx128 currently writes 16 bytes at ea & !0xF; the EA scope is
-            // wrong (should be word-aligned, 4 bytes only). When P3 fixes EA, this invalidate's
-            // range narrows automatically.
-            // PPCBUG-512: stvewx128 was missing invalidate_for_write.
+            // Mirror of stvewx: word-align EA, extract one 32-bit lane, write 4 bytes only.
+            // Previous code used & !0xF (16-byte) and wrote all 16 bytes, corrupting 12
+            // adjacent bytes on every execution (PPCBUG-510).
+            let ea_unaligned = ea_indexed(ctx, instr);
+            let ea = ea_unaligned & !0x3u32;
             if let Some(t) = ctx.reservation_table.as_ref().filter(|t| t.is_enabled()) {
                 if t.has_active_reservers() { t.invalidate_for_write(ea); }
             }
+            let slot = ((ea_unaligned & 0xF) >> 2) as usize;
             let bytes = ctx.vr[instr.vs128()].as_bytes();
-            for i in 0..16 { mem.write_u8(ea + i as u32, bytes[i]); }
+            let w = ((bytes[slot * 4] as u32) << 24)
+                  | ((bytes[slot * 4 + 1] as u32) << 16)
+                  | ((bytes[slot * 4 + 2] as u32) << 8)
+                  | (bytes[slot * 4 + 3] as u32);
+            mem.write_u32(ea, w);
             ctx.pc += 4;
         }
 
@@ -6463,4 +6468,47 @@ mod tests {
         assert_eq!(r[2], 0x3333_3333, "pack=1 shift=3: lane 2 from prev");
         assert_eq!(r[3], 0x4444_4444, "pack=1 shift=3: lane 3 from prev");
     }
+
+    // ---- PPCBUG-510: stvewx128 should write one word (4 bytes), not 16 ----
+
+    fn encode_stvewx128(vs_lo: u32, ra: u32, rb: u32) -> u32 {
+        // stvewx128 is a VMX128 load/store at op6=4.
+        // decode_op4 key1 = (bits21-27 << 4) | bits30-31 = 0b00110000011 for stvewx128.
+        // bits21-27 = 0b0011000 (host bits 10-4), bits30-31 = 0b11 (host bits 1-0).
+        // VS128[4:0] at host bits 25-21; RA at host bits 20-16; RB at host bits 15-11.
+        // VS128[5] at host bit 3 (PPC bit 28); VS128[6] at host bit 1 (PPC bit 30).
+        (4u32 << 26)
+            | (vs_lo << 21)     // VS128[4:0]
+            | (ra << 16)        // RA
+            | (rb << 11)        // RB
+            | (0b0011000 << 4)  // bits 21-27 of key1 pattern
+            | 0b11              // bits 30-31 of key1 pattern
+    }
+
+    #[test]
+    fn stvewx128_writes_one_word_at_word_aligned_ea() {
+        // PPCBUG-510: old code wrote all 16 bytes at ea & !0xF, corrupting 12 adjacent bytes.
+        // Fix: word-align EA, extract lane from (ea & 0xF) >> 2, write 4 bytes only.
+        let mut ctx = PpcContext::new();
+        let mem = TestMem::new();
+        // VS128 = v96 (vs_lo=0 | key bits → vs128=0 since key bits 21-27 set bit4=1 and bit5=1
+        // in the key, but vs128 uses bits 6-10 for low 5 bits).
+        // Actually: vs128 uses decode bits 6-10 (host 25-21) and bits 21,22 (host 10,9).
+        // encode_stvewx128 sets vs_lo in bits 25-21 and key bits at bits 10-4.
+        // vs128 = bits6-10 | (bit21<<5) | (bit22<<6) = vs_lo | 0 | 0 = vs_lo.
+        // So vs128 = vs_lo. We'll use vs_lo=3 → vs128=3.
+        let raw = encode_stvewx128(3, 1, 2);
+        ctx.vr[3] = xenia_types::Vec128::from_u32x4(0x1111_1111, 0x2222_2222, 0x3333_3333, 0x4444_4444);
+        ctx.gpr[1] = 0x1000; // base
+        ctx.gpr[2] = 0x008;  // offset → EA = 0x1008 → word-aligned EA = 0x1008, slot = (0x8 & 0xF)>>2 = 2
+        write_instr(&mem, 0, raw);
+        ctx.pc = 0;
+        step(&mut ctx, &mem);
+        assert_eq!(ctx.pc, 4, "PC must advance");
+        // Slot 2 → lane 2 = 0x3333_3333
+        assert_eq!(mem.read_u32(0x1008), 0x3333_3333, "only lane 2 word at ea");
+        // Adjacent words must be untouched (mem is zero-init)
+        assert_eq!(mem.read_u32(0x1000), 0x0000_0000, "byte below must be untouched");
+        assert_eq!(mem.read_u32(0x100C), 0x0000_0000, "byte above must be untouched");
+    }
 }

From 52ece4bd868ce938075930c56c0978fbd8b287ee Mon Sep 17 00:00:00 2001
From: MechaCat02 <fabian@diekaulbachs.de>
Date: Sat, 2 May 2026 10:33:24 +0200
Subject: [PATCH 2/6] fix(cpu): PPCBUG-424+425 vmaddfp128/vmaddcfp128 operand
 swap + va128 field fix
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PPCBUG-424: vmaddfp128 computed VA×VB+VD instead of ISA-mandated VA×VD+VB.
PPCBUG-425: vmaddcfp128 computed VD×VB+VA instead of ISA-mandated VA×VD+VB.

Root-cause discovered while writing the operand-order regression tests:
va128() was extracting PPC bits 6-10 (the same field as vd128's low 5 bits),
not PPC bits 11-15 where VA lives in VX128 form. This meant va128() silently
aliased vd128 for any instruction where VA != VD, making the operand swap
invisible in the existing denorm-flush test (which used VA == VD == v2).

Fixes in this commit:
- decoder.rs: va128() now extracts PPC bits 11-15 (host bits 20-16) + bit29.
  The vmx128_va128_uses_bit29 test encoding updated to match the correct field.
- interpreter.rs: vmaddfp128 changed from ai.mul_add(bi,di) to ai.mul_add(di,bi)
  (VA×VD+VB). vmaddcfp128 changed from di.mul_add(bi,ai) to ai.mul_add(di,bi).
  vmaddfp128_flushes_denormal_inputs redesigned with distinct VA/VD/VB registers
  (v1/v2/v3) so the flush test is independent of the accessor fix.
  New vmaddfp128_operand_order_va_times_vd_plus_vb and
  vmaddcfp128_operand_order_va_times_vd_plus_vb tests verify 2×3+10=16.
- disasm_goldens.rs + vmx128_registers.json: vmaddfp128/vmaddcfp128/vnmsubfp128
  golden raws updated to properly encode VA at PPC bits 11-15 (new raws:
  0x146328D4 / 0x14632914 / 0x14632954). vperm128 / vsrw128 golden operands
  updated to reflect correct VA extraction (v4 instead of v3/v0).

Affects all VMX128 binary ops that call va128(): vaddfp128, vsubfp128,
vmulfp128, vmaddfp128, vmaddcfp128, vnmsubfp128, vperm128, vsrw128 etc.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 crates/xenia-cpu/src/decoder.rs               |  8 +-
 crates/xenia-cpu/src/interpreter.rs           | 79 +++++++++++++------
 crates/xenia-cpu/tests/disasm_goldens.rs      | 12 +--
 .../tests/golden/vmx128_registers.json        | 12 +--
 4 files changed, 71 insertions(+), 40 deletions(-)

diff --git a/crates/xenia-cpu/src/decoder.rs b/crates/xenia-cpu/src/decoder.rs
index ef28270..d4d8ac1 100644
--- a/crates/xenia-cpu/src/decoder.rs
+++ b/crates/xenia-cpu/src/decoder.rs
@@ -135,9 +135,9 @@ impl DecodedInstr {
 
     // VMX128 field extractors
 
-    /// VA128 (bits 6-10, plus bit from 29)
+    /// VA128 (bits 11-15, plus bit from 29)
     #[inline] pub fn va128(&self) -> usize {
-        (extract_bits(self.raw, 6, 10) | (extract_bits(self.raw, 29, 29) << 5)) as usize
+        (extract_bits(self.raw, 11, 15) | (extract_bits(self.raw, 29, 29) << 5)) as usize
     }
 
     /// VB128 (bits 16-20, plus bits from 28, 30)
@@ -1105,8 +1105,8 @@ mod tests {
 
     #[test]
     fn vmx128_va128_uses_bit29() {
-        // va128 = bits 6-10 + bit 29. va_lo = 7, bit 29 = 1 → va128 = 7 | 32 = 39.
-        let raw = (7u32 << (31 - 10)) | (1u32 << (31 - 29));
+        // va128 = bits 11-15 + bit 29. va_lo = 7, bit 29 = 1 → va128 = 7 | 32 = 39.
+        let raw = (7u32 << (31 - 15)) | (1u32 << (31 - 29));
         let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 };
         assert_eq!(d.va128(), 39);
     }
diff --git a/crates/xenia-cpu/src/interpreter.rs b/crates/xenia-cpu/src/interpreter.rs
index a205dcf..eff89de 100644
--- a/crates/xenia-cpu/src/interpreter.rs
+++ b/crates/xenia-cpu/src/interpreter.rs
@@ -1919,11 +1919,10 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
             ctx.pc += 4;
         }
         PpcOpcode::vmaddfp128 => {
-            // VMX128 form: vD <- (vA * vB) + vD (vD reused as accumulator;
-            // Canary `InstrEmit_vmaddfp128` routes guest VA/VB/VD through
-            // `InstrEmit_vmaddfp_` with arg order swapped so the resulting
-            // HIR computation is `VA * VB + VD`). Same unconditional denorm
-            // flush of all three inputs as scalar `vmaddfp`.
+            // ISA: (VD) <- (VA × VD) + VB. VD is both the second multiplicand and destination.
+            // Canary InstrEmit_vmaddfp128 (ppc_emit_altivec.cc:806-809): MulAdd(VA, VD, VB).
+            // Previous code computed ai.mul_add(bi, di) = VA×VB+VD — VB and VD roles swapped
+            // (PPCBUG-424). Fix: ai.mul_add(di, bi) = VA×VD+VB.
             let a = ctx.vr[instr.va128()].as_f32x4();
             let b = ctx.vr[instr.vb128()].as_f32x4();
             let d = ctx.vr[instr.vd128()].as_f32x4();
@@ -1932,7 +1931,7 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
                 let ai = vmx::flush_denorm(a[i]);
                 let bi = vmx::flush_denorm(b[i]);
                 let di = vmx::flush_denorm(d[i]);
-                r[i] = ai.mul_add(bi, di);
+                r[i] = ai.mul_add(di, bi);
             }
             ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r);
             ctx.pc += 4;
@@ -4297,11 +4296,11 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
         // ═════════════════════════════════════════════════════════════════
         // §4j — VMX128 FMA / permute
         // ═════════════════════════════════════════════════════════════════
-        // vmaddcfp128: vD = vD * vB + vA (using vD's current value as accumulator)
+        // vmaddcfp128: ISA (VD) <- (VA × VD) + VB — same operation as vmaddfp128
         PpcOpcode::vmaddcfp128 => {
-            // Xbox-360-specific: vD = (vD * vB) + vA. Note the VD-reuse: VD is both
-            // a source operand (as multiplicand) and the destination. Canary &
-            // POWER8 hardware confirm denormal inputs are flushed regardless of NJ.
+            // ISA: (VD) <- (VA × VD) + VB. Canary InstrEmit_vmaddcfp128 (cc:819): MulAdd(VA, VD, VB).
+            // Previous code computed di.mul_add(bi, ai) = VD×VB+VA — both operands wrong
+            // (PPCBUG-425). Fix: ai.mul_add(di, bi) = VA×VD+VB.
             let a = ctx.vr[instr.va128()].as_f32x4();
             let b = ctx.vr[instr.vb128()].as_f32x4();
             let d = ctx.vr[instr.vd128()].as_f32x4();
@@ -4310,7 +4309,7 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
                 let ai = vmx::flush_denorm(a[i]);
                 let bi = vmx::flush_denorm(b[i]);
                 let di = vmx::flush_denorm(d[i]);
-                r[i] = di.mul_add(bi, ai);
+                r[i] = ai.mul_add(di, bi);
             }
             ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r);
             ctx.pc += 4;
@@ -5324,32 +5323,64 @@ mod tests {
     }
 
     /// VMX128 variant `vmaddfp128 vD, vA, vB` (primary op 5, key2 = 0b001101)
-    /// reuses vD as the accumulator: `vD <- (vA * vB) + vD`. Canary
+    /// reuses vD as the accumulator: `vD <- (vA × vD) + vB`. Canary
     /// `ppc_emit_altivec.cc:786-810` flushes *all three* inputs
-    /// unconditionally before the fused multiply-add — the 128-bit form
-    /// must match the scalar `vmaddfp` behaviour. Prior to this fix the
-    /// interpreter skipped the flush, leaving subnormal noise in math-
-    /// heavy game code.
+    /// unconditionally before the fused multiply-add.
     #[test]
     fn vmaddfp128_flushes_denormal_inputs() {
         let mut ctx = PpcContext::new();
         let mut mem = TestMem::new();
         let denorm = f32::from_bits(1);
-        // vA=v2 carries denorms, which is also vD's accumulator input.
+        // VA=v1, VD=v2, VB=v3 — all carry denormals.
+        ctx.vr[1] = xenia_types::Vec128::from_f32x4_array([denorm; 4]);
         ctx.vr[2] = xenia_types::Vec128::from_f32x4_array([denorm; 4]);
-        // vB=v3 = 1.0 — denormal input survives only if not flushed.
-        ctx.vr[3] = xenia_types::Vec128::from_f32x4_array([1.0f32; 4]);
-        // vmaddfp128 vD=v2, vA=v2, vB=v3: low 5 bits 00010 shared
-        // between vA and vD, vB=3 at PPC bits 16-20, key2=0b001101.
-        let raw: u32 = 0x1440_18D0;
+        ctx.vr[3] = xenia_types::Vec128::from_f32x4_array([denorm; 4]);
+        // vmaddfp128 vD=v2, vA=v1, vB=v3: op6=5, vd_lo=2, va_lo=1, vb_lo=3, key2=0b001101.
+        // VA×VD+VB: all three flushed → 0*0+0 = 0.
+        let raw: u32 = (5u32 << 26) | (2 << 21) | (1 << 16) | (3 << 11) | (3 << 6) | (1 << 4);
         write_instr(&mut mem, 0, raw);
         ctx.pc = 0;
         step(&mut ctx, &mut mem);
-        // Without flush: denorm*1.0 + denorm = 2*denorm ≠ 0.
-        // With flush:    0*0 + 0 = 0.
         assert_eq!(ctx.vr[2].as_f32x4(), [0.0f32; 4]);
     }
 
+    // ---- PPCBUG-424+425: vmaddfp128/vmaddcfp128 operand swap ----
+    // ISA for both: (VD) <- (VA × VD) + VB. Previous code computed VA×VB+VD and VD×VB+VA.
+    // Test uses distinct VA, VB, VD registers so the swap is visible.
+    // Encoding: op6=5, key2=0b001101 (vmaddfp128) / 0b010001 (vmaddcfp128).
+    // VA=v1=[2.0], VB=v2=[10.0], VD=v3=[3.0] → expected 2.0×3.0+10.0 = 16.0.
+    // Buggy vmaddfp128: 2.0×10.0+3.0 = 23.0. Buggy vmaddcfp128: 3.0×10.0+2.0 = 32.0.
+
+    #[test]
+    fn vmaddfp128_operand_order_va_times_vd_plus_vb() {
+        let mut ctx = PpcContext::new();
+        let mut mem = TestMem::new();
+        ctx.vr[1] = xenia_types::Vec128::from_f32x4_array([2.0f32; 4]);  // VA=v1
+        ctx.vr[2] = xenia_types::Vec128::from_f32x4_array([10.0f32; 4]); // VB=v2
+        ctx.vr[3] = xenia_types::Vec128::from_f32x4_array([3.0f32; 4]);  // VD=v3 (also destination)
+        // vmaddfp128 vD=v3, vA=v1, vB=v2 — op5, key2=0b001101 (bits22-25=3, bit27=1)
+        let raw: u32 = (5u32 << 26) | (3 << 21) | (1 << 16) | (2 << 11) | (3 << 6) | (1 << 4);
+        write_instr(&mut mem, 0, raw);
+        ctx.pc = 0;
+        step(&mut ctx, &mut mem);
+        assert_eq!(ctx.vr[3].as_f32x4(), [16.0f32; 4], "VA×VD+VB = 2*3+10 = 16");
+    }
+
+    #[test]
+    fn vmaddcfp128_operand_order_va_times_vd_plus_vb() {
+        let mut ctx = PpcContext::new();
+        let mut mem = TestMem::new();
+        ctx.vr[1] = xenia_types::Vec128::from_f32x4_array([2.0f32; 4]);  // VA=v1
+        ctx.vr[2] = xenia_types::Vec128::from_f32x4_array([10.0f32; 4]); // VB=v2
+        ctx.vr[3] = xenia_types::Vec128::from_f32x4_array([3.0f32; 4]);  // VD=v3
+        // vmaddcfp128 vD=v3, vA=v1, vB=v2 — op5, key2=0b010001 (bits22-25=4, bit27=1)
+        let raw: u32 = (5u32 << 26) | (3 << 21) | (1 << 16) | (2 << 11) | (4 << 6) | (1 << 4);
+        write_instr(&mut mem, 0, raw);
+        ctx.pc = 0;
+        step(&mut ctx, &mut mem);
+        assert_eq!(ctx.vr[3].as_f32x4(), [16.0f32; 4], "VA×VD+VB = 2*3+10 = 16");
+    }
+
     /// VMX128 `vnmsubfp128 vD, vA, vB` (key2 = 0b010101). Canary
     /// `ppc_emit_altivec.cc:1133-1160` flushes all three inputs in the
     /// helper. Semantics: `vD <- -((vA * vB) - vD) = vD - vA*vB`.
diff --git a/crates/xenia-cpu/tests/disasm_goldens.rs b/crates/xenia-cpu/tests/disasm_goldens.rs
index 97b3825..2c301a5 100644
--- a/crates/xenia-cpu/tests/disasm_goldens.rs
+++ b/crates/xenia-cpu/tests/disasm_goldens.rs
@@ -514,12 +514,12 @@ fn vmx128_registers() {
     //     vmaddcfp128  VD, VA, VD, VB  → "v3, v35, v3, v5"
     //     vnmsubfp128  VD, VA, VD, VB  → "v3, v35, v3, v5"
     let vmx128_4op = [
-        // vmaddfp128: bits 24=1, 25=1, 27=1, bit 29=1 (VA high), VB=5
-        (0x146028D4u32, 0x82000000, "vmaddfp128 v3, v35, v5, v3"),
-        // vmaddcfp128: bits 23=1, 27=1, bit 29=1, VB=5
-        (0x14602914u32, 0x82000000, "vmaddcfp128 v3, v35, v3, v5"),
-        // vnmsubfp128: bits 23=1, 25=1, 27=1, bit 29=1, VB=5
-        (0x14602954u32, 0x82000000, "vnmsubfp128 v3, v35, v3, v5"),
+        // vmaddfp128: vd=3(bits 6-10), va=35(bits 11-15=3 + bit29=1), vb=5(bits 16-20), key2=0b001101
+        (0x146328D4u32, 0x82000000, "vmaddfp128 v3, v35, v5, v3"),
+        // vmaddcfp128: same vd/va/vb layout, key2=0b010001
+        (0x14632914u32, 0x82000000, "vmaddcfp128 v3, v35, v3, v5"),
+        // vnmsubfp128: same vd/va/vb layout, key2=0b010101
+        (0x14632954u32, 0x82000000, "vnmsubfp128 v3, v35, v3, v5"),
     ];
 
     let mut all = Vec::new();
diff --git a/crates/xenia-cpu/tests/golden/vmx128_registers.json b/crates/xenia-cpu/tests/golden/vmx128_registers.json
index 1d072cd..6d4f8b0 100644
--- a/crates/xenia-cpu/tests/golden/vmx128_registers.json
+++ b/crates/xenia-cpu/tests/golden/vmx128_registers.json
@@ -75,21 +75,21 @@
       "raw": "0x14642801",
       "addr": "0x82000000",
       "mnemonic": "vperm128",
-      "operands": "v3, v3, v5, 0"
+      "operands": "v3, v4, v5, 0"
     },
     {
       "label": "encoding vd_hi=00: actually vsrw128",
       "raw": "0x180461D0",
       "addr": "0x82000000",
       "mnemonic": "vsrw128",
-      "operands": "v0, v0, v12"
+      "operands": "v0, v4, v12"
     },
     {
       "label": "encoding vd_hi=10: actually vsrw128 v32",
       "raw": "0x180465D0",
       "addr": "0x82000000",
       "mnemonic": "vsrw128",
-      "operands": "v32, v0, v12"
+      "operands": "v32, v4, v12"
     },
     {
       "label": "encoding vd_hi=01: actually vpermwi128",
@@ -114,21 +114,21 @@
     },
     {
       "label": "vmaddfp128 v3, v35, v5, v3",
-      "raw": "0x146028D4",
+      "raw": "0x146328D4",
       "addr": "0x82000000",
       "mnemonic": "vmaddfp128",
       "operands": "v3, v35, v5, v3"
     },
     {
       "label": "vmaddcfp128 v3, v35, v3, v5",
-      "raw": "0x14602914",
+      "raw": "0x14632914",
       "addr": "0x82000000",
       "mnemonic": "vmaddcfp128",
       "operands": "v3, v35, v3, v5"
     },
     {
       "label": "vnmsubfp128 v3, v35, v3, v5",
-      "raw": "0x14602954",
+      "raw": "0x14632954",
       "addr": "0x82000000",
       "mnemonic": "vnmsubfp128",
       "operands": "v3, v35, v3, v5"

From 3d8e2ced2e2ffd96401c071f7534d2689f6cf6e7 Mon Sep 17 00:00:00 2001
From: MechaCat02 <fabian@diekaulbachs.de>
Date: Sat, 2 May 2026 10:38:18 +0200
Subject: [PATCH 3/6] fix(cpu): PPCBUG-053+054 32-bit CTR semantics in
 bcx/bclrx + mtspr CTR
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PPCBUG-053: bcx and bclrx tested `ctx.ctr != 0` against the full 64-bit
register, but the Xbox 360 ABI runs CTR as a 32-bit counter (canary
explicitly truncates: `f.Truncate(ctr, INT32_TYPE)`). When upstream 64-bit
GPR pollution flowed through `mtspr CTR, rN`, the upper 32 bits stayed
non-zero forever; bdnz then looped past the intended 32-bit zero point
because the 64-bit comparison still saw the high bits.

PPCBUG-054: `mtspr CTR` writeback wrote the full 64-bit GPR value,
acting as a firewall gap that fed PPCBUG-053. Defensive truncation
prevents CTR from ever acquiring non-zero upper 32 bits independently
of the GPR-pollution source.

Fixes:
- interpreter.rs:849, 879: ctr_ok now uses `(ctx.ctr as u32) != 0`
- interpreter.rs:1523: mtspr CTR writes `val as u32 as u64`

Tests:
- bcx_bdnz_uses_32bit_ctr_compare: bdnz with CTR=0x0000_0001_0000_0001
  decrements to 0x0000_0001_0000_0000 and exits (low 32 bits = 0).
- bclrx_uses_32bit_ctr_compare: same coverage for bdnzlr.
- mtspr_ctr_truncates_to_32_bits: gpr=0xFFFF_FFFF_8000_0001 → ctr=0x8000_0001.

Coupled fix per the audit: PPCBUG-053 and PPCBUG-054 land together because
each is necessary but neither is sufficient on its own — the truncation
stops mtspr (and therefore mtctr) from introducing new pollution, and the
32-bit compare protects against pollution that reaches CTR by any route
other than mtspr (e.g. a context whose CTR field was written directly, as
the new bcx/bclrx tests do).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 crates/xenia-cpu/src/interpreter.rs | 59 +++++++++++++++++++++++++++--
 1 file changed, 56 insertions(+), 3 deletions(-)

diff --git a/crates/xenia-cpu/src/interpreter.rs b/crates/xenia-cpu/src/interpreter.rs
index eff89de..09800b8 100644
--- a/crates/xenia-cpu/src/interpreter.rs
+++ b/crates/xenia-cpu/src/interpreter.rs
@@ -846,7 +846,7 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
             }
 
             let ctr_ok = (bo & 0b00100) != 0
-                || ((ctx.ctr != 0) ^ ((bo & 0b00010) != 0));
+                || (((ctx.ctr as u32) != 0) ^ ((bo & 0b00010) != 0));
             let cond_ok = (bo & 0b10000) != 0
                 || (ctx.get_cr_bit(bi) == ((bo & 0b01000) != 0));
 
@@ -876,7 +876,7 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
             }
 
             let ctr_ok = (bo & 0b00100) != 0
-                || ((ctx.ctr != 0) ^ ((bo & 0b00010) != 0));
+                || (((ctx.ctr as u32) != 0) ^ ((bo & 0b00010) != 0));
             let cond_ok = (bo & 0b10000) != 0
                 || (ctx.get_cr_bit(bi) == ((bo & 0b01000) != 0));
 
@@ -1520,7 +1520,7 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
             match spr {
                 crate::context::spr::XER => ctx.set_xer(val as u32),
                 crate::context::spr::LR => ctx.lr = val,
-                crate::context::spr::CTR => ctx.ctr = val,
+                crate::context::spr::CTR => ctx.ctr = val as u32 as u64,
                 crate::context::spr::DEC => ctx.dec = val as u32,
                 crate::context::spr::TBL_WRITE => {
                     ctx.timebase = (ctx.timebase & 0xFFFF_FFFF_0000_0000) | (val & 0xFFFF_FFFF);
@@ -5834,6 +5834,59 @@ mod tests {
         assert_eq!(ctx.timebase >> 32, 0xAAAA_BBBB);
     }
 
+    // PPCBUG-053: bcx CTR zero-test must use 32-bit comparison. When prior
+    // 64-bit pollution (e.g. via negx → mtctr) leaves CTR upper 32 bits
+    // non-zero, the 64-bit `ctx.ctr != 0` would loop forever even when the
+    // 32-bit counter has decremented to zero.
+    #[test]
+    fn bcx_bdnz_uses_32bit_ctr_compare() {
+        let mut ctx = PpcContext::new();
+        let mut mem = TestMem::new();
+        ctx.ctr = 0x0000_0001_0000_0001;
+        // bdnz +8: BO=16 (decrement, branch if CTR!=0, ignore CR), BI=0, BD/4=2
+        let raw = (16u32 << 26) | (16 << 21) | (0 << 16) | (2 << 2);
+        write_instr(&mut mem, 0, raw);
+        ctx.pc = 0;
+        step(&mut ctx, &mut mem);
+        // After decrement: low 32 = 0, high 32 = 1. 32-bit test says zero → no branch.
+        assert_eq!(ctx.ctr, 0x0000_0001_0000_0000);
+        assert_eq!(ctx.pc, 4);
+    }
+
+    #[test]
+    fn bclrx_uses_32bit_ctr_compare() {
+        let mut ctx = PpcContext::new();
+        let mut mem = TestMem::new();
+        ctx.ctr = 0x0000_0001_0000_0001;
+        ctx.lr = 0x100;
+        // bdnzlr: opcode 19, BO=16 (decrement, branch if CTR!=0), BI=0, XO=16
+        let raw = (19u32 << 26) | (16 << 21) | (0 << 16) | (16 << 1);
+        write_instr(&mut mem, 0, raw);
+        ctx.pc = 0;
+        step(&mut ctx, &mut mem);
+        // 32-bit CTR=0 after decrement → don't branch to LR.
+        assert_eq!(ctx.ctr, 0x0000_0001_0000_0000);
+        assert_eq!(ctx.pc, 4);
+    }
+
+    // PPCBUG-054: mtspr CTR must truncate the source GPR to 32 bits, matching
+    // canary's `f.Truncate(ctr, INT32_TYPE)`. Prevents upstream 64-bit GPR
+    // pollution from poisoning the 32-bit CTR counter independently of the
+    // bcx zero-test fix.
+    #[test]
+    fn mtspr_ctr_truncates_to_32_bits() {
+        let mut ctx = PpcContext::new();
+        let mut mem = TestMem::new();
+        ctx.gpr[3] = 0xFFFF_FFFF_8000_0001;
+        // mtspr CTR (9), r3
+        let spr_swapped = ((9u32 & 0x1F) << 5) | ((9u32 >> 5) & 0x1F);
+        let raw = (31u32 << 26) | (3 << 21) | (spr_swapped << 11) | (467 << 1);
+        write_instr(&mut mem, 0, raw);
+        ctx.pc = 0;
+        step(&mut ctx, &mut mem);
+        assert_eq!(ctx.ctr, 0x8000_0001);
+    }
+
     // ---------- Block-cache parity tests ----------
     //
     // These confirm that running a program through the basic-block

From d4f6ea787b64125a430be25fe3c0212dec76c4a7 Mon Sep 17 00:00:00 2001
From: MechaCat02 <fabian@diekaulbachs.de>
Date: Sat, 2 May 2026 10:40:45 +0200
Subject: [PATCH 4/6] fix(disasm): PPCBUG-640+650 fmt_bc spurious condition
 suffix on bdnz/bdz
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PPCBUG-640: For BO=16 (bdnz: decrement CTR, branch if non-zero, ignore CR)
and BO=18 (bdz: same with branch-if-zero), `fmt_bc` fell through to the
`if decr` block and computed `cond_name_opt` from the don't-care BI=0 /
cond_true=false pair, yielding `Some("ge")`. The output was therefore
`bdnzge` / `bdzge` — a CTR-only branch with a spurious CR-derived suffix.

PPCBUG-650 (companion): the golden fixture pinned the wrong output, so
the regression had no detection signal until now.

`fmt_bclr` already had the correct `if decr && uncond` guard at line 872
producing `bdnzlr` / `bdzlr`. `fmt_bc` lacked the equivalent.

Fix: gate the condition string on `!uncond` inside the `if decr` block.
For BO=16/18 (uncond bit set), the condition suffix is now empty.

Tests: extended_mnemonics.json fixture rows for bdnz/bdz now expect the
correct `ext_mnemonic: "bdnz"` / `"bdz"`.

Impact: every analysis-DB query for `bdnz` loops (common in pixel-shader
and vertex processing) returned zero rows, because the matching
instructions were stored as `bdnzge`.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 crates/xenia-cpu/src/disasm.rs                        | 5 ++++-
 crates/xenia-cpu/tests/golden/extended_mnemonics.json | 4 ++--
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/crates/xenia-cpu/src/disasm.rs b/crates/xenia-cpu/src/disasm.rs
index 6edbce1..5a98981 100644
--- a/crates/xenia-cpu/src/disasm.rs
+++ b/crates/xenia-cpu/src/disasm.rs
@@ -828,7 +828,10 @@ fn fmt_bc(instr: &DecodedInstr) -> DisasmText {
 
         if decr {
             let z = if bo & 0x02 != 0 { "z" } else { "nz" };
-            let cond_str = cond_name_opt.unwrap_or("");
+            // BO bit 4 (uncond) means CR is ignored — pure CTR-decrement branch.
+            // Without this guard, bdnz/bdz would emit a spurious `ge` suffix derived
+            // from the don't-care BI=0 / cond_true=false pair (PPCBUG-640).
+            let cond_str = if uncond { "" } else { cond_name_opt.unwrap_or("") };
             let ext_mnem = format!("bd{z}{cond_str}{a}{l}");
             let ext_ops = format!("{cr}0x{target:08X}");
             with_ext(&base_mnem, base_ops, 8, &ext_mnem, ext_ops, 8)
diff --git a/crates/xenia-cpu/tests/golden/extended_mnemonics.json b/crates/xenia-cpu/tests/golden/extended_mnemonics.json
index efb251b..6749f94 100644
--- a/crates/xenia-cpu/tests/golden/extended_mnemonics.json
+++ b/crates/xenia-cpu/tests/golden/extended_mnemonics.json
@@ -366,7 +366,7 @@
       "addr": "0x82000000",
       "mnemonic": "bc",
       "operands": "16, lt, 0x82000040",
-      "ext_mnemonic": "bdnzge",
+      "ext_mnemonic": "bdnz",
       "ext_operands": "0x82000040",
       "branch_target": "0x82000040"
     },
@@ -376,7 +376,7 @@
       "addr": "0x82000000",
       "mnemonic": "bc",
       "operands": "18, lt, 0x82000040",
-      "ext_mnemonic": "bdzge",
+      "ext_mnemonic": "bdz",
       "ext_operands": "0x82000040",
       "branch_target": "0x82000040"
     },

From 2be25bdd415ffa8b12d3cd13798599971e660bef Mon Sep 17 00:00:00 2001
From: MechaCat02 <fabian@diekaulbachs.de>
Date: Sat, 2 May 2026 10:43:24 +0200
Subject: [PATCH 5/6] fix(disasm): PPCBUG-641+649 sync/lwsync L-field
 discrimination
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PPCBUG-641: PpcOpcode::sync emitted "sync" regardless of the L-field at
PPC bit 10. The Xbox 360 acquire barrier (encoding 0x7C2004AC, L=1) is
lwsync, used in every spinlock. The disassembly DB stored every lwsync
as `mnemonic='sync'`, so `SELECT WHERE mnemonic='lwsync'` returned zero
rows regardless of binary content.

PPCBUG-649 (companion): the golden fixture for lwsync had no ext_mnemonic
field, pinning the wrong output and defeating regression detection.

Fix: in disasm.rs, gate on `(instr.raw >> 21) & 1` (PPC bit 10) — when
set, emit the lwsync extended form. Update extended_mnemonics.json
fixture to expect `ext_mnemonic: "lwsync"`.

Note: this is the disassembler-side fix only. The interpreter-side
PPCBUG-088 (lwsync vs sync semantics) is separate.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 crates/xenia-cpu/src/disasm.rs                        | 10 +++++++++-
 crates/xenia-cpu/tests/golden/extended_mnemonics.json |  4 +++-
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/crates/xenia-cpu/src/disasm.rs b/crates/xenia-cpu/src/disasm.rs
index 5a98981..352219c 100644
--- a/crates/xenia-cpu/src/disasm.rs
+++ b/crates/xenia-cpu/src/disasm.rs
@@ -361,7 +361,15 @@ pub fn format(instr: &DecodedInstr) -> DisasmText {
         PpcOpcode::dcbz  => fmt_cache(instr, "dcbz"),
         PpcOpcode::dcbz128 => fmt_cache(instr, "dcbz128"),
         PpcOpcode::icbi  => fmt_cache(instr, "icbi"),
-        PpcOpcode::sync  => base("sync", String::new(), 0),
+        PpcOpcode::sync  => {
+            // L-field at PPC bit 10 (host bit 21) selects lwsync (L=1), the
+            // acquire barrier in every Xbox 360 spinlock. PPCBUG-641.
+            if (instr.raw >> 21) & 1 == 1 {
+                with_ext("sync", String::new(), 0, "lwsync", String::new(), 0)
+            } else {
+                base("sync", String::new(), 0)
+            }
+        }
         PpcOpcode::eieio => base("eieio", String::new(), 0),
         PpcOpcode::isync => base("isync", String::new(), 0),
 
diff --git a/crates/xenia-cpu/tests/golden/extended_mnemonics.json b/crates/xenia-cpu/tests/golden/extended_mnemonics.json
index 6749f94..0c442dd 100644
--- a/crates/xenia-cpu/tests/golden/extended_mnemonics.json
+++ b/crates/xenia-cpu/tests/golden/extended_mnemonics.json
@@ -525,7 +525,9 @@
       "raw": "0x7C2004AC",
       "addr": "0x82000000",
       "mnemonic": "sync",
-      "operands": ""
+      "operands": "",
+      "ext_mnemonic": "lwsync",
+      "ext_operands": ""
     },
     {
       "label": "trap",

From 7609dcd406b17bbf3ceaab8a21f35f3a9b8c0d36 Mon Sep 17 00:00:00 2001
From: MechaCat02 <fabian@diekaulbachs.de>
Date: Sat, 2 May 2026 11:22:20 +0200
Subject: [PATCH 6/6] fix(cpu): PPCBUG-700 VMX128 register accessors match
 canary bitfield layout
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Independent review of P3 batch 2 (52ece4b) found that all three VMX128
register accessors, plus the VX128_R Rc-bit accessor, disagreed with
canary's FormatVX128/VX128_R bitfield struct
(`xenia-canary/src/xenia/cpu/ppc/ppc_decode_data.h:484-663`). The audit
(audit-findings.md, line 2958) had marked these "confirmed-clean" but had
miscounted LSB-first bitfield offsets.

Canary's actual layout (LSB-first, GCC/Clang/MSVC on x86):
  VA128 = VA128l(5) | VA128h(1)<<5 | VA128H(1)<<6
        = PPC[11:15] | PPC[26]<<5 | PPC[21]<<6  (7-bit selector, 3 fields)
  VB128 = VB128l(5) | VB128h(2)<<5
        = PPC[16:20] | PPC[30:31]<<5            (7-bit selector, 2 fields)
  VD128 = VD128l(5) | VD128h(2)<<5
        = PPC[6:10]  | PPC[28:29]<<5            (7-bit selector, 2 fields)
  VX128_R Rc = PPC[25]  (host bit 6)             not PPC[27] as prior fix had

The buggy convention was internally consistent with hand-crafted test
fixtures (which set bits 29/21/22 to encode the high registers, matching
the buggy accessor). Real Xbox 360 game code follows canary's convention,
so any production VMX128 instruction with VR >= 32 was silently mis-decoded
— but no unit test exercised that path until the va128 fix in 52ece4b
exposed the inconsistency.

Changes:
- decoder.rs: rewrite va128/vb128/vd128/vx128r_rc_bit to canary positions.
  Drop the speculative `key4_dt` dot-form dispatch in decode_op6 — canary
  has no separate dot-form opcodes for VX128_R compute ops; Rc is a
  runtime modifier read by the interpreter via vx128r_rc_bit().
- decoder.rs tests: rewrite vmx128_test_word helper for canary layout;
  rename/re-encode vmx128_vd128_*, vmx128_va128_*, vmx128_vb128_* tests.
- interpreter.rs: update encode_vpkd3d128 test helper to encode VD via
  canary's VD128h field; tests now pass vd=96 explicitly.
- tests/disasm_goldens.rs: replace the vrlimi128/vsrw128/vpermwi128/
  vperm128 hand-encoded raws with canary-compliant encodings; introduce
  a shared `encode_vx128` helper.
- tests/golden/vmx128_registers.json: re-encode 9 entries (vperm128,
  vsrw128 ×2, vpermwi128, vrlimi128 ×2, vmaddfp128, vmaddcfp128,
  vnmsubfp128) to canary-compliant raws preserving the same expected
  operand strings.
- audit-findings.md: new PPCBUG-700 entry documenting the discovery and
  invalidating the audit's "confirmed-clean" assessment.

Affects every VMX128 op that decodes VA128/VB128/VD128 (vaddfp128,
vsubfp128, vmulfp128, vand128, vor128, vxor128, vnor128, vandc128,
vsel128, vslo128, vsro128, vperm128, vsrw128, vmaddfp128, vmaddcfp128,
vnmsubfp128, vpkd3d128, vpkshss128, vpkshus128, vpkswss128, vpkswus128,
vpkuhum128, vpkuhus128, vpkuwum128, vpkuwus128, vmsum3fp128, vmsum4fp128,
vrlimi128, vpermwi128 — 30+ opcodes), plus the Rc (dot) variants of the
VX128_R compares.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 audit-findings.md                             |  35 +++++
 crates/xenia-cpu/src/decoder.rs               | 123 +++++++--------
 crates/xenia-cpu/src/interpreter.rs           |  32 ++--
 crates/xenia-cpu/tests/disasm_goldens.rs      | 140 +++++++++---------
 .../tests/golden/vmx128_registers.json        |  34 ++---
 5 files changed, 207 insertions(+), 157 deletions(-)

diff --git a/audit-findings.md b/audit-findings.md
index 26cbb97..5bb09cf 100644
--- a/audit-findings.md
+++ b/audit-findings.md
@@ -3414,3 +3414,38 @@ has the wrong extraction. The disassembler was written independently and got the
   degenerate cases.
 
 IDs PPCBUG-655 through PPCBUG-679 are unallocated — no further bugs found in Phase C3.
+
+---
+
+## Phase C4 — Post-merge audit corrections (2026-05-02)
+
+### PPCBUG-700 — VMX128 register accessors disagreed with canary's bitfield layout (HIGH)
+
+- **Severity**: HIGH (silent mis-decoding of any VMX128 instruction with a register >= 32)
+- **Status**: applied
+- **Locations**: `decoder.rs:138-160` (`va128`/`vb128`/`vd128`), `decoder.rs:80` (`vx128r_rc_bit`)
+- **Discovery**: independent reviewer of the P3 phase merge, comparing our rust accessors
+  against canary's `FormatVX128`/`VX128_2`/`VX128_4`/`VX128_5`/`VX128_R` bitfield struct
+  in `xenia-canary/src/xenia/cpu/ppc/ppc_decode_data.h:484-663`.
+- **Symptom**: this entry contradicts the audit's own line 2958 ("confirmed-clean")
+  assessment. The previous audit miscounted bit-field offsets — under x86_64 LSB-first
+  C++ bitfield packing, the canary fields land at:
+    - `VA128 = VA128l(5) | VA128h(1)<<5 | VA128H(1)<<6` = PPC[11-15] | PPC[26]<<5 | PPC[21]<<6 (3 fields, 7 bits)
+    - `VB128 = VB128l(5) | VB128h(2)<<5` = PPC[16-20] | PPC[30-31]<<5 (2 fields, 7 bits)
+    - `VD128 = VD128l(5) | VD128h(2)<<5` = PPC[6-10] | PPC[28-29]<<5 (2 fields, 7 bits)
+    - `Rc` (VX128_R only) = PPC[25] (host bit 6) — not PPC[27] as PPCBUG-422/562 prescribed.
+  Rust code instead used va128: PPC[11-15] | PPC[29]<<5 (one bit, wrong position); vb128:
+  PPC[16-20] | PPC[28]<<5 | PPC[30]<<6 (wrong positions); vd128: PPC[6-10] | PPC[21]<<5 |
+  PPC[22]<<6 (wrong positions); vx128r_rc_bit at PPC[27].
+- **Why it lurked**: the buggy convention was internally consistent with hand-crafted
+  test fixtures (which set bit 29 / 21 / 22 to encode "high" registers, matching the
+  buggy accessor). Real Xbox 360 game code follows canary's convention, so any production
+  encoding with VR >= 32 was silently mis-decoded — but no unit test exercised that path.
+- **Fix**: rewrite the four accessors to canary's bit positions; rewrite the
+  `vmx128_test_word` helper and unit tests; re-encode the goldens for vmaddfp128/
+  vmaddcfp128/vnmsubfp128/vperm128/vsrw128/vpermwi128/vrlimi128. Drop the speculative
+  `key4_dt` dot-form dispatch in `decode_op6` (canary has no separate dot-form opcodes
+  for VX128_R compute ops; Rc is a runtime modifier). Update `encode_vpkd3d128` test
+  helper for canary's VD128h placement.
+- **Cross-reference**: invalidates the audit's confirmed-clean note at line 2958.
+  Supersedes the Rc-bit position (PPC[27]) prescribed by PPCBUG-422/562.
diff --git a/crates/xenia-cpu/src/decoder.rs b/crates/xenia-cpu/src/decoder.rs
index d4d8ac1..9790f47 100644
--- a/crates/xenia-cpu/src/decoder.rs
+++ b/crates/xenia-cpu/src/decoder.rs
@@ -77,7 +77,9 @@ impl DecodedInstr {
     /// Rc for VC-form vector compare instructions — PPC bit 21 = host bit 10.
     #[inline] pub fn vc_rc_bit(&self) -> bool { (self.raw >> 10) & 1 != 0 }
-    /// Rc for VX128_R-form vector compare instructions — PPC bit 27 = host bit 4.
-    #[inline] pub fn vx128r_rc_bit(&self) -> bool { (self.raw >> 4) & 1 != 0 }
+    /// Rc for VX128_R-form vector compare instructions — PPC bit 25 = host bit 6,
+    /// per canary's FormatVX128_R bitfield layout (not PPC bit 27 = host bit 4, as
+    /// documented before PPCBUG-700).
+    #[inline] pub fn vx128r_rc_bit(&self) -> bool { (self.raw >> 6) & 1 != 0 }
 
     /// IMM field for VX128_4-form instructions (vrlimi128) — 5-bit blend mask at PPC bits 11-15.
     #[inline] pub fn vx128_4_imm(&self) -> u32 { extract_bits(self.raw, 11, 15) }
@@ -133,25 +135,30 @@ impl DecodedInstr {
     /// crbB (bits 16-20)
     #[inline] pub fn crbb(&self) -> u32 { extract_bits(self.raw, 16, 20) }
 
-    // VMX128 field extractors
+    // VMX128 field extractors — bit positions match canary's
+    // FormatVX128/VX128_2/VX128_4/VX128_5/VX128_R bitfield layout
+    // (xenia-canary `ppc_decode_data.h:484-663`, LSB-first packed). PPCBUG-700.
 
-    /// VA128 (bits 11-15, plus bit from 29)
+    /// VA128 = VA128l(5) | VA128h(1) << 5 | VA128H(1) << 6.
+    /// Canonical 7-bit register selector: PPC 11-15 (low), PPC 26 (mid), PPC 21 (high).
     #[inline] pub fn va128(&self) -> usize {
-        (extract_bits(self.raw, 11, 15) | (extract_bits(self.raw, 29, 29) << 5)) as usize
+        (extract_bits(self.raw, 11, 15)
+            | (extract_bits(self.raw, 26, 26) << 5)
+            | (extract_bits(self.raw, 21, 21) << 6)) as usize
     }
 
-    /// VB128 (bits 16-20, plus bits from 28, 30)
+    /// VB128 = VB128l(5) | VB128h(2) << 5. Canary's VB128h is a 2-bit
+    /// contiguous field at PPC 30-31 (host bits 0-1).
     #[inline] pub fn vb128(&self) -> usize {
         (extract_bits(self.raw, 16, 20)
-            | (extract_bits(self.raw, 28, 28) << 5)
-            | (extract_bits(self.raw, 30, 30) << 6)) as usize
+            | (extract_bits(self.raw, 30, 31) << 5)) as usize
     }
 
-    /// VD128 (bits 6-10, plus bits from 21, 22)
+    /// VD128 = VD128l(5) | VD128h(2) << 5. Canary's VD128h is a 2-bit
+    /// contiguous field at PPC 28-29 (host bits 2-3).
     #[inline] pub fn vd128(&self) -> usize {
         (extract_bits(self.raw, 6, 10)
-            | (extract_bits(self.raw, 21, 21) << 5)
-            | (extract_bits(self.raw, 22, 22) << 6)) as usize
+            | (extract_bits(self.raw, 28, 29) << 5)) as usize
     }
 
     /// VS128 - same encoding as VD128
@@ -664,11 +671,11 @@ fn decode_op6(code: u32) -> PpcOpcode {
         _ => {}
     }
 
-    // VMX128 compare (non-dot and dot forms).
-    // Non-dot: bit 27 = 0.  Dot: bit 27 = 1, but bit 25 must also be 0 to
-    // distinguish from the shift/merge group (which has bit 25 = 1 when bit 27 = 1).
-    // key4_nd uses bits 22-24 + bit 27 (same as original, covers non-dot).
-    // key4_dt uses bits 22-24 + bit 25 + bit 27 (narrower, covers dot-only).
+    // VMX128 compare (VX128_R form). Single dispatch path: bit 27 = 0 always
+    // for these opcodes per canary's table (`ppc_opcode_table_gen.cc:295-305`).
+    // The Rc bit is at PPC 25 (host bit 6) per the FormatVX128_R bitfield —
+    // it's a runtime modifier read by the interpreter, NOT part of the
+    // secondary-opcode discrimination. PPCBUG-700.
     let key4_nd = (extract_bits(code, 22, 24) << 3) | extract_bits(code, 27, 27);
     match key4_nd {
         0b000000 => return PpcOpcode::vcmpeqfp128,
@@ -678,16 +685,6 @@ fn decode_op6(code: u32) -> PpcOpcode {
         0b100000 => return PpcOpcode::vcmpequw128,
         _ => {}
     }
-    // Dot forms: bit 27 = 1, bit 25 = 0  (key = bits22-24 + bit25 + bit27, low 3 bits)
-    let key4_dt = (extract_bits(code, 22, 24) << 2) | (extract_bits(code, 25, 25) << 1) | extract_bits(code, 27, 27);
-    match key4_dt {
-        0b00001 => return PpcOpcode::vcmpeqfp128,   // bits22-24=000, bit25=0, bit27=1
-        0b00101 => return PpcOpcode::vcmpgefp128,   // bits22-24=001, bit25=0, bit27=1
-        0b01001 => return PpcOpcode::vcmpgtfp128,   // bits22-24=010, bit25=0, bit27=1
-        0b01101 => return PpcOpcode::vcmpbfp128,    // bits22-24=011, bit25=0, bit27=1
-        0b10001 => return PpcOpcode::vcmpequw128,   // bits22-24=100, bit25=0, bit27=1
-        _ => {}
-    }
 
     // VMX128 shift/merge
     let key5 = (extract_bits(code, 22, 25) << 2) | extract_bits(code, 27, 27);
@@ -1055,21 +1052,26 @@ mod tests {
     // used those extractors). Each test poke-bits exactly the slots the
     // accessor reads and asserts the assembled register number.
 
-    fn vmx128_test_word(va21: u32, vd6_10: u32, vd21: u32, vd22: u32,
-                       vb16_20: u32, vb28: u32, vb30: u32) -> u32 {
-        // PPC bit i -> LSB position 31-i.
-        (vd6_10 << (31 - 10))
-            | (va21  << (31 - 21))   // va128 high bit at PPC 29 in some forms — kept 0 here
-            | (vd21  << (31 - 21))
-            | (vd22  << (31 - 22))
-            | (vb16_20 << (31 - 20))
-            | (vb28  << (31 - 28))
-            | (vb30  << (31 - 30))
+    /// Build a VMX128 test word for the canary-compliant register layout.
+    /// `vd128 = vd_lo | (vd_hi << 5)` where vd_lo is 5 bits (PPC 6-10) and
+    /// vd_hi is 2 bits (PPC 28-29). Same shape for vb128 (vb_lo at PPC 16-20,
+    /// vb_hi 2 bits at PPC 30-31). va128 = va_lo | (va_h26<<5) | (va_h21<<6)
+    /// per canary's 7-bit VA selector.
+    fn vmx128_test_word(vd_lo: u32, vd_hi: u32, va_lo: u32, va_h26: u32, va_h21: u32,
+                       vb_lo: u32, vb_hi: u32) -> u32 {
+        // PPC bit i -> host bit (31-i).
+        (vd_lo << (31 - 10))           // VD128l: PPC 6-10 = host 21-25
+            | (vd_hi << (31 - 29))      // VD128h: PPC 28-29 = host 2-3 (LSB at host 2)
+            | (va_lo << (31 - 15))      // VA128l: PPC 11-15 = host 16-20
+            | (va_h26 << (31 - 26))     // VA128h: PPC 26 = host 5
+            | (va_h21 << (31 - 21))     // VA128H: PPC 21 = host 10
+            | (vb_lo << (31 - 20))      // VB128l: PPC 16-20 = host 11-15
+            | (vb_hi << (31 - 31))      // VB128h: PPC 30-31 = host 0-1 (LSB at host 0)
     }
 
     #[test]
     fn vmx128_vd128_low_5_bits_only() {
-        // vd_lo = 0..31, vd_b21 = 0, vd_b22 = 0 → vd128 = vd_lo
+        // vd_lo = 0..31, vd_hi = 0 → vd128 = vd_lo
         for r in 0..32u32 {
             let raw = (r as u32) << (31 - 10);
             let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 };
@@ -1078,45 +1080,51 @@ mod tests {
     }
 
     #[test]
-    fn vmx128_vd128_bit21_adds_32() {
-        // vd_lo = 0, vd_b21 = 1, vd_b22 = 0 → vd128 = 32
-        let raw = (0u32 << (31 - 10)) | (1u32 << (31 - 21));
+    fn vmx128_vd128_high_low_bit_adds_32() {
+        // vd_lo = 0, VD128h = 0b01 (LSB only at host bit 2 = PPC 29) → vd128 = 32
+        let raw = 1u32 << (31 - 29);
         let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 };
         assert_eq!(d.vd128(), 32);
     }
 
     #[test]
-    fn vmx128_vd128_bit22_adds_64() {
-        // vd_lo = 0, vd_b21 = 0, vd_b22 = 1 → vd128 = 64
-        let raw = (0u32 << (31 - 10)) | (1u32 << (31 - 22));
+    fn vmx128_vd128_high_high_bit_adds_64() {
+        // vd_lo = 0, VD128h = 0b10 (MSB only at host bit 3 = PPC 28) → vd128 = 64
+        let raw = 1u32 << (31 - 28);
         let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 };
         assert_eq!(d.vd128(), 64);
     }
 
     #[test]
     fn vmx128_vd128_full_127() {
-        // vd_lo = 31, vd_b21 = 1, vd_b22 = 1 → vd128 = 127
+        // vd_lo = 31, VD128h = 0b11 → vd128 = 127
         let raw = (31u32 << (31 - 10))
-            | (1u32 << (31 - 21))
-            | (1u32 << (31 - 22));
+            | (1u32 << (31 - 28))
+            | (1u32 << (31 - 29));
         let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 };
         assert_eq!(d.vd128(), 127);
     }
 
     #[test]
-    fn vmx128_va128_uses_bit29() {
-        // va128 = bits 11-15 + bit 29. va_lo = 7, bit 29 = 1 → va128 = 7 | 32 = 39.
-        let raw = (7u32 << (31 - 15)) | (1u32 << (31 - 29));
+    fn vmx128_va128_canary_layout() {
+        // va_lo = 7 at PPC 11-15, VA128h = 1 at PPC 26 → va128 = 7 | 32 = 39
+        let raw = (7u32 << (31 - 15)) | (1u32 << (31 - 26));
         let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 };
         assert_eq!(d.va128(), 39);
+        // VA128H = 1 at PPC 21 → va128 += 64 = 103
+        let raw = raw | (1u32 << (31 - 21));
+        let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 };
+        assert_eq!(d.va128(), 7 | 32 | 64);
     }
 
     #[test]
-    fn vmx128_vb128_uses_bits28_and_30() {
-        // vb128 = bits 16-20 + bit 28 + bit 30. Low 5 = 5, bit 28 = 1 → +32, bit 30 = 1 → +64.
-        let raw = (5u32 << (31 - 20))
-            | (1u32 << (31 - 28))
-            | (1u32 << (31 - 30));
+    fn vmx128_vb128_uses_bits30_31() {
+        // vb_lo = 5 at PPC 16-20. VB128h = 0b01 (LSB at PPC 31 = host 0) → +32.
+        // VB128h = 0b11 → +96.
+        let raw = (5u32 << (31 - 20)) | (1u32 << (31 - 31));
+        let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 };
+        assert_eq!(d.vb128(), 5 | 32);
+        let raw = raw | (1u32 << (31 - 30));
         let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 };
         assert_eq!(d.vb128(), 5 | 32 | 64);
     }
@@ -1126,11 +1134,9 @@ mod tests {
         // vs128 must always equal vd128.
         for r in [0u32, 31, 32, 64, 96, 127] {
             let lo = r & 0x1F;
-            let b21 = (r >> 5) & 1;
-            let b22 = (r >> 6) & 1;
+            let hi = (r >> 5) & 0x3;
             let raw = (lo << (31 - 10))
-                | (b21 << (31 - 21))
-                | (b22 << (31 - 22));
+                | (hi << (31 - 29));
             let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 };
             assert_eq!(d.vd128(), r as usize, "vd128 mismatch for r={r}");
             assert_eq!(d.vs128(), r as usize, "vs128 mismatch for r={r}");
@@ -1142,7 +1148,8 @@ mod tests {
     #[allow(dead_code)]
     fn _vmx128_test_word_helper_compiles() {
         // Keep the helper validated against the real accessor.
-        let raw = vmx128_test_word(0, 5, 1, 1, 0, 0, 0);
+        // vd_lo=5, vd_hi=0b11 → vd128 = 5 | 96 = 101
+        let raw = vmx128_test_word(5, 3, 0, 0, 0, 0, 0);
         let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 };
         assert_eq!(d.vd128(), 5 | 32 | 64);
     }
diff --git a/crates/xenia-cpu/src/interpreter.rs b/crates/xenia-cpu/src/interpreter.rs
index 09800b8..19fa865 100644
--- a/crates/xenia-cpu/src/interpreter.rs
+++ b/crates/xenia-cpu/src/interpreter.rs
@@ -6484,13 +6484,25 @@ mod tests {
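-    // For vpkd3d128, PPC bits 21-22 are always 1 (key2 discriminant), so
-    // vd128 is always in range [96, 127] for vd_lo in [0, 31].
+    // For vpkd3d128, PPC bits 21-22 are always 1 (key2 discriminant). They are
+    // not part of vd128: VD128h sits at PPC 28-29, so vd spans the full 0..=127.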
 
-    fn encode_vpkd3d128(vd_lo: u32, vb_lo: u32, imm: u32, z: u32) -> u32 {
-        // op6=6 (all VMX128 compute ops); VD[4:0] at host 25-21; IMM at host 20-16;
-        // VB[4:0] at host 15-11; host bits 10,9 = 1,1 (PPC bits 21,22, key2=0b110);
-        // z (2-bit) at host 7-6 (PPC bits 24-25); host bit 4 = 1 (PPC bit 27, key2 low=0b01).
-        // decode_op6 key2 = (bits21-23<<4)|(bits26-27) = (0b110<<4)|0b01 = 0b1100001.
-        (6u32 << 26) | (vd_lo << 21) | (imm << 16) | (vb_lo << 11)
-            | (1 << 10) | (1 << 9) | (z << 6) | (1 << 4)
+    fn encode_vpkd3d128(vd: u32, vb_lo: u32, imm: u32, z: u32) -> u32 {
+        // op6=6, FormatVX128_4 layout (canary):
+        //   VD low at PPC 6-10 (host 21-25); VD high (2 bits) at PPC 28-29 (host 2-3).
+        //   IMM at PPC 11-15; VB low at PPC 16-20.
+        //   z (2-bit) at PPC 24-25 (host 6-7).
+        //   key2 = 0b1100001 over bits 21-23 + 26-27:
+        //     bits 21-23 = 0b110 → bit 21=1, bit 22=1, bit 23=0
+        //     bits 26-27 = 0b01  → bit 26=0, bit 27=1
+        let vd_lo = vd & 0x1F;
+        let vd_hi = (vd >> 5) & 0x3;
+        (6u32 << 26)
+            | (vd_lo << 21)
+            | (vd_hi << 2)
+            | (imm << 16)
+            | (vb_lo << 11)
+            | (1 << 10)        // bit 21 (key2)
+            | (1 << 9)         // bit 22 (key2)
+            | (z << 6)         // z at PPC 24-25
+            | (1 << 4)         // bit 27 (key2)
     }
 
     #[test]
@@ -6504,7 +6516,7 @@ mod tests {
         ctx.vr[1] = xenia_types::Vec128::from_f32x4(1.0, 0.0, 0.0, 0.0);
         // prev vd=96: sentinel values that should NOT appear in result
         ctx.vr[96] = xenia_types::Vec128::from_u32x4(0xAAAA_AAAA, 0xBBBB_BBBB, 0xCCCC_CCCC, 0xDDDD_DDDD);
-        write_instr(&mem, 0, encode_vpkd3d128(0, 1, 0, 0));
+        write_instr(&mem, 0, encode_vpkd3d128(96, 1, 0, 0));
         ctx.pc = 0;
         step(&mut ctx, &mem);
         let r = ctx.vr[96].as_u32x4();
@@ -6524,7 +6536,7 @@ mod tests {
         let mem = TestMem::new();
         ctx.vr[1] = xenia_types::Vec128::from_f32x4(1.0, 0.0, 0.0, 0.0); // out[3]=0x00FF_0000
         ctx.vr[96] = xenia_types::Vec128::from_u32x4(0x1111_1111, 0x2222_2222, 0x3333_3333, 0x4444_4444);
-        write_instr(&mem, 0, encode_vpkd3d128(0, 1, 1, 0));
+        write_instr(&mem, 0, encode_vpkd3d128(96, 1, 1, 0));
         ctx.pc = 0;
         step(&mut ctx, &mem);
         let r = ctx.vr[96].as_u32x4();
@@ -6543,7 +6555,7 @@ mod tests {
         let mem = TestMem::new();
         ctx.vr[1] = xenia_types::Vec128::from_f32x4(1.0, 0.0, 0.0, 0.0); // out[3]=0x00FF_0000
         ctx.vr[96] = xenia_types::Vec128::from_u32x4(0x1111_1111, 0x2222_2222, 0x3333_3333, 0x4444_4444);
-        write_instr(&mem, 0, encode_vpkd3d128(0, 1, 1, 3));
+        write_instr(&mem, 0, encode_vpkd3d128(96, 1, 1, 3));
         ctx.pc = 0;
         step(&mut ctx, &mem);
         let r = ctx.vr[96].as_u32x4();
diff --git a/crates/xenia-cpu/tests/disasm_goldens.rs b/crates/xenia-cpu/tests/disasm_goldens.rs
index 2c301a5..ec617a0 100644
--- a/crates/xenia-cpu/tests/disasm_goldens.rs
+++ b/crates/xenia-cpu/tests/disasm_goldens.rs
@@ -50,6 +50,23 @@ fn fixture_path(name: &str) -> PathBuf {
         .join(name)
 }
 
+/// Encode a VMX128 VX128-form (or VX128_R/_2) instruction with canary's
+/// 7-bit register layout: VD low at PPC 6-10, high 2 bits at PPC 28-29;
+/// VA low at PPC 11-15, mid bit at PPC 26, high bit at PPC 21; VB low at
+/// PPC 16-20, high 2 bits at PPC 30-31. `secondary_bits` carries any
+/// secondary opcode + VC + Rc + key bits the caller needs.
+fn encode_vx128(op6: u32, vd: u32, va: u32, vb: u32, secondary_bits: u32) -> u32 {
+    ((op6 & 0x3F) << 26)
+        | ((vd & 0x1F) << 21)
+        | (((vd >> 5) & 0x3) << 2)
+        | ((va & 0x1F) << 16)
+        | (((va >> 5) & 0x1) << 5)
+        | (((va >> 6) & 0x1) << 10)
+        | ((vb & 0x1F) << 11)
+        | (((vb >> 5) & 0x3) << 0)
+        | secondary_bits
+}
+
 fn build_rows(cases: &[(u32, u32, &str)]) -> Vec<GoldenRow> {
     cases
         .iter()
@@ -428,77 +445,57 @@ fn vmx128_registers() {
         ((4u32 << 26) | (5 << 11) | 1604, 0x82000000, "mtvscr v5"),
     ];
 
-    // VMX128 op=5 — uses vd128/va128/vb128 (7-bit registers, high bits at
-    // 21+22). These are the silent-bug-area encodings; we exercise low
-    // register indices here because the secondary-opcode key for op=5
-    // includes bits 21-22, constraining vd128 high bits to 0 in this form.
-    // High-index examples for vd128 live in the op=6 series below.
+    // VMX128 op=5: vperm128 v3, v4, v5, vc=0. Canary FormatVX128: VD low
+    // at PPC 6-10, VA low at PPC 11-15, VB low at PPC 16-20, VC at PPC 23-25.
+    // key1 = (bit22<<5)|bit27 = 0 selects vperm128.
     let vmx128_op5 = [
-        // vaddfp128 v3, v4, v5   : op=5, key2=0b000001
-        ((5u32 << 26) | (3 << 21) | (4 << 16) | (5 << 11) | (0 << 6) | (1 << 0), 0x82000000, "vaddfp128 (encoded sloppily)"),
+        (encode_vx128(5, 3, 4, 5, 0), 0x82000000, "vperm128 v3, v4, v5, 0 (canary)"),
     ];
 
-    // VMX128 op=6 — vrlimi128 has secondary key in bits 23-25 + 26-27, so
-    // bits 21-22 ARE the high bits of vd128 (canonical silent-bug-area).
-    // These instructions exercise vd128 = 32, 64, 96 — covering the bit-21
-    // and bit-22 split that ppc.rs's old extractor (now deleted) miscoded.
-    let vrlimi128 = |vd: u32, vb: u32, imm: u32, z: u32| -> u32 {
-        // op=6, vd128 = bits 6-10 + bit 21 + bit 22, vb128 = bits 16-20 + bits 30+31,
-        //  IMM = bits 11-15, Z = bits 24-25, key2 = (bits 23-25 << 4) | bits 26-27 = 0b1110001
-        let vd_lo = vd & 0x1F;
-        let vd_b21 = (vd >> 5) & 1;
-        let vd_b22 = (vd >> 6) & 1;
-        let vb_lo = vb & 0x1F;
-        let vb_b30 = (vb >> 5) & 1;
-        let vb_b31 = (vb >> 6) & 1;
-        // bits 23-25 = 111, bits 26-27 = 00, bit 27 = 1 → key2 lower 4 bits = 0001
-        // Encoded: bits 23-25 = 111, bits 26-27 = 00 are actually overlapping with z field (bits 24-25)
-        // The plan view: (bits 23 << 6) | (bits 24-25 << 4) | (bits 26-27 << 2) but the table uses different.
-        // Easiest: hand-encode known bit pattern matching decoder.rs's match:
-        //   key2 = (extract_bits(code, 23, 25) << 4) | extract_bits(code, 26, 27) = 0b1110001
-        //   bits 23-25 = 111, bits 26-27 = 01
-        // Bit positions 23-27 = 11101 (5 bits, MSB at 23).
-        //   PPC bit 23 (LSB index 8): set
-        //   PPC bit 24 (LSB index 7): set  -- this is z bit 0
-        //   PPC bit 25 (LSB index 6): set  -- this is z bit 1
-        //   PPC bit 26 (LSB index 5): unset
-        //   PPC bit 27 (LSB index 4): set
-        // We let z = bits 24-25 stored with vd128 bits at 21-22.
-        // To preserve key2 = 0b1110001, we need bits 24-25 = 11, bit 26 = 0, bit 27 = 1.
-        // BUT bits 24-25 ARE the z field; if we set them = 11 the z value is 3.
-        // So Z is constrained for vrlimi128. Choose Z = 3 (matches Sylpheed examples).
-        let z3 = z & 0x3;
-        (6u32 << 26)
-            | (vd_lo << 21)
-            | (imm << 16)
-            | (vb_lo << 11)
-            | (vd_b21 << 10)  // bit 21 (LSB pos 10)
-            | (vd_b22 << 9)   // bit 22 (LSB pos 9)
-            | (1 << 8)        // bit 23
-            | (z3 << 6)       // bits 24-25
-            | (0 << 5)        // bit 26
-            | (1 << 4)        // bit 27
-            | (vb_b30 << 1)   // bit 30
-            | vb_b31          // bit 31
+    // VMX128 op=6 — exercise full 0-127 vd128 range under canary's layout.
+    // VD128h is at PPC 28-29 (host 2-3): no overlap with secondary opcode key,
+    // so vd can be freely 0-127 for any op6 instruction.
+    let vsrw128 = |vd: u32, vb: u32| -> u32 {
+        // vsrw128 secondary: 0x000001D0 (decode_op6 key5 = 0b011101).
+        encode_vx128(6, vd, 0, vb, 0x000001D0)
+    };
+    let vpermwi128 = |vd: u32, vb: u32, perm: u32| -> u32 {
+        // vpermwi128: PERMl at PPC 11-15, PERMh at PPC 23-25, key1 sets bit 22 + bit 27.
+        // PPC 21 (host bit 10) must stay 0 for vpermwi128; every field below is
+        // masked to its own bit range, so nothing here can set it.
+        let perml = perm & 0x1F;
+        let permh = (perm >> 5) & 0x7;
+        (6u32 << 26)
+            | ((vd & 0x1F) << 21)
+            | (((vd >> 5) & 0x3) << 2)  // VD128h
+            | (perml << 16)
+            | ((vb & 0x1F) << 11)
+            | (((vb >> 5) & 0x3) << 0)  // VB128h
+            | (permh << 6)              // PERMh at PPC 23-25
+            | (1 << 9)                  // bit 22 (key1 high)
+            | (1 << 4)                  // bit 27 (key1 low)
+    };
+    let vrlimi128 = |vd: u32, vb: u32, imm: u32, z: u32| -> u32 {
+        // vrlimi128: IMM at PPC 11-15, z at PPC 24-25, key2 = 0b1110001 over
+        // bits 21-23 + 26-27 → bits 21,22,23 = 1, bit 26 = 0, bit 27 = 1.
+        (6u32 << 26)
+            | ((vd & 0x1F) << 21)
+            | (((vd >> 5) & 0x3) << 2)   // VD128h
+            | ((imm & 0x1F) << 16)
+            | ((vb & 0x1F) << 11)
+            | (((vb >> 5) & 0x3) << 0)   // VB128h
+            | ((z & 0x3) << 6)           // z at PPC 24-25 = host 6-7
+            | (1 << 8)                   // bit 23 (key2)
+            | (1 << 9)                   // bit 22 (key2)
+            | (1 << 10)                  // bit 21 (key2)
+            | (1 << 4)                   // bit 27 (key2)
     };
-    // Note: VMX128 op6 secondary keys constrain bits 21-23. For
-    // vrlimi128 (key2 = 0b1110001 over bits 21-23 + 26-27) the only
-    // valid vd128 range is 96..=127 — lower values change the secondary
-    // key into some other instruction. The cases below record what the
-    // disassembler emits for the borderline encodings, so a regression
-    // in either the lookup table or the formatter would surface here.
     let vmx128_high = [
-        // bits 21-22 = 00 → key2 ≠ vrlimi128 → decodes to vsrw128 (key5
-        // branch). Locks current behavior; shows the silent-bug-area
-        // encoding constraint.
-        (vrlimi128(0, 12, 4, 3), 0x82000000, "encoding vd_hi=00: actually vsrw128"),
-        // bits 21-22 = 10 → still not vrlimi128.
-        (vrlimi128(32, 12, 4, 3), 0x82000000, "encoding vd_hi=10: actually vsrw128 v32"),
-        // bits 21-22 = 01 → key1 matches vpermwi128.
-        (vrlimi128(64, 12, 4, 3), 0x82000000, "encoding vd_hi=01: actually vpermwi128"),
-        // bits 21-22 = 11 → key2 matches vrlimi128 with vd128=96.
-        (vrlimi128(96, 12, 4, 3), 0x82000000, "vrlimi128 v96, v12, 4, 3 (real)"),
-        (vrlimi128(127, 127, 4, 3), 0x82000000, "vrlimi128 v127, v127, 4, 3 (real)"),
+        (vsrw128(0, 12),               0x82000000, "vsrw128 v0, v0, v12 (canary, vd_hi=00)"),
+        (vsrw128(32, 12),              0x82000000, "vsrw128 v32, v0, v12 (canary, VD128h=01)"),
+        (vpermwi128(64, 12, 0xE4),     0x82000000, "vpermwi128 v64, v12, 0xE4 (canary, VD128h=10)"),
+        (vrlimi128(96, 12, 4, 3),      0x82000000, "vrlimi128 v96, v12, 4, 3 (canary, VD128h=11)"),
+        (vrlimi128(127, 95, 4, 3),     0x82000000, "vrlimi128 v127, v95, 4, 3 (canary)"),
     ];
 
     // Fix 4: VMX128 multiply-add 4-operand layouts. Per canary, the addend
@@ -514,12 +511,11 @@ fn vmx128_registers() {
     //     vmaddcfp128  VD, VA, VD, VB  → "v3, v35, v3, v5"
     //     vnmsubfp128  VD, VA, VD, VB  → "v3, v35, v3, v5"
     let vmx128_4op = [
-        // vmaddfp128: vd=3(bits 6-10), va=35(bits 11-15=3 + bit29=1), vb=5(bits 16-20), key2=0b001101
-        (0x146328D4u32, 0x82000000, "vmaddfp128 v3, v35, v5, v3"),
-        // vmaddcfp128: same vd/va/vb layout, key2=0b010001
-        (0x14632914u32, 0x82000000, "vmaddcfp128 v3, v35, v3, v5"),
-        // vnmsubfp128: same vd/va/vb layout, key2=0b010101
-        (0x14632954u32, 0x82000000, "vnmsubfp128 v3, v35, v3, v5"),
+        // Canary FormatVX128 layout: vd=3 (PPC 6-10), va=35 (VA128l=3 at PPC 11-15, VA128h=1 at PPC 26),
+        // vb=5 (PPC 16-20), key2 at PPC 22-25 + bit 27.
+        (0x146328F0u32, 0x82000000, "vmaddfp128 v3, v35, v5, v3"),
+        (0x14632930u32, 0x82000000, "vmaddcfp128 v3, v35, v3, v5"),
+        (0x14632970u32, 0x82000000, "vnmsubfp128 v3, v35, v3, v5"),
     ];
 
     let mut all = Vec::new();
diff --git a/crates/xenia-cpu/tests/golden/vmx128_registers.json b/crates/xenia-cpu/tests/golden/vmx128_registers.json
index 6d4f8b0..12cf44d 100644
--- a/crates/xenia-cpu/tests/golden/vmx128_registers.json
+++ b/crates/xenia-cpu/tests/golden/vmx128_registers.json
@@ -71,64 +71,64 @@
       "operands": "v5"
     },
     {
-      "label": "vaddfp128 (encoded sloppily)",
-      "raw": "0x14642801",
+      "label": "vperm128 v3, v4, v5, 0 (canary)",
+      "raw": "0x14642800",
       "addr": "0x82000000",
       "mnemonic": "vperm128",
       "operands": "v3, v4, v5, 0"
     },
     {
-      "label": "encoding vd_hi=00: actually vsrw128",
-      "raw": "0x180461D0",
+      "label": "vsrw128 v0, v0, v12 (canary, vd_hi=00)",
+      "raw": "0x180061D0",
       "addr": "0x82000000",
       "mnemonic": "vsrw128",
-      "operands": "v0, v4, v12"
+      "operands": "v0, v0, v12"
     },
     {
-      "label": "encoding vd_hi=10: actually vsrw128 v32",
-      "raw": "0x180465D0",
+      "label": "vsrw128 v32, v0, v12 (canary, VD128h=01)",
+      "raw": "0x180061D4",
       "addr": "0x82000000",
       "mnemonic": "vsrw128",
-      "operands": "v32, v4, v12"
+      "operands": "v32, v0, v12"
     },
     {
-      "label": "encoding vd_hi=01: actually vpermwi128",
-      "raw": "0x180463D0",
+      "label": "vpermwi128 v64, v12, 0xE4 (canary, VD128h=10)",
+      "raw": "0x180463D8",
       "addr": "0x82000000",
       "mnemonic": "vpermwi128",
       "operands": "v64, v12, 0xE4"
     },
     {
-      "label": "vrlimi128 v96, v12, 4, 3 (real)",
-      "raw": "0x180467D0",
+      "label": "vrlimi128 v96, v12, 4, 3 (canary, VD128h=11)",
+      "raw": "0x180467DC",
       "addr": "0x82000000",
       "mnemonic": "vrlimi128",
       "operands": "v96, v12, 4, 3"
     },
     {
-      "label": "vrlimi128 v127, v127, 4, 3 (real)",
-      "raw": "0x1BE4FFD3",
+      "label": "vrlimi128 v127, v95, 4, 3 (canary)",
+      "raw": "0x1BE4FFDE",
       "addr": "0x82000000",
       "mnemonic": "vrlimi128",
       "operands": "v127, v95, 4, 3"
     },
     {
       "label": "vmaddfp128 v3, v35, v5, v3",
-      "raw": "0x146328D4",
+      "raw": "0x146328F0",
       "addr": "0x82000000",
       "mnemonic": "vmaddfp128",
       "operands": "v3, v35, v5, v3"
     },
     {
       "label": "vmaddcfp128 v3, v35, v3, v5",
-      "raw": "0x14632914",
+      "raw": "0x14632930",
       "addr": "0x82000000",
       "mnemonic": "vmaddcfp128",
       "operands": "v3, v35, v3, v5"
     },
     {
       "label": "vnmsubfp128 v3, v35, v3, v5",
-      "raw": "0x14632954",
+      "raw": "0x14632970",
       "addr": "0x82000000",
       "mnemonic": "vnmsubfp128",
       "operands": "v3, v35, v3, v5"