Merge branch 'ppc-audit-fix/p3-isolated-high' — Phase 3 isolated HIGH bugs

Phase 3 of the PPC instruction audit fix application. Six commits land six independent (or coupled) PPCBUG fixes:

- cedee3c: PPCBUG-510 stvewx128 16-byte corruption → 4-byte word write
- 52ece4b: PPCBUG-424+425 vmaddfp128/vmaddcfp128 operand swap (VA*VD+VB)
- 3d8e2ce: PPCBUG-053+054 32-bit CTR semantics in bcx/bclrx + mtspr CTR
- d4f6ea7: PPCBUG-640+650 fmt_bc spurious bdnzge/bdzge condition suffix
- 2be25bd: PPCBUG-641+649 sync vs lwsync L-field disambiguation
- 7609dcd: PPCBUG-700 VMX128 register accessors → canary bitfield layout

PPCBUG-700 was a discovery during phase end-to-end review: an independent reviewer cross-checked our va128/vb128/vd128/vx128r_rc_bit accessors against canary's `FormatVX128*` bitfield struct (xenia-canary `ppc_decode_data.h:484-663`) and found the bit positions were wrong on all four. The audit's line-2958 "confirmed-clean" assessment was based on a miscount of LSB-first packed C++ bitfields. Real Xbox 360 game code follows canary's convention, so any production VMX128 instruction with register VR >= 32 was silently mis-decoded — though no unit test exercised that path until 52ece4b's operand-swap fix exposed the inconsistency. Subsumes PPCBUG-422's prescribed Rc-bit position.

Verification at merge: `cargo test --workspace --release` clean across all crates; targeted vmx128/decoder/disasm-golden tests green. Acid test (`-n 4B --parallel`) deferred to end-of-all-phases per user direction.
2026-05-02 11:22:54 +02:00
parent a8c918cf9e 7609dcd406
commit f3ebaba5c9
7 changed files with 389 additions and 194 deletions
--- a/audit-findings.md
+++ b/audit-findings.md
@@ -3414,3 +3414,38 @@ has the wrong extraction. The disassembler was written independently and got the
  degenerate cases.

 IDs PPCBUG-655 through PPCBUG-679 are unallocated — no further bugs found in Phase C3.
+
+---
+
+## Phase C4 — Post-merge audit corrections (2026-05-02)
+
+### PPCBUG-700 — VMX128 register accessors disagreed with canary's bitfield layout (HIGH)
+
+- **Severity**: HIGH (silent mis-decoding of any VMX128 instruction with a register >= 32)
+- **Status**: applied
+- **Locations**: `decoder.rs:138-160` (`va128`/`vb128`/`vd128`), `decoder.rs:80` (`vx128r_rc_bit`)
+- **Discovery**: independent reviewer of the P3 phase merge, comparing our rust accessors
+  against canary's `FormatVX128`/`VX128_2`/`VX128_4`/`VX128_5`/`VX128_R` bitfield struct
+  in `xenia-canary/src/xenia/cpu/ppc/ppc_decode_data.h:484-663`.
+- **Symptom**: this entry contradicts the audit's own line 2958 ("confirmed-clean")
+  assessment. The previous audit miscounted bit-field offsets — under x86_64 LSB-first
+  C++ bitfield packing, the canary fields land at:
+    - `VA128 = VA128l(5) | VA128h(1)<<5 | VA128H(1)<<6` = PPC[11-15] | PPC[26]<<5 | PPC[21]<<6 (3 fields, 7 bits)
+    - `VB128 = VB128l(5) | VB128h(2)<<5` = PPC[16-20] | PPC[30-31]<<5 (2 fields, 7 bits)
+    - `VD128 = VD128l(5) | VD128h(2)<<5` = PPC[6-10] | PPC[28-29]<<5 (2 fields, 7 bits)
+    - `Rc` (VX128_R only) = PPC[25] (host bit 6) — not PPC[27] as PPCBUG-422/562 prescribed.
+  Rust code instead used va128: PPC[6-10] | PPC[29]<<5 (low field aliased VD's slot; one high
+  bit, wrong position); vb128: PPC[16-20] | PPC[28]<<5 | PPC[30]<<6 (wrong positions); vd128:
+  PPC[6-10] | PPC[21]<<5 | PPC[22]<<6 (wrong positions); vx128r_rc_bit at PPC[27].
+- **Why it lurked**: the buggy convention was internally consistent with hand-crafted
+  test fixtures (which set bit 29 / 21 / 22 to encode "high" registers, matching the
+  buggy accessor). Real Xbox 360 game code follows canary's convention, so any production
+  encoding with VR >= 32 was silently mis-decoded — but no unit test exercised that path.
+- **Fix**: rewrite the four accessors to canary's bit positions; rewrite the
+  `vmx128_test_word` helper and unit tests; re-encode the goldens for vmaddfp128/
+  vmaddcfp128/vnmsubfp128/vperm128/vsrw128/vpermwi128/vrlimi128. Drop the speculative
+  `key4_dt` dot-form dispatch in `decode_op6` (canary has no separate dot-form opcodes
+  for VX128_R compute ops; Rc is a runtime modifier). Update `encode_vpkd3d128` test
+  helper for canary's VD128h placement.
+- **Cross-reference**: invalidates the audit's confirmed-clean note at line 2958.
+  Subsumes the partial fix-shape proposed in PPCBUG-422 (Rc-bit position).
--- a/crates/xenia-cpu/src/decoder.rs
+++ b/crates/xenia-cpu/src/decoder.rs
@@ -77,7 +77,9 @@ impl DecodedInstr {
    /// Rc for VC-form vector compare instructions — PPC bit 21 = host bit 10.
    #[inline] pub fn vc_rc_bit(&self) -> bool { (self.raw >> 10) & 1 != 0 }
    /// Rc for VX128_R-form vector compare instructions — PPC bit 27 = host bit 4.
-    #[inline] pub fn vx128r_rc_bit(&self) -> bool { (self.raw >> 4) & 1 != 0 }
+    /// VX128_R Rc bit — PPC bit 25 (host bit 6) per canary's FormatVX128_R
+    /// bitfield layout. PPCBUG-700.
+    #[inline] pub fn vx128r_rc_bit(&self) -> bool { (self.raw >> 6) & 1 != 0 }

    /// IMM field for VX128_4-form instructions (vrlimi128) — 5-bit blend mask at PPC bits 11-15.
    #[inline] pub fn vx128_4_imm(&self) -> u32 { extract_bits(self.raw, 11, 15) }
@@ -133,25 +135,30 @@ impl DecodedInstr {
    /// crbB (bits 16-20)
    #[inline] pub fn crbb(&self) -> u32 { extract_bits(self.raw, 16, 20) }

-    // VMX128 field extractors
+    // VMX128 field extractors — bit positions match canary's
+    // FormatVX128/VX128_2/VX128_4/VX128_5/VX128_R bitfield layout
+    // (xenia-canary `ppc_decode_data.h:484-663`, LSB-first packed). PPCBUG-700.

-    /// VA128 (bits 6-10, plus bit from 29)
+    /// VA128 = VA128l(5) | VA128h(1) << 5 | VA128H(1) << 6.
+    /// Canonical 7-bit register selector: PPC 11-15 (low), PPC 26 (mid), PPC 21 (high).
    #[inline] pub fn va128(&self) -> usize {
-        (extract_bits(self.raw, 6, 10) | (extract_bits(self.raw, 29, 29) << 5)) as usize
+        (extract_bits(self.raw, 11, 15)
+            | (extract_bits(self.raw, 26, 26) << 5)
+            | (extract_bits(self.raw, 21, 21) << 6)) as usize
    }

-    /// VB128 (bits 16-20, plus bits from 28, 30)
+    /// VB128 = VB128l(5) | VB128h(2) << 5. Canary's VB128h is a 2-bit
+    /// contiguous field at PPC 30-31 (host bits 0-1).
    #[inline] pub fn vb128(&self) -> usize {
        (extract_bits(self.raw, 16, 20)
-            | (extract_bits(self.raw, 28, 28) << 5)
-            | (extract_bits(self.raw, 30, 30) << 6)) as usize
+            | (extract_bits(self.raw, 30, 31) << 5)) as usize
    }

-    /// VD128 (bits 6-10, plus bits from 21, 22)
+    /// VD128 = VD128l(5) | VD128h(2) << 5. Canary's VD128h is a 2-bit
+    /// contiguous field at PPC 28-29 (host bits 2-3).
    #[inline] pub fn vd128(&self) -> usize {
        (extract_bits(self.raw, 6, 10)
-            | (extract_bits(self.raw, 21, 21) << 5)
-            | (extract_bits(self.raw, 22, 22) << 6)) as usize
+            | (extract_bits(self.raw, 28, 29) << 5)) as usize
    }

    /// VS128 - same encoding as VD128
@@ -664,11 +671,11 @@ fn decode_op6(code: u32) -> PpcOpcode {
        _ => {}
    }

-    // VMX128 compare (non-dot and dot forms).
-    // Non-dot: bit 27 = 0.  Dot: bit 27 = 1, but bit 25 must also be 0 to
-    // distinguish from the shift/merge group (which has bit 25 = 1 when bit 27 = 1).
-    // key4_nd uses bits 22-24 + bit 27 (same as original, covers non-dot).
-    // key4_dt uses bits 22-24 + bit 25 + bit 27 (narrower, covers dot-only).
+    // VMX128 compare (VX128_R form). Single dispatch path: bit 27 = 0 always
+    // for these opcodes per canary's table (`ppc_opcode_table_gen.cc:295-305`).
+    // The Rc bit is at PPC 25 (host bit 6) per the FormatVX128_R bitfield —
+    // it's a runtime modifier read by the interpreter, NOT part of the
+    // secondary-opcode discrimination. PPCBUG-700.
    let key4_nd = (extract_bits(code, 22, 24) << 3) | extract_bits(code, 27, 27);
    match key4_nd {
        0b000000 => return PpcOpcode::vcmpeqfp128,
@@ -678,16 +685,6 @@ fn decode_op6(code: u32) -> PpcOpcode {
        0b100000 => return PpcOpcode::vcmpequw128,
        _ => {}
    }
-    // Dot forms: bit 27 = 1, bit 25 = 0  (key = bits22-24 + bit25 + bit27, low 3 bits)
-    let key4_dt = (extract_bits(code, 22, 24) << 2) | (extract_bits(code, 25, 25) << 1) | extract_bits(code, 27, 27);
-    match key4_dt {
-        0b00001 => return PpcOpcode::vcmpeqfp128,   // bits22-24=000, bit25=0, bit27=1
-        0b00101 => return PpcOpcode::vcmpgefp128,   // bits22-24=001, bit25=0, bit27=1
-        0b01001 => return PpcOpcode::vcmpgtfp128,   // bits22-24=010, bit25=0, bit27=1
-        0b01101 => return PpcOpcode::vcmpbfp128,    // bits22-24=011, bit25=0, bit27=1
-        0b10001 => return PpcOpcode::vcmpequw128,   // bits22-24=100, bit25=0, bit27=1
-        _ => {}
-    }

    // VMX128 shift/merge
    let key5 = (extract_bits(code, 22, 25) << 2) | extract_bits(code, 27, 27);
@@ -1055,21 +1052,26 @@ mod tests {
    // used those extractors). Each test poke-bits exactly the slots the
    // accessor reads and asserts the assembled register number.

-    fn vmx128_test_word(va21: u32, vd6_10: u32, vd21: u32, vd22: u32,
-                       vb16_20: u32, vb28: u32, vb30: u32) -> u32 {
-        // PPC bit i -> LSB position 31-i.
-        (vd6_10 << (31 - 10))
-            | (va21  << (31 - 21))   // va128 high bit at PPC 29 in some forms — kept 0 here
-            | (vd21  << (31 - 21))
-            | (vd22  << (31 - 22))
-            | (vb16_20 << (31 - 20))
-            | (vb28  << (31 - 28))
-            | (vb30  << (31 - 30))
+    /// Build a VMX128 test word for the canary-compliant register layout.
+    /// `vd128 = vd_lo | (vd_hi << 5)` where vd_lo is 5 bits (PPC 6-10) and
+    /// vd_hi is 2 bits (PPC 28-29). Same shape for vb128 (vb_lo at PPC 16-20,
+    /// vb_hi 2 bits at PPC 30-31). va128 = va_lo | (va_h26<<5) | (va_h21<<6)
+    /// per canary's 7-bit VA selector.
+    fn vmx128_test_word(vd_lo: u32, vd_hi: u32, va_lo: u32, va_h26: u32, va_h21: u32,
+                       vb_lo: u32, vb_hi: u32) -> u32 {
+        // PPC bit i -> host bit (31-i).
+        (vd_lo << (31 - 10))           // VD128l: PPC 6-10 = host 21-25
+            | (vd_hi << (31 - 29))      // VD128h: PPC 28-29 = host 2-3 (LSB at host 2)
+            | (va_lo << (31 - 15))      // VA128l: PPC 11-15 = host 16-20
+            | (va_h26 << (31 - 26))     // VA128h: PPC 26 = host 5
+            | (va_h21 << (31 - 21))     // VA128H: PPC 21 = host 10
+            | (vb_lo << (31 - 20))      // VB128l: PPC 16-20 = host 11-15
+            | (vb_hi << (31 - 31))      // VB128h: PPC 30-31 = host 0-1 (LSB at host 0)
    }

    #[test]
    fn vmx128_vd128_low_5_bits_only() {
-        // vd_lo = 0..31, vd_b21 = 0, vd_b22 = 0 → vd128 = vd_lo
+        // vd_lo = 0..31, vd_hi = 0 → vd128 = vd_lo
        for r in 0..32u32 {
            let raw = (r as u32) << (31 - 10);
            let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 };
@@ -1078,45 +1080,51 @@ mod tests {
    }

    #[test]
-    fn vmx128_vd128_bit21_adds_32() {
-        // vd_lo = 0, vd_b21 = 1, vd_b22 = 0 → vd128 = 32
-        let raw = (0u32 << (31 - 10)) | (1u32 << (31 - 21));
+    fn vmx128_vd128_high_low_bit_adds_32() {
+        // vd_lo = 0, VD128h = 0b01 (LSB only at host bit 2 = PPC 29) → vd128 = 32
+        let raw = (1u32 << (31 - 29));
        let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 };
        assert_eq!(d.vd128(), 32);
    }

    #[test]
-    fn vmx128_vd128_bit22_adds_64() {
-        // vd_lo = 0, vd_b21 = 0, vd_b22 = 1 → vd128 = 64
-        let raw = (0u32 << (31 - 10)) | (1u32 << (31 - 22));
+    fn vmx128_vd128_high_high_bit_adds_64() {
+        // vd_lo = 0, VD128h = 0b10 (MSB only at host bit 3 = PPC 28) → vd128 = 64
+        let raw = (1u32 << (31 - 28));
        let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 };
        assert_eq!(d.vd128(), 64);
    }

    #[test]
    fn vmx128_vd128_full_127() {
-        // vd_lo = 31, vd_b21 = 1, vd_b22 = 1 → vd128 = 127
+        // vd_lo = 31, VD128h = 0b11 → vd128 = 127
        let raw = (31u32 << (31 - 10))
-            | (1u32 << (31 - 21))
-            | (1u32 << (31 - 22));
+            | (1u32 << (31 - 28))
+            | (1u32 << (31 - 29));
        let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 };
        assert_eq!(d.vd128(), 127);
    }

    #[test]
-    fn vmx128_va128_uses_bit29() {
-        // va128 = bits 6-10 + bit 29. va_lo = 7, bit 29 = 1 → va128 = 7 | 32 = 39.
-        let raw = (7u32 << (31 - 10)) | (1u32 << (31 - 29));
+    fn vmx128_va128_canary_layout() {
+        // va_lo = 7 at PPC 11-15, VA128h = 1 at PPC 26 → va128 = 7 | 32 = 39
+        let raw = (7u32 << (31 - 15)) | (1u32 << (31 - 26));
        let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 };
        assert_eq!(d.va128(), 39);
+        // VA128H = 1 at PPC 21 → va128 += 64 = 103
+        let raw = raw | (1u32 << (31 - 21));
+        let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 };
+        assert_eq!(d.va128(), 7 | 32 | 64);
    }

    #[test]
-    fn vmx128_vb128_uses_bits28_and_30() {
-        // vb128 = bits 16-20 + bit 28 + bit 30. Low 5 = 5, bit 28 = 1 → +32, bit 30 = 1 → +64.
-        let raw = (5u32 << (31 - 20))
-            | (1u32 << (31 - 28))
-            | (1u32 << (31 - 30));
+    fn vmx128_vb128_uses_bits30_31() {
+        // vb_lo = 5 at PPC 16-20. VB128h = 0b01 (LSB at PPC 31 = host 0) → +32.
+        // VB128h = 0b11 → +96.
+        let raw = (5u32 << (31 - 20)) | (1u32 << (31 - 31));
+        let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 };
+        assert_eq!(d.vb128(), 5 | 32);
+        let raw = raw | (1u32 << (31 - 30));
        let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 };
        assert_eq!(d.vb128(), 5 | 32 | 64);
    }
@@ -1126,11 +1134,9 @@ mod tests {
        // vs128 must always equal vd128.
        for r in [0u32, 31, 32, 64, 96, 127] {
            let lo = r & 0x1F;
-            let b21 = (r >> 5) & 1;
-            let b22 = (r >> 6) & 1;
+            let hi = (r >> 5) & 0x3;
            let raw = (lo << (31 - 10))
-                | (b21 << (31 - 21))
-                | (b22 << (31 - 22));
+                | (hi << (31 - 29));
            let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 };
            assert_eq!(d.vd128(), r as usize, "vd128 mismatch for r={r}");
            assert_eq!(d.vs128(), r as usize, "vs128 mismatch for r={r}");
@@ -1142,7 +1148,8 @@ mod tests {
    #[allow(dead_code)]
    fn _vmx128_test_word_helper_compiles() {
        // Keep the helper validated against the real accessor.
-        let raw = vmx128_test_word(0, 5, 1, 1, 0, 0, 0);
+        // vd_lo=5, vd_hi=0b11 → vd128 = 5 | 96 = 101
+        let raw = vmx128_test_word(5, 3, 0, 0, 0, 0, 0);
        let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 };
        assert_eq!(d.vd128(), 5 | 32 | 64);
    }
--- a/crates/xenia-cpu/src/disasm.rs
+++ b/crates/xenia-cpu/src/disasm.rs
@@ -361,7 +361,15 @@ pub fn format(instr: &DecodedInstr) -> DisasmText {
        PpcOpcode::dcbz  => fmt_cache(instr, "dcbz"),
        PpcOpcode::dcbz128 => fmt_cache(instr, "dcbz128"),
        PpcOpcode::icbi  => fmt_cache(instr, "icbi"),
-        PpcOpcode::sync  => base("sync", String::new(), 0),
+        PpcOpcode::sync  => {
+            // L-field at PPC bit 10 (host bit 21) selects lwsync (L=1), the
+            // acquire barrier in every Xbox 360 spinlock. PPCBUG-641.
+            if (instr.raw >> 21) & 1 == 1 {
+                with_ext("sync", String::new(), 0, "lwsync", String::new(), 0)
+            } else {
+                base("sync", String::new(), 0)
+            }
+        }
        PpcOpcode::eieio => base("eieio", String::new(), 0),
        PpcOpcode::isync => base("isync", String::new(), 0),

@@ -828,7 +836,10 @@ fn fmt_bc(instr: &DecodedInstr) -> DisasmText {

        if decr {
            let z = if bo & 0x02 != 0 { "z" } else { "nz" };
-            let cond_str = cond_name_opt.unwrap_or("");
+            // BO bit 4 (uncond) means CR is ignored — pure CTR-decrement branch.
+            // Without this guard, bdnz/bdz would emit a spurious `ge` suffix derived
+            // from the don't-care BI=0 / cond_true=false pair (PPCBUG-640).
+            let cond_str = if uncond { "" } else { cond_name_opt.unwrap_or("") };
            let ext_mnem = format!("bd{z}{cond_str}{a}{l}");
            let ext_ops = format!("{cr}0x{target:08X}");
            with_ext(&base_mnem, base_ops, 8, &ext_mnem, ext_ops, 8)
--- a/crates/xenia-cpu/src/interpreter.rs
+++ b/crates/xenia-cpu/src/interpreter.rs
@@ -846,7 +846,7 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
            }

            let ctr_ok = (bo & 0b00100) != 0
-                || ((ctx.ctr != 0) ^ ((bo & 0b00010) != 0));
+                || (((ctx.ctr as u32) != 0) ^ ((bo & 0b00010) != 0));
            let cond_ok = (bo & 0b10000) != 0
                || (ctx.get_cr_bit(bi) == ((bo & 0b01000) != 0));

@@ -876,7 +876,7 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
            }

            let ctr_ok = (bo & 0b00100) != 0
-                || ((ctx.ctr != 0) ^ ((bo & 0b00010) != 0));
+                || (((ctx.ctr as u32) != 0) ^ ((bo & 0b00010) != 0));
            let cond_ok = (bo & 0b10000) != 0
                || (ctx.get_cr_bit(bi) == ((bo & 0b01000) != 0));

@@ -1520,7 +1520,7 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
            match spr {
                crate::context::spr::XER => ctx.set_xer(val as u32),
                crate::context::spr::LR => ctx.lr = val,
-                crate::context::spr::CTR => ctx.ctr = val,
+                crate::context::spr::CTR => ctx.ctr = val as u32 as u64,
                crate::context::spr::DEC => ctx.dec = val as u32,
                crate::context::spr::TBL_WRITE => {
                    ctx.timebase = (ctx.timebase & 0xFFFF_FFFF_0000_0000) | (val & 0xFFFF_FFFF);
@@ -1919,11 +1919,10 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
            ctx.pc += 4;
        }
        PpcOpcode::vmaddfp128 => {
-            // VMX128 form: vD <- (vA * vB) + vD (vD reused as accumulator;
-            // Canary `InstrEmit_vmaddfp128` routes guest VA/VB/VD through
-            // `InstrEmit_vmaddfp_` with arg order swapped so the resulting
-            // HIR computation is `VA * VB + VD`). Same unconditional denorm
-            // flush of all three inputs as scalar `vmaddfp`.
+            // ISA: (VD) <- (VA × VD) + VB. VD is both the second multiplicand and destination.
+            // Canary InstrEmit_vmaddfp128 (ppc_emit_altivec.cc:806-809): MulAdd(VA, VD, VB).
+            // Previous code computed ai.mul_add(bi, di) = VA×VB+VD — VB and VD roles swapped
+            // (PPCBUG-424). Fix: ai.mul_add(di, bi) = VA×VD+VB.
            let a = ctx.vr[instr.va128()].as_f32x4();
            let b = ctx.vr[instr.vb128()].as_f32x4();
            let d = ctx.vr[instr.vd128()].as_f32x4();
@@ -1932,7 +1931,7 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
                let ai = vmx::flush_denorm(a[i]);
                let bi = vmx::flush_denorm(b[i]);
                let di = vmx::flush_denorm(d[i]);
-                r[i] = ai.mul_add(bi, di);
+                r[i] = ai.mul_add(di, bi);
            }
            ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r);
            ctx.pc += 4;
@@ -2982,16 +2981,21 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
            ctx.pc += 4;
        }
        PpcOpcode::stvewx128 => {
-            let ea = ea_indexed(ctx, instr) & !0xF;
-            // TODO PPCBUG-510: stvewx128 currently writes 16 bytes at ea & !0xF; the EA scope is
-            // wrong (should be word-aligned, 4 bytes only). When P3 fixes EA, this invalidate's
-            // range narrows automatically.
-            // PPCBUG-512: stvewx128 was missing invalidate_for_write.
+            // Mirror of stvewx: word-align EA, extract one 32-bit lane, write 4 bytes only.
+            // Previous code used & !0xF (16-byte) and wrote all 16 bytes, corrupting 12
+            // adjacent bytes on every execution (PPCBUG-510).
+            let ea_unaligned = ea_indexed(ctx, instr);
+            let ea = ea_unaligned & !0x3u32;
            if let Some(t) = ctx.reservation_table.as_ref().filter(|t| t.is_enabled()) {
                if t.has_active_reservers() { t.invalidate_for_write(ea); }
            }
+            let slot = ((ea_unaligned & 0xF) >> 2) as usize;
            let bytes = ctx.vr[instr.vs128()].as_bytes();
-            for i in 0..16 { mem.write_u8(ea + i as u32, bytes[i]); }
+            let w = ((bytes[slot * 4] as u32) << 24)
+                  | ((bytes[slot * 4 + 1] as u32) << 16)
+                  | ((bytes[slot * 4 + 2] as u32) << 8)
+                  | (bytes[slot * 4 + 3] as u32);
+            mem.write_u32(ea, w);
            ctx.pc += 4;
        }

@@ -4292,11 +4296,11 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
        // ═════════════════════════════════════════════════════════════════
        // §4j — VMX128 FMA / permute
        // ═════════════════════════════════════════════════════════════════
-        // vmaddcfp128: vD = vD * vB + vA (using vD's current value as accumulator)
+        // vmaddcfp128: ISA (VD) <- (VA × VD) + VB — same operation as vmaddfp128
        PpcOpcode::vmaddcfp128 => {
-            // Xbox-360-specific: vD = (vD * vB) + vA. Note the VD-reuse: VD is both
-            // a source operand (as multiplicand) and the destination. Canary &
-            // POWER8 hardware confirm denormal inputs are flushed regardless of NJ.
+            // ISA: (VD) <- (VA × VD) + VB. Canary InstrEmit_vmaddcfp128 (cc:819): MulAdd(VA, VD, VB).
+            // Previous code computed di.mul_add(bi, ai) = VD×VB+VA — both operands wrong
+            // (PPCBUG-425). Fix: ai.mul_add(di, bi) = VA×VD+VB.
            let a = ctx.vr[instr.va128()].as_f32x4();
            let b = ctx.vr[instr.vb128()].as_f32x4();
            let d = ctx.vr[instr.vd128()].as_f32x4();
@@ -4305,7 +4309,7 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
                let ai = vmx::flush_denorm(a[i]);
                let bi = vmx::flush_denorm(b[i]);
                let di = vmx::flush_denorm(d[i]);
-                r[i] = di.mul_add(bi, ai);
+                r[i] = ai.mul_add(di, bi);
            }
            ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r);
            ctx.pc += 4;
@@ -5319,32 +5323,64 @@ mod tests {
    }

    /// VMX128 variant `vmaddfp128 vD, vA, vB` (primary op 5, key2 = 0b001101)
-    /// reuses vD as the accumulator: `vD <- (vA * vB) + vD`. Canary
+    /// reuses vD as the accumulator: `vD <- (vA × vD) + vB`. Canary
    /// `ppc_emit_altivec.cc:786-810` flushes *all three* inputs
-    /// unconditionally before the fused multiply-add — the 128-bit form
-    /// must match the scalar `vmaddfp` behaviour. Prior to this fix the
-    /// interpreter skipped the flush, leaving subnormal noise in math-
-    /// heavy game code.
+    /// unconditionally before the fused multiply-add.
    #[test]
    fn vmaddfp128_flushes_denormal_inputs() {
        let mut ctx = PpcContext::new();
        let mut mem = TestMem::new();
        let denorm = f32::from_bits(1);
-        // vA=v2 carries denorms, which is also vD's accumulator input.
+        // VA=v1, VD=v2, VB=v3 — all carry denormals.
+        ctx.vr[1] = xenia_types::Vec128::from_f32x4_array([denorm; 4]);
        ctx.vr[2] = xenia_types::Vec128::from_f32x4_array([denorm; 4]);
-        // vB=v3 = 1.0 — denormal input survives only if not flushed.
-        ctx.vr[3] = xenia_types::Vec128::from_f32x4_array([1.0f32; 4]);
-        // vmaddfp128 vD=v2, vA=v2, vB=v3: low 5 bits 00010 shared
-        // between vA and vD, vB=3 at PPC bits 16-20, key2=0b001101.
-        let raw: u32 = 0x1440_18D0;
+        ctx.vr[3] = xenia_types::Vec128::from_f32x4_array([denorm; 4]);
+        // vmaddfp128 vD=v2, vA=v1, vB=v3: op6=5, vd_lo=2, va_lo=1, vb_lo=3, key2=0b001101.
+        // VA×VD+VB: all three flushed → 0*0+0 = 0.
+        let raw: u32 = (5u32 << 26) | (2 << 21) | (1 << 16) | (3 << 11) | (3 << 6) | (1 << 4);
        write_instr(&mut mem, 0, raw);
        ctx.pc = 0;
        step(&mut ctx, &mut mem);
-        // Without flush: denorm*1.0 + denorm = 2*denorm ≠ 0.
-        // With flush:    0*0 + 0 = 0.
        assert_eq!(ctx.vr[2].as_f32x4(), [0.0f32; 4]);
    }

+    // ---- PPCBUG-424+425: vmaddfp128/vmaddcfp128 operand swap ----
+    // ISA for both: (VD) <- (VA × VD) + VB. Previous code computed VA×VB+VD and VD×VB+VA.
+    // Test uses distinct VA, VB, VD registers so the swap is visible.
+    // Encoding: op6=5, key2=0b001101 (vmaddfp128) / 0b010001 (vmaddcfp128).
+    // VA=v1=[2.0], VB=v2=[10.0], VD=v3=[3.0] → expected 2.0×3.0+10.0 = 16.0.
+    // Buggy vmaddfp128: 2.0×10.0+3.0 = 23.0. Buggy vmaddcfp128: 3.0×10.0+2.0 = 32.0.
+
+    #[test]
+    fn vmaddfp128_operand_order_va_times_vd_plus_vb() {
+        let mut ctx = PpcContext::new();
+        let mut mem = TestMem::new();
+        ctx.vr[1] = xenia_types::Vec128::from_f32x4_array([2.0f32; 4]);  // VA=v1
+        ctx.vr[2] = xenia_types::Vec128::from_f32x4_array([10.0f32; 4]); // VB=v2
+        ctx.vr[3] = xenia_types::Vec128::from_f32x4_array([3.0f32; 4]);  // VD=v3 (also destination)
+        // vmaddfp128 vD=v3, vA=v1, vB=v2 — op5, key2=0b001101 (bits22-25=3, bit27=1)
+        let raw: u32 = (5u32 << 26) | (3 << 21) | (1 << 16) | (2 << 11) | (3 << 6) | (1 << 4);
+        write_instr(&mut mem, 0, raw);
+        ctx.pc = 0;
+        step(&mut ctx, &mut mem);
+        assert_eq!(ctx.vr[3].as_f32x4(), [16.0f32; 4], "VA×VD+VB = 2*3+10 = 16");
+    }
+
+    #[test]
+    fn vmaddcfp128_operand_order_va_times_vd_plus_vb() {
+        let mut ctx = PpcContext::new();
+        let mut mem = TestMem::new();
+        ctx.vr[1] = xenia_types::Vec128::from_f32x4_array([2.0f32; 4]);  // VA=v1
+        ctx.vr[2] = xenia_types::Vec128::from_f32x4_array([10.0f32; 4]); // VB=v2
+        ctx.vr[3] = xenia_types::Vec128::from_f32x4_array([3.0f32; 4]);  // VD=v3
+        // vmaddcfp128 vD=v3, vA=v1, vB=v2 — op5, key2=0b010001 (bits22-25=4, bit27=1)
+        let raw: u32 = (5u32 << 26) | (3 << 21) | (1 << 16) | (2 << 11) | (4 << 6) | (1 << 4);
+        write_instr(&mut mem, 0, raw);
+        ctx.pc = 0;
+        step(&mut ctx, &mut mem);
+        assert_eq!(ctx.vr[3].as_f32x4(), [16.0f32; 4], "VA×VD+VB = 2*3+10 = 16");
+    }
+
    /// VMX128 `vnmsubfp128 vD, vA, vB` (key2 = 0b010101). Canary
    /// `ppc_emit_altivec.cc:1133-1160` flushes all three inputs in the
    /// helper. Semantics: `vD <- -((vA * vB) - vD) = vD - vA*vB`.
@@ -5798,6 +5834,59 @@ mod tests {
        assert_eq!(ctx.timebase >> 32, 0xAAAA_BBBB);
    }

+    // PPCBUG-053: bcx CTR zero-test must use 32-bit comparison. When prior
+    // 64-bit pollution (e.g. via negx → mtctr) leaves CTR upper 32 bits
+    // non-zero, the 64-bit `ctx.ctr != 0` would loop forever even when the
+    // 32-bit counter has decremented to zero.
+    #[test]
+    fn bcx_bdnz_uses_32bit_ctr_compare() {
+        let mut ctx = PpcContext::new();
+        let mut mem = TestMem::new();
+        ctx.ctr = 0x0000_0001_0000_0001;
+        // bdnz +8: BO=16 (decrement, branch if CTR!=0, ignore CR), BI=0, BD/4=2
+        let raw = (16u32 << 26) | (16 << 21) | (0 << 16) | (2 << 2);
+        write_instr(&mut mem, 0, raw);
+        ctx.pc = 0;
+        step(&mut ctx, &mut mem);
+        // After decrement: low 32 = 0, high 32 = 1. 32-bit test says zero → no branch.
+        assert_eq!(ctx.ctr, 0x0000_0001_0000_0000);
+        assert_eq!(ctx.pc, 4);
+    }
+
+    #[test]
+    fn bclrx_uses_32bit_ctr_compare() {
+        let mut ctx = PpcContext::new();
+        let mut mem = TestMem::new();
+        ctx.ctr = 0x0000_0001_0000_0001;
+        ctx.lr = 0x100;
+        // bdnzlr: opcode 19, BO=16 (decrement, branch if CTR!=0), BI=0, XO=16
+        let raw = (19u32 << 26) | (16 << 21) | (0 << 16) | (16 << 1);
+        write_instr(&mut mem, 0, raw);
+        ctx.pc = 0;
+        step(&mut ctx, &mut mem);
+        // 32-bit CTR=0 after decrement → don't branch to LR.
+        assert_eq!(ctx.ctr, 0x0000_0001_0000_0000);
+        assert_eq!(ctx.pc, 4);
+    }
+
+    // PPCBUG-054: mtspr CTR must truncate the source GPR to 32 bits, matching
+    // canary's `f.Truncate(ctr, INT32_TYPE)`. Prevents upstream 64-bit GPR
+    // pollution from poisoning the 32-bit CTR counter independently of the
+    // bcx zero-test fix.
+    #[test]
+    fn mtspr_ctr_truncates_to_32_bits() {
+        let mut ctx = PpcContext::new();
+        let mut mem = TestMem::new();
+        ctx.gpr[3] = 0xFFFF_FFFF_8000_0001;
+        // mtspr CTR (9), r3
+        let spr_swapped = ((9u32 & 0x1F) << 5) | ((9u32 >> 5) & 0x1F);
+        let raw = (31u32 << 26) | (3 << 21) | (spr_swapped << 11) | (467 << 1);
+        write_instr(&mut mem, 0, raw);
+        ctx.pc = 0;
+        step(&mut ctx, &mut mem);
+        assert_eq!(ctx.ctr, 0x8000_0001);
+    }
+
    // ---------- Block-cache parity tests ----------
    //
    // These confirm that running a program through the basic-block
@@ -6395,13 +6484,25 @@ mod tests {
    // For vpkd3d128, PPC bits 21-22 are always 1 (key2 discriminant), so
    // vd128 is always in range [96, 127] for vd_lo in [0, 31].

-    fn encode_vpkd3d128(vd_lo: u32, vb_lo: u32, imm: u32, z: u32) -> u32 {
-        // op6=6 (all VMX128 compute ops); VD[4:0] at host 25-21; IMM at host 20-16;
-        // VB[4:0] at host 15-11; host bits 10,9 = 1,1 (PPC bits 21,22, key2=0b110);
-        // z (2-bit) at host 7-6 (PPC bits 24-25); host bit 4 = 1 (PPC bit 27, key2 low=0b01).
-        // decode_op6 key2 = (bits21-23<<4)|(bits26-27) = (0b110<<4)|0b01 = 0b1100001.
-        (6u32 << 26) | (vd_lo << 21) | (imm << 16) | (vb_lo << 11)
-            | (1 << 10) | (1 << 9) | (z << 6) | (1 << 4)
+    fn encode_vpkd3d128(vd: u32, vb_lo: u32, imm: u32, z: u32) -> u32 {
+        // Test encoder, op6=6, FormatVX128_4 layout (canary); `vd` is the full 7-bit VD128 number:
+        //   VD low at PPC 6-10 (host 21-25); VD high (2 bits) at PPC 28-29 (host 2-3).
+        //   IMM at PPC 11-15 (host 16-20); VB low at PPC 16-20 (host 11-15).
+        //   z (2-bit) at PPC 24-25 (host 6-7).
+        //   key2 = (bits 21-23 << 4) | bits 26-27 = 0b1100001:
+        //     bits 21-23 = 0b110 → bit 21=1, bit 22=1, bit 23=0
+        //     bits 26-27 = 0b01  → bit 26=0, bit 27=1
+        let vd_lo = vd & 0x1F;
+        let vd_hi = (vd >> 5) & 0x3;
+        (6u32 << 26)
+            | (vd_lo << 21)
+            | (vd_hi << 2)
+            | (imm << 16)
+            | (vb_lo << 11)
+            | (1 << 10)        // bit 21 (key2)
+            | (1 << 9)         // bit 22 (key2)
+            | (z << 6)         // z at PPC 24-25
+            | (1 << 4)         // bit 27 (key2)
+    }

    #[test]
@@ -6415,7 +6516,7 @@ mod tests {
        ctx.vr[1] = xenia_types::Vec128::from_f32x4(1.0, 0.0, 0.0, 0.0);
        // prev vd=96: sentinel values that should NOT appear in result
        ctx.vr[96] = xenia_types::Vec128::from_u32x4(0xAAAA_AAAA, 0xBBBB_BBBB, 0xCCCC_CCCC, 0xDDDD_DDDD);
-        write_instr(&mem, 0, encode_vpkd3d128(0, 1, 0, 0));
+        write_instr(&mem, 0, encode_vpkd3d128(96, 1, 0, 0));
        ctx.pc = 0;
        step(&mut ctx, &mem);
        let r = ctx.vr[96].as_u32x4();
@@ -6435,7 +6536,7 @@ mod tests {
        let mem = TestMem::new();
        ctx.vr[1] = xenia_types::Vec128::from_f32x4(1.0, 0.0, 0.0, 0.0); // out[3]=0x00FF_0000
        ctx.vr[96] = xenia_types::Vec128::from_u32x4(0x1111_1111, 0x2222_2222, 0x3333_3333, 0x4444_4444);
-        write_instr(&mem, 0, encode_vpkd3d128(0, 1, 1, 0));
+        write_instr(&mem, 0, encode_vpkd3d128(96, 1, 1, 0));
        ctx.pc = 0;
        step(&mut ctx, &mem);
        let r = ctx.vr[96].as_u32x4();
@@ -6454,7 +6555,7 @@ mod tests {
        let mem = TestMem::new();
        ctx.vr[1] = xenia_types::Vec128::from_f32x4(1.0, 0.0, 0.0, 0.0); // out[3]=0x00FF_0000
        ctx.vr[96] = xenia_types::Vec128::from_u32x4(0x1111_1111, 0x2222_2222, 0x3333_3333, 0x4444_4444);
-        write_instr(&mem, 0, encode_vpkd3d128(0, 1, 1, 3));
+        write_instr(&mem, 0, encode_vpkd3d128(96, 1, 1, 3));
        ctx.pc = 0;
        step(&mut ctx, &mem);
        let r = ctx.vr[96].as_u32x4();
@@ -6463,4 +6564,47 @@ mod tests {
        assert_eq!(r[2], 0x3333_3333, "pack=1 shift=3: lane 2 from prev");
        assert_eq!(r[3], 0x4444_4444, "pack=1 shift=3: lane 3 from prev");
    }
+
+    // ---- PPCBUG-510: stvewx128 should write one word (4 bytes), not 16 ----
+
+    fn encode_stvewx128(vs_lo: u32, ra: u32, rb: u32) -> u32 {
+        // Builds a raw stvewx128 word; stvewx128 is a VMX128 load/store at op6=4.
+        // decode_op4 key1 = (bits21-27 << 4) | bits30-31 = 0b00110000011 for stvewx128.
+        // bits21-27 = 0b0011000 (host bits 10-4), bits30-31 = 0b11 (host bits 1-0).
+        // VS128[4:0] at host bits 25-21; RA at host bits 20-16; RB at host bits 15-11.
+        // VS128[6:5] at host bits 3-2 (PPC bits 28-29) are left 0 → only v0-v31 encodable.
+        (4u32 << 26)
+            | (vs_lo << 21)     // VS128[4:0] (not masked: caller passes 0-31)
+            | (ra << 16)        // RA
+            | (rb << 11)        // RB
+            | (0b0011000 << 4)  // bits 21-27 of key1 pattern
+            | 0b11              // bits 30-31 of key1 pattern
+    }
+
+    #[test]
+    fn stvewx128_writes_one_word_at_word_aligned_ea() {
+        // PPCBUG-510: old code wrote all 16 bytes at ea & !0xF, corrupting 12 adjacent bytes.
+        // Fix: word-align EA, extract lane from (ea & 0xF) >> 2, write 4 bytes only.
+        let mut ctx = PpcContext::new();
+        let mem = TestMem::new();
+        // Source register: encode_stvewx128 places vs_lo at host bits 25-21 (PPC 6-10)
+        // and otherwise sets only key bits (host bits 8-7 and 1-0).
+        // Under canary's layout the high VS128 bits live at PPC 28-29 (host 3-2),
+        // which the encoder leaves clear, so the key bits never leak into the
+        // register number and the decoded index equals vs_lo.
+        // We use vs_lo=3 → vs128=3.
+        let raw = encode_stvewx128(3, 1, 2);
+        ctx.vr[3] = xenia_types::Vec128::from_u32x4(0x1111_1111, 0x2222_2222, 0x3333_3333, 0x4444_4444);
+        ctx.gpr[1] = 0x1000; // base
+        ctx.gpr[2] = 0x008;  // offset → EA = 0x1008 → word-aligned EA = 0x1008, slot = (0x8 & 0xF)>>2 = 2
+        write_instr(&mem, 0, raw);
+        ctx.pc = 0;
+        step(&mut ctx, &mem);
+        assert_eq!(ctx.pc, 4, "PC must advance");
+        // Slot 2 → lane 2 = 0x3333_3333
+        assert_eq!(mem.read_u32(0x1008), 0x3333_3333, "only lane 2 word at ea");
+        // Words at 0x1000 and 0x100C must stay zero (assumes zero-init mem). NOTE(review): 0x1004 is not checked.
+        assert_eq!(mem.read_u32(0x1000), 0x0000_0000, "byte below must be untouched");
+        assert_eq!(mem.read_u32(0x100C), 0x0000_0000, "byte above must be untouched");
+    }
 }
--- a/crates/xenia-cpu/tests/disasm_goldens.rs
+++ b/crates/xenia-cpu/tests/disasm_goldens.rs
@@ -50,6 +50,23 @@ fn fixture_path(name: &str) -> PathBuf {
        .join(name)
 }

+/// Build a raw VMX128 VX128-form (or VX128_R/_2) word using canary's 7-bit
+/// register split. VD: low 5 bits at PPC 6-10, bits 6:5 at PPC 28-29.
+/// VA: low 5 bits at PPC 11-15, bit 5 at PPC 26, bit 6 at PPC 21.
+/// VB: low 5 bits at PPC 16-20, bits 6:5 at PPC 30-31. `secondary_bits` is
+/// OR-ed in unmasked: secondary opcode + VC + Rc + key bits, as needed.
+fn encode_vx128(op6: u32, vd: u32, va: u32, vb: u32, secondary_bits: u32) -> u32 {
+    ((op6 & 0x3F) << 26)                // primary opcode, host bits 31-26
+        | ((vd & 0x1F) << 21)           // VD[4:0], host bits 25-21
+        | (((vd >> 5) & 0x3) << 2)      // VD[6:5], host bits 3-2
+        | ((va & 0x1F) << 16)           // VA[4:0], host bits 20-16
+        | (((va >> 5) & 0x1) << 5)      // VA[5], host bit 5
+        | (((va >> 6) & 0x1) << 10)     // VA[6], host bit 10
+        | ((vb & 0x1F) << 11)           // VB[4:0], host bits 15-11
+        | (((vb >> 5) & 0x3) << 0)      // VB[6:5], host bits 1-0
+        | secondary_bits
+}
+
 fn build_rows(cases: &[(u32, u32, &str)]) -> Vec<GoldenRow> {
    cases
        .iter()
@@ -428,77 +445,57 @@ fn vmx128_registers() {
        ((4u32 << 26) | (5 << 11) | 1604, 0x82000000, "mtvscr v5"),
    ];

-    // VMX128 op=5 — uses vd128/va128/vb128 (7-bit registers, high bits at
-    // 21+22). These are the silent-bug-area encodings; we exercise low
-    // register indices here because the secondary-opcode key for op=5
-    // includes bits 21-22, constraining vd128 high bits to 0 in this form.
-    // High-index examples for vd128 live in the op=6 series below.
+    // VMX128 op=5: vperm128 v3, v4, v5, vc=0. Canary FormatVX128: VD low
+    // at PPC 6-10, VA low at PPC 11-15, VB low at PPC 16-20, VC at PPC 23-25.
+    // key1 = (bit22<<5)|bit27 = 0 selects vperm128.
    let vmx128_op5 = [
-        // vaddfp128 v3, v4, v5   : op=5, key2=0b000001
-        ((5u32 << 26) | (3 << 21) | (4 << 16) | (5 << 11) | (0 << 6) | (1 << 0), 0x82000000, "vaddfp128 (encoded sloppily)"),
+        (encode_vx128(5, 3, 4, 5, 0), 0x82000000, "vperm128 v3, v4, v5, 0 (canary)"),
    ];

-    // VMX128 op=6 — vrlimi128 has secondary key in bits 23-25 + 26-27, so
-    // bits 21-22 ARE the high bits of vd128 (canonical silent-bug-area).
-    // These instructions exercise vd128 = 32, 64, 96 — covering the bit-21
-    // and bit-22 split that ppc.rs's old extractor (now deleted) miscoded.
-    let vrlimi128 = |vd: u32, vb: u32, imm: u32, z: u32| -> u32 {
-        // op=6, vd128 = bits 6-10 + bit 21 + bit 22, vb128 = bits 16-20 + bits 30+31,
-        //  IMM = bits 11-15, Z = bits 24-25, key2 = (bits 23-25 << 4) | bits 26-27 = 0b1110001
-        let vd_lo = vd & 0x1F;
-        let vd_b21 = (vd >> 5) & 1;
-        let vd_b22 = (vd >> 6) & 1;
-        let vb_lo = vb & 0x1F;
-        let vb_b30 = (vb >> 5) & 1;
-        let vb_b31 = (vb >> 6) & 1;
-        // bits 23-25 = 111, bits 26-27 = 00, bit 27 = 1 → key2 lower 4 bits = 0001
-        // Encoded: bits 23-25 = 111, bits 26-27 = 00 are actually overlapping with z field (bits 24-25)
-        // The plan view: (bits 23 << 6) | (bits 24-25 << 4) | (bits 26-27 << 2) but the table uses different.
-        // Easiest: hand-encode known bit pattern matching decoder.rs's match:
-        //   key2 = (extract_bits(code, 23, 25) << 4) | extract_bits(code, 26, 27) = 0b1110001
-        //   bits 23-25 = 111, bits 26-27 = 01
-        // Bit positions 23-27 = 11101 (5 bits, MSB at 23).
-        //   PPC bit 23 (LSB index 8): set
-        //   PPC bit 24 (LSB index 7): set  -- this is z bit 0
-        //   PPC bit 25 (LSB index 6): set  -- this is z bit 1
-        //   PPC bit 26 (LSB index 5): unset
-        //   PPC bit 27 (LSB index 4): set
-        // We let z = bits 24-25 stored with vd128 bits at 21-22.
-        // To preserve key2 = 0b1110001, we need bits 24-25 = 11, bit 26 = 0, bit 27 = 1.
-        // BUT bits 24-25 ARE the z field; if we set them = 11 the z value is 3.
-        // So Z is constrained for vrlimi128. Choose Z = 3 (matches Sylpheed examples).
-        let z3 = z & 0x3;
-        (6u32 << 26)
-            | (vd_lo << 21)
-            | (imm << 16)
-            | (vb_lo << 11)
-            | (vd_b21 << 10)  // bit 21 (LSB pos 10)
-            | (vd_b22 << 9)   // bit 22 (LSB pos 9)
-            | (1 << 8)        // bit 23
-            | (z3 << 6)       // bits 24-25
-            | (0 << 5)        // bit 26
-            | (1 << 4)        // bit 27
-            | (vb_b30 << 1)   // bit 30
-            | vb_b31          // bit 31
+    // VMX128 op=6 — exercise full 0-127 vd128 range under canary's layout.
+    // VD128h is at PPC 28-29 (host 2-3): no overlap with secondary opcode key,
+    // so vd can be freely 0-127 for any op6 instruction.
+    let vsrw128 = |vd: u32, vb: u32| -> u32 {
+        // vsrw128 secondary: 0x000001D0 (decode_op6 key5 = 0b011101).
+        encode_vx128(6, vd, 0, vb, 0x000001D0)
+    };
+    let vpermwi128 = |vd: u32, vb: u32, perm: u32| -> u32 {
+        // vpermwi128: PERMl at PPC 11-15, PERMh at PPC 23-25, key1 sets bit 22 + bit 27.
+        let perml = perm & 0x1F;
+        let permh = (perm >> 5) & 0x7;
+        let mut raw = (6u32 << 26)
+            | ((vd & 0x1F) << 21)
+            | (((vd >> 5) & 0x3) << 2)  // VD128h
+            | (perml << 16)
+            | ((vb & 0x1F) << 11)
+            | (((vb >> 5) & 0x3) << 0)  // VB128h
+            | (permh << 6)              // PERMh at PPC 23-25
+            | (1 << 9)                  // bit 22 (key1 high)
+            | (1 << 4);                 // bit 27 (key1 low)
+        raw &= !(1 << 10); // PPC 21 = 0 for vpermwi128
+        raw
+    };
+    let vrlimi128 = |vd: u32, vb: u32, imm: u32, z: u32| -> u32 {
+        // vrlimi128: IMM at PPC 11-15, z at PPC 24-25, key2 = 0b1110001 over
+        // bits 21-23 + 26-27 → bits 21,22,23 = 1, bit 26 = 0, bit 27 = 1.
+        (6u32 << 26)
+            | ((vd & 0x1F) << 21)
+            | (((vd >> 5) & 0x3) << 2)   // VD128h
+            | ((imm & 0x1F) << 16)
+            | ((vb & 0x1F) << 11)
+            | (((vb >> 5) & 0x3) << 0)   // VB128h
+            | ((z & 0x3) << 6)           // z at PPC 24-25 = host 6-7
+            | (1 << 8)                   // bit 23 (key2)
+            | (1 << 9)                   // bit 22 (key2)
+            | (1 << 10)                  // bit 21 (key2)
+            | (1 << 4)                   // bit 27 (key2)
    };
-    // Note: VMX128 op6 secondary keys constrain bits 21-23. For
-    // vrlimi128 (key2 = 0b1110001 over bits 21-23 + 26-27) the only
-    // valid vd128 range is 96..=127 — lower values change the secondary
-    // key into some other instruction. The cases below record what the
-    // disassembler emits for the borderline encodings, so a regression
-    // in either the lookup table or the formatter would surface here.
    let vmx128_high = [
-        // bits 21-22 = 00 → key2 ≠ vrlimi128 → decodes to vsrw128 (key5
-        // branch). Locks current behavior; shows the silent-bug-area
-        // encoding constraint.
-        (vrlimi128(0, 12, 4, 3), 0x82000000, "encoding vd_hi=00: actually vsrw128"),
-        // bits 21-22 = 10 → still not vrlimi128.
-        (vrlimi128(32, 12, 4, 3), 0x82000000, "encoding vd_hi=10: actually vsrw128 v32"),
-        // bits 21-22 = 01 → key1 matches vpermwi128.
-        (vrlimi128(64, 12, 4, 3), 0x82000000, "encoding vd_hi=01: actually vpermwi128"),
-        // bits 21-22 = 11 → key2 matches vrlimi128 with vd128=96.
-        (vrlimi128(96, 12, 4, 3), 0x82000000, "vrlimi128 v96, v12, 4, 3 (real)"),
-        (vrlimi128(127, 127, 4, 3), 0x82000000, "vrlimi128 v127, v127, 4, 3 (real)"),
+        (vsrw128(0, 12),               0x82000000, "vsrw128 v0, v0, v12 (canary, vd_hi=00)"),
+        (vsrw128(32, 12),              0x82000000, "vsrw128 v32, v0, v12 (canary, VD128h=01)"),
+        (vpermwi128(64, 12, 0xE4),     0x82000000, "vpermwi128 v64, v12, 0xE4 (canary, VD128h=10)"),
+        (vrlimi128(96, 12, 4, 3),      0x82000000, "vrlimi128 v96, v12, 4, 3 (canary, VD128h=11)"),
+        (vrlimi128(127, 95, 4, 3),     0x82000000, "vrlimi128 v127, v95, 4, 3 (canary)"),
    ];

    // Fix 4: VMX128 multiply-add 4-operand layouts. Per canary, the addend
@@ -514,12 +511,11 @@ fn vmx128_registers() {
    //     vmaddcfp128  VD, VA, VD, VB  → "v3, v35, v3, v5"
    //     vnmsubfp128  VD, VA, VD, VB  → "v3, v35, v3, v5"
    let vmx128_4op = [
-        // vmaddfp128: bits 24=1, 25=1, 27=1, bit 29=1 (VA high), VB=5
-        (0x146028D4u32, 0x82000000, "vmaddfp128 v3, v35, v5, v3"),
-        // vmaddcfp128: bits 23=1, 27=1, bit 29=1, VB=5
-        (0x14602914u32, 0x82000000, "vmaddcfp128 v3, v35, v3, v5"),
-        // vnmsubfp128: bits 23=1, 25=1, 27=1, bit 29=1, VB=5
-        (0x14602954u32, 0x82000000, "vnmsubfp128 v3, v35, v3, v5"),
+        // Canary FormatVX128 layout: vd=3 (PPC 6-10), va=35 (low 5 bits = 3 at PPC 11-15 + VA128h=1 at PPC 26),
+        // vb=5 (PPC 16-20), key2 at PPC 22-25 + bit 27.
+        (0x146328F0u32, 0x82000000, "vmaddfp128 v3, v35, v5, v3"),
+        (0x14632930u32, 0x82000000, "vmaddcfp128 v3, v35, v3, v5"),
+        (0x14632970u32, 0x82000000, "vnmsubfp128 v3, v35, v3, v5"),
    ];

    let mut all = Vec::new();
--- a/crates/xenia-cpu/tests/golden/extended_mnemonics.json
+++ b/crates/xenia-cpu/tests/golden/extended_mnemonics.json
@@ -366,7 +366,7 @@
      "addr": "0x82000000",
      "mnemonic": "bc",
      "operands": "16, lt, 0x82000040",
-      "ext_mnemonic": "bdnzge",
+      "ext_mnemonic": "bdnz",
      "ext_operands": "0x82000040",
      "branch_target": "0x82000040"
    },
@@ -376,7 +376,7 @@
      "addr": "0x82000000",
      "mnemonic": "bc",
      "operands": "18, lt, 0x82000040",
-      "ext_mnemonic": "bdzge",
+      "ext_mnemonic": "bdz",
      "ext_operands": "0x82000040",
      "branch_target": "0x82000040"
    },
@@ -525,7 +525,9 @@
      "raw": "0x7C2004AC",
      "addr": "0x82000000",
      "mnemonic": "sync",
-      "operands": ""
+      "operands": "",
+      "ext_mnemonic": "lwsync",
+      "ext_operands": ""
    },
    {
      "label": "trap",
--- a/crates/xenia-cpu/tests/golden/vmx128_registers.json
+++ b/crates/xenia-cpu/tests/golden/vmx128_registers.json
@@ -71,64 +71,64 @@
      "operands": "v5"
    },
    {
-      "label": "vaddfp128 (encoded sloppily)",
-      "raw": "0x14642801",
+      "label": "vperm128 v3, v4, v5, 0 (canary)",
+      "raw": "0x14642800",
      "addr": "0x82000000",
      "mnemonic": "vperm128",
-      "operands": "v3, v3, v5, 0"
+      "operands": "v3, v4, v5, 0"
    },
    {
-      "label": "encoding vd_hi=00: actually vsrw128",
-      "raw": "0x180461D0",
+      "label": "vsrw128 v0, v0, v12 (canary, vd_hi=00)",
+      "raw": "0x180061D0",
      "addr": "0x82000000",
      "mnemonic": "vsrw128",
      "operands": "v0, v0, v12"
    },
    {
-      "label": "encoding vd_hi=10: actually vsrw128 v32",
-      "raw": "0x180465D0",
+      "label": "vsrw128 v32, v0, v12 (canary, VD128h=01)",
+      "raw": "0x180061D4",
      "addr": "0x82000000",
      "mnemonic": "vsrw128",
      "operands": "v32, v0, v12"
    },
    {
-      "label": "encoding vd_hi=01: actually vpermwi128",
-      "raw": "0x180463D0",
+      "label": "vpermwi128 v64, v12, 0xE4 (canary, VD128h=10)",
+      "raw": "0x180463D8",
      "addr": "0x82000000",
      "mnemonic": "vpermwi128",
      "operands": "v64, v12, 0xE4"
    },
    {
-      "label": "vrlimi128 v96, v12, 4, 3 (real)",
-      "raw": "0x180467D0",
+      "label": "vrlimi128 v96, v12, 4, 3 (canary, VD128h=11)",
+      "raw": "0x180467DC",
      "addr": "0x82000000",
      "mnemonic": "vrlimi128",
      "operands": "v96, v12, 4, 3"
    },
    {
-      "label": "vrlimi128 v127, v127, 4, 3 (real)",
-      "raw": "0x1BE4FFD3",
+      "label": "vrlimi128 v127, v95, 4, 3 (canary)",
+      "raw": "0x1BE4FFDE",
      "addr": "0x82000000",
      "mnemonic": "vrlimi128",
      "operands": "v127, v95, 4, 3"
    },
    {
      "label": "vmaddfp128 v3, v35, v5, v3",
-      "raw": "0x146028D4",
+      "raw": "0x146328F0",
      "addr": "0x82000000",
      "mnemonic": "vmaddfp128",
      "operands": "v3, v35, v5, v3"
    },
    {
      "label": "vmaddcfp128 v3, v35, v3, v5",
-      "raw": "0x14602914",
+      "raw": "0x14632930",
      "addr": "0x82000000",
      "mnemonic": "vmaddcfp128",
      "operands": "v3, v35, v3, v5"
    },
    {
      "label": "vnmsubfp128 v3, v35, v3, v5",
-      "raw": "0x14602954",
+      "raw": "0x14632970",
      "addr": "0x82000000",
      "mnemonic": "vnmsubfp128",
      "operands": "v3, v35, v3, v5"