From 147daa0721fcd6971f3e2e0a6d7c9a4f1ea71d68 Mon Sep 17 00:00:00 2001
From: MechaCat02 <fabian@diekaulbachs.de>
Date: Fri, 1 May 2026 20:54:40 +0200
Subject: [PATCH 1/6] fix(cpu): PPCBUG-040 PPCBUG-560 fix sh64() bit order and
 rldicl test helper
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PPCBUG-040: decoder.rs sh64() assembled the XS-form shift amount as
(SH[4:0] << 1) | SH[5] instead of (SH[5] << 5) | SH[4:0]. Every
`sradi` with shift N ∈ 1..=62 executed with a completely wrong shift
count (e.g. shift=32 executed as shift=1).

PPCBUG-560: disasm_goldens.rs rldicl() test helper was encoding sh[5:1]
at PPC bits 16-20 and sh[0] at PPC bit 30 — exactly backwards. The wrong
encoder and wrong decoder cancelled out, hiding PPCBUG-040 from tests.
Fix both together so tests validate ISA-correct encodings.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 crates/xenia-cpu/src/decoder.rs               |  2 +-
 crates/xenia-cpu/tests/disasm_goldens.rs      | 37 +++++++++++++++----
 .../tests/golden/extended_mnemonics.json      |  4 +-
 3 files changed, 33 insertions(+), 10 deletions(-)

diff --git a/crates/xenia-cpu/src/decoder.rs b/crates/xenia-cpu/src/decoder.rs
index 136d3e3..28e55dc 100644
--- a/crates/xenia-cpu/src/decoder.rs
+++ b/crates/xenia-cpu/src/decoder.rs
@@ -89,7 +89,7 @@ impl DecodedInstr {
 
     /// SH field for 64-bit shifts (bits 16-20 + bit 30)
     #[inline] pub fn sh64(&self) -> u32 {
-        (extract_bits(self.raw, 16, 20) << 1) | extract_bits(self.raw, 30, 30)
+        (extract_bits(self.raw, 30, 30) << 5) | extract_bits(self.raw, 16, 20)
     }
 
     /// SPR field (bits 11-20, swapped halves)
diff --git a/crates/xenia-cpu/tests/disasm_goldens.rs b/crates/xenia-cpu/tests/disasm_goldens.rs
index 6c39d54..97b3825 100644
--- a/crates/xenia-cpu/tests/disasm_goldens.rs
+++ b/crates/xenia-cpu/tests/disasm_goldens.rs
@@ -20,7 +20,7 @@ use std::path::PathBuf;
 
 use serde::{Deserialize, Serialize};
 
-use xenia_cpu::decoder::decode;
+use xenia_cpu::decoder::{DecodedInstr, decode};
 use xenia_cpu::disasm::format;
 
 #[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize)]
@@ -158,20 +158,20 @@ fn rlwinm(rs: u32, ra: u32, sh: u32, mb: u32, me: u32, rc: u32) -> u32 {
 }
 
 fn rldicl(rs: u32, ra: u32, sh: u32, mb: u32, rc: u32) -> u32 {
-    // MD-form, op30 xo=0. sh split: bits 16-20 (high 5) + bit 30 (low bit).
-    // mb split: bits 21-25 (low 5) + bit 26 (high bit).
-    let sh_hi = (sh >> 1) & 0x1F;
-    let sh_lo = sh & 1;
+    // MD-form: sh[4:0] at PPC bits 16-20 (host bits 11-15); sh[5] at PPC bit 30 (host bit 1).
+    // mb[4:0] at PPC bits 21-25 (host bits 6-10); mb[5] at PPC bit 26 (host bit 5).
+    let sh_lo = sh & 0x1F;
+    let sh_hi = (sh >> 5) & 1;
     let mb_lo = mb & 0x1F;
     let mb_hi = (mb >> 5) & 1;
     (30 << 26)
         | (rs << 21)
         | (ra << 16)
-        | (sh_hi << 11)
+        | (sh_lo << 11)
         | (mb_lo << 6)
         | (mb_hi << 5)
         | (0 << 2)
-        | (sh_lo << 1)
+        | (sh_hi << 1)
         | rc
 }
 
@@ -529,3 +529,26 @@ fn vmx128_registers() {
     all.extend_from_slice(&vmx128_4op);
     assert_or_regen("vmx128_registers.json", &all);
 }
+
+#[test]
+fn sradi_shift_32_decodes_to_32() {
+    // sradi rA, rS, 32: sh=32 → sh[4:0]=0, sh[5]=1
+    // After PPCBUG-040 fix, sh64() must return 32, not 1.
+    let instr: DecodedInstr = decode(rldicl(3, 4, 32, 63, 0), 0);
+    // Encoded as rldicl, not sradi: both forms split sh across PPC bits 16-20 and bit 30, so this still exercises sh64().
+    assert_eq!(instr.sh64(), 32, "sh64 must return 32 for sh=32 (sh5=1, sh_lo=0)");
+}
+
+#[test]
+fn sh64_shift_1_decodes_correctly() {
+    // sh=1: sh[4:0]=1, sh[5]=0 → sh64() must return 1
+    let instr: DecodedInstr = decode(rldicl(3, 4, 1, 0, 0), 0);
+    assert_eq!(instr.sh64(), 1, "sh64 must return 1 for sh=1");
+}
+
+#[test]
+fn sh64_shift_63_decodes_correctly() {
+    // sh=63: sh[4:0]=31=0x1F, sh[5]=1 → sh64() must return 63
+    let instr: DecodedInstr = decode(rldicl(3, 4, 63, 0, 0), 0);
+    assert_eq!(instr.sh64(), 63, "sh64 must return 63 for sh=63");
+}
diff --git a/crates/xenia-cpu/tests/golden/extended_mnemonics.json b/crates/xenia-cpu/tests/golden/extended_mnemonics.json
index d869109..efb251b 100644
--- a/crates/xenia-cpu/tests/golden/extended_mnemonics.json
+++ b/crates/xenia-cpu/tests/golden/extended_mnemonics.json
@@ -182,7 +182,7 @@
     },
     {
       "label": "srdi r3, r4, 8",
-      "raw": "0x7883E200",
+      "raw": "0x7883C202",
       "addr": "0x82000000",
       "mnemonic": "rldicl",
       "operands": "r3, r4, 56, 8",
@@ -191,7 +191,7 @@
     },
     {
       "label": "rotldi r3, r4, 8",
-      "raw": "0x78832000",
+      "raw": "0x78834000",
       "addr": "0x82000000",
       "mnemonic": "rldicl",
       "operands": "r3, r4, 8, 0",

From 75544fa9db8a1e6cf7ec228afca2c75f766afad0 Mon Sep 17 00:00:00 2001
From: MechaCat02 <fabian@diekaulbachs.de>
Date: Fri, 1 May 2026 21:01:03 +0200
Subject: [PATCH 2/6] fix(cpu): PPCBUG-046 PPCBUG-561 add mb_md() accessor; fix
 all 6 rld* mb fields
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PPCBUG-561: Add DecodedInstr::mb_md() to decoder.rs — the correct MD-form
6-bit mask-begin reconstruction (MB[4:0] at PPC bits 21-25, MB[5] at PPC
bit 26). The disassembler already had the correct local formula; this
promotes it to a single source of truth on DecodedInstr.

PPCBUG-046: All 6 doubleword-rotate arms (rldicl, rldicr, rldic, rldimi,
rldcl, rldcr) inlined "(instr.mb() << 1) | ((instr.raw >> 1) & 1)" which
reads SH5 (host bit 1) instead of MB5 (host bit 5). For the canonical
"clrldi r3, r4, 32" zero-extend idiom (mb=32 → MB5=1, MB[4:0]=0), the
wrong formula produced mb=0, making the instruction a no-op and leaving
upper 32 bits of the GPR polluted. Replace all 6 sites with instr.mb_md().

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 crates/xenia-cpu/src/decoder.rs     |  6 ++++
 crates/xenia-cpu/src/interpreter.rs | 56 +++++++++++++++++++++++++----
 2 files changed, 56 insertions(+), 6 deletions(-)

diff --git a/crates/xenia-cpu/src/decoder.rs b/crates/xenia-cpu/src/decoder.rs
index 28e55dc..80a5a7e 100644
--- a/crates/xenia-cpu/src/decoder.rs
+++ b/crates/xenia-cpu/src/decoder.rs
@@ -92,6 +92,12 @@ impl DecodedInstr {
         (extract_bits(self.raw, 30, 30) << 5) | extract_bits(self.raw, 16, 20)
     }
 
+    /// MB/ME field for MD-form and MDS-form instructions (6-bit field, split encoding).
+    /// MB[4:0] at PPC bits 21-25; MB[5] at PPC bit 26.
+    #[inline] pub fn mb_md(&self) -> u32 {
+        extract_bits(self.raw, 21, 25) | (extract_bits(self.raw, 26, 26) << 5)
+    }
+
     /// SPR field (bits 11-20, swapped halves)
     #[inline] pub fn spr(&self) -> u32 {
         let spr_raw = extract_bits(self.raw, 11, 20);
diff --git a/crates/xenia-cpu/src/interpreter.rs b/crates/xenia-cpu/src/interpreter.rs
index c22cd0b..90177f2 100644
--- a/crates/xenia-cpu/src/interpreter.rs
+++ b/crates/xenia-cpu/src/interpreter.rs
@@ -693,7 +693,7 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
         PpcOpcode::rldiclx => {
             let rs = ctx.gpr[instr.rs()];
             let sh = instr.sh64();
-            let mb = (instr.mb() << 1) | ((instr.raw >> 1) & 1); // 6-bit mb
+            let mb = instr.mb_md();
             let rotated = rs.rotate_left(sh);
             let mask = rld_mask_left(mb);
             ctx.gpr[instr.ra()] = rotated & mask;
@@ -703,7 +703,7 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
         PpcOpcode::rldicrx => {
             let rs = ctx.gpr[instr.rs()];
             let sh = instr.sh64();
-            let me = (instr.mb() << 1) | ((instr.raw >> 1) & 1); // 6-bit me
+            let me = instr.mb_md();
             let rotated = rs.rotate_left(sh);
             let mask = rld_mask_right(me);
             ctx.gpr[instr.ra()] = rotated & mask;
@@ -713,7 +713,7 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
         PpcOpcode::rldicx => {
             let rs = ctx.gpr[instr.rs()];
             let sh = instr.sh64();
-            let mb = (instr.mb() << 1) | ((instr.raw >> 1) & 1);
+            let mb = instr.mb_md();
             let rotated = rs.rotate_left(sh);
             let mask = rld_mask_left(mb) & rld_mask_right(63 - sh);
             ctx.gpr[instr.ra()] = rotated & mask;
@@ -723,7 +723,7 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
         PpcOpcode::rldimix => {
             let rs = ctx.gpr[instr.rs()];
             let sh = instr.sh64();
-            let mb = (instr.mb() << 1) | ((instr.raw >> 1) & 1);
+            let mb = instr.mb_md();
             let rotated = rs.rotate_left(sh);
             let mask = rld_mask_left(mb) & rld_mask_right(63 - sh);
             ctx.gpr[instr.ra()] = (rotated & mask) | (ctx.gpr[instr.ra()] & !mask);
@@ -733,7 +733,7 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
         PpcOpcode::rldclx => {
             let rs = ctx.gpr[instr.rs()];
             let sh = ctx.gpr[instr.rb()] & 0x3F;
-            let mb = (instr.mb() << 1) | ((instr.raw >> 1) & 1);
+            let mb = instr.mb_md();
             let rotated = rs.rotate_left(sh as u32);
             let mask = rld_mask_left(mb);
             ctx.gpr[instr.ra()] = rotated & mask;
@@ -743,7 +743,7 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
         PpcOpcode::rldcrx => {
             let rs = ctx.gpr[instr.rs()];
             let sh = ctx.gpr[instr.rb()] & 0x3F;
-            let me = (instr.mb() << 1) | ((instr.raw >> 1) & 1);
+            let me = instr.mb_md();
             let rotated = rs.rotate_left(sh as u32);
             let mask = rld_mask_right(me);
             ctx.gpr[instr.ra()] = rotated & mask;
@@ -6252,4 +6252,48 @@ mod tests {
         expected[4] = 0xAB;
         assert_eq!(ctx.vr[3].as_bytes(), expected);
     }
+
+    // ===== PPCBUG-046 / PPCBUG-561: rldicl / clrldi mb_md fix =====
+
+    /// Encode rldicl (MD-form, opcode=30, XO=0) in host bit notation.
+    /// rs: source register, ra: dest register, sh: shift amount (6-bit),
+    /// mb: mask-begin (6-bit), rc: record bit.
+    fn encode_rldicl(rs: u32, ra: u32, sh: u32, mb: u32, rc: u32) -> u32 {
+        (30 << 26)
+            | (rs << 21)
+            | (ra << 16)
+            | ((sh & 0x1F) << 11)
+            | ((mb & 0x1F) << 6)
+            | (((mb >> 5) & 1) << 5)
+            | (((sh >> 5) & 1) << 1)
+            | (rc & 1)
+    }
+
+    #[test]
+    fn clrldi_zero_extends_low_32_bits() {
+        // clrldi r3, r4, 32 = rldicl r3, r4, 0, 32, 0
+        // After PPCBUG-046 fix: mask must be 0x00000000_FFFFFFFF (mb=32 → mask from bit 32 to 63)
+        // If mb=32 was decoded as mb=0, the mask would be all-ones and the result would be 0xDEAD_BEEF_CAFE_BABE (no-op)
+        let mut ctx = PpcContext::new();
+        let mem = TestMem::new();
+        ctx.gpr[4] = 0xDEAD_BEEF_CAFE_BABE_u64;
+        let raw = encode_rldicl(4, 3, 0, 32, 0); // sh=0, mb=32
+        write_instr(&mem, 0x100, raw);
+        ctx.pc = 0x100;
+        step(&mut ctx, &mem);
+        assert_eq!(ctx.gpr[3], 0x0000_0000_CAFE_BABE, "clrldi must zero-extend low 32 bits");
+    }
+
+    #[test]
+    fn rldicl_mb32_leaves_low_32_clean() {
+        // rldicl sh=0, mb=32 on a value whose upper 32 bits are all ones: the upper half must be zeroed, the low half kept.
+        let mut ctx = PpcContext::new();
+        let mem = TestMem::new();
+        ctx.gpr[5] = 0xFFFF_FFFF_1234_5678_u64;
+        let raw = encode_rldicl(5, 6, 0, 32, 0);
+        write_instr(&mem, 0x100, raw);
+        ctx.pc = 0x100;
+        step(&mut ctx, &mem);
+        assert_eq!(ctx.gpr[6], 0x0000_0000_1234_5678_u64);
+    }
 }

From d51b9346dfc75392118e0bf20a0f17962dc7acac Mon Sep 17 00:00:00 2001
From: MechaCat02 <fabian@diekaulbachs.de>
Date: Fri, 1 May 2026 21:15:06 +0200
Subject: [PATCH 3/6] fix(cpu): PPCBUG-275 276 420 421 422 423 562 600 fix vcmp
 Rc bit + decode dot forms

PPCBUG-562: Add vc_rc_bit() (PPC bit 21) and vx128r_rc_bit() (PPC bit 27)
to decoder.rs. The generic rc_bit() reads bit 0 (PPC bit 31); all vcmp XO
values are even so bit 0 is always 0, making CR6 permanently dead.

PPCBUG-275/276/420/421: Replace rc_bit() with vc_rc_bit() at all 8 pure
VC-form vcmp arms (vcmpequb, vcmpequh, vcmpgtub, vcmpgtsb, vcmpgtuh,
vcmpgtsh, vcmpgtuw, vcmpgtsw) and with the correct per-form accessor at
the 4 combined arms (vcmpeqfp|128, vcmpgefp|128, vcmpgtfp|128,
vcmpequw|128) and vcmpbfp|128.

PPCBUG-422: VX128_R-form 128-variants in the combined arms take their Rc
from vx128r_rc_bit() (PPC bit 27), not from the VC-form vc_rc_bit() (PPC
bit 21).

PPCBUG-423/600: Add 5 dot-form key entries to decode_op6 so
vcmp*fp128./vcmpequw128. decode as the correct opcode instead of Invalid.
Uses a 5-bit key (bits22-24 + bit25 + bit27) for dot-forms to avoid
aliasing against the shift/merge group (which sets bit25=1 when bit27=1).
Interpreter uses vx128r_rc_bit() to conditionally update CR6.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 crates/xenia-cpu/src/decoder.rs     | 25 +++++++-
 crates/xenia-cpu/src/interpreter.rs | 88 ++++++++++++++++++++++++-----
 2 files changed, 97 insertions(+), 16 deletions(-)

diff --git a/crates/xenia-cpu/src/decoder.rs b/crates/xenia-cpu/src/decoder.rs
index 80a5a7e..8299447 100644
--- a/crates/xenia-cpu/src/decoder.rs
+++ b/crates/xenia-cpu/src/decoder.rs
@@ -74,6 +74,11 @@ impl DecodedInstr {
     /// Rc bit (bit 31) - record CR0
     #[inline] pub fn rc_bit(&self) -> bool { self.raw & 1 != 0 }
 
+    /// Rc for VC-form vector compare instructions — PPC bit 21 = host bit 10.
+    #[inline] pub fn vc_rc_bit(&self) -> bool { (self.raw >> 10) & 1 != 0 }
+    /// Rc for VX128_R-form vector compare instructions — PPC bit 27 = host bit 4.
+    #[inline] pub fn vx128r_rc_bit(&self) -> bool { (self.raw >> 4) & 1 != 0 }
+
     /// OE bit (bit 21) - overflow enable
     #[inline] pub fn oe(&self) -> bool { extract_bits(self.raw, 21, 21) != 0 }
 
@@ -642,9 +647,13 @@ fn decode_op6(code: u32) -> PpcOpcode {
         _ => {}
     }
 
-    // VMX128 compare
-    let key4 = (extract_bits(code, 22, 24) << 3) | extract_bits(code, 27, 27);
-    match key4 {
+    // VMX128 compare (non-dot and dot forms).
+    // Non-dot: bit 27 = 0.  Dot: bit 27 = 1, but bit 25 must also be 0 to
+    // distinguish from the shift/merge group (which has bit 25 = 1 when bit 27 = 1).
+    // key4_nd uses bits 22-24 + bit 27 (same as original, covers non-dot).
+    // key4_dt uses bits 22-24 + bit 25 + bit 27 (narrower, covers dot-only).
+    let key4_nd = (extract_bits(code, 22, 24) << 3) | extract_bits(code, 27, 27);
+    match key4_nd {
         0b000000 => return PpcOpcode::vcmpeqfp128,
         0b001000 => return PpcOpcode::vcmpgefp128,
         0b010000 => return PpcOpcode::vcmpgtfp128,
@@ -652,6 +661,16 @@ fn decode_op6(code: u32) -> PpcOpcode {
         0b100000 => return PpcOpcode::vcmpequw128,
         _ => {}
     }
+    // Dot forms: bit 27 = 1, bit 25 = 0  (5-bit key: bits 22-24 in the high 3 bits, then bit 25, then bit 27 as the LSB)
+    let key4_dt = (extract_bits(code, 22, 24) << 2) | (extract_bits(code, 25, 25) << 1) | extract_bits(code, 27, 27);
+    match key4_dt {
+        0b00001 => return PpcOpcode::vcmpeqfp128,   // bits22-24=000, bit25=0, bit27=1
+        0b00101 => return PpcOpcode::vcmpgefp128,   // bits22-24=001, bit25=0, bit27=1
+        0b01001 => return PpcOpcode::vcmpgtfp128,   // bits22-24=010, bit25=0, bit27=1
+        0b01101 => return PpcOpcode::vcmpbfp128,    // bits22-24=011, bit25=0, bit27=1
+        0b10001 => return PpcOpcode::vcmpequw128,   // bits22-24=100, bit25=0, bit27=1
+        _ => {}
+    }
 
     // VMX128 shift/merge
     let key5 = (extract_bits(code, 22, 25) << 2) | extract_bits(code, 27, 27);
diff --git a/crates/xenia-cpu/src/interpreter.rs b/crates/xenia-cpu/src/interpreter.rs
index 90177f2..65c6a9d 100644
--- a/crates/xenia-cpu/src/interpreter.rs
+++ b/crates/xenia-cpu/src/interpreter.rs
@@ -2036,7 +2036,8 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
             let mut r = [0u32; 4];
             for i in 0..4 { r[i] = if a[i] == b[i] { 0xFFFF_FFFF } else { 0 }; }
             ctx.vr[vd] = xenia_types::Vec128::from_u32x4_array(r);
-            if instr.rc_bit() { update_cr6_from_vmask(&r, ctx); }
+            let rc = if matches!(instr.opcode, PpcOpcode::vcmpeqfp128) { instr.vx128r_rc_bit() } else { instr.vc_rc_bit() };
+            if rc { update_cr6_from_vmask(&r, ctx); }
             ctx.pc += 4;
         }
         PpcOpcode::vcmpgefp | PpcOpcode::vcmpgefp128 => {
@@ -2046,7 +2047,8 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
             let mut r = [0u32; 4];
             for i in 0..4 { r[i] = if a[i] >= b[i] { 0xFFFF_FFFF } else { 0 }; }
             ctx.vr[vd] = xenia_types::Vec128::from_u32x4_array(r);
-            if instr.rc_bit() { update_cr6_from_vmask(&r, ctx); }
+            let rc = if matches!(instr.opcode, PpcOpcode::vcmpgefp128) { instr.vx128r_rc_bit() } else { instr.vc_rc_bit() };
+            if rc { update_cr6_from_vmask(&r, ctx); }
             ctx.pc += 4;
         }
         PpcOpcode::vcmpgtfp | PpcOpcode::vcmpgtfp128 => {
@@ -2056,7 +2058,8 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
             let mut r = [0u32; 4];
             for i in 0..4 { r[i] = if a[i] > b[i] { 0xFFFF_FFFF } else { 0 }; }
             ctx.vr[vd] = xenia_types::Vec128::from_u32x4_array(r);
-            if instr.rc_bit() { update_cr6_from_vmask(&r, ctx); }
+            let rc = if matches!(instr.opcode, PpcOpcode::vcmpgtfp128) { instr.vx128r_rc_bit() } else { instr.vc_rc_bit() };
+            if rc { update_cr6_from_vmask(&r, ctx); }
             ctx.pc += 4;
         }
 
@@ -2398,7 +2401,8 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
             let mut r = [0u32; 4];
             for i in 0..4 { r[i] = if a[i] == b[i] { 0xFFFF_FFFF } else { 0 }; }
             ctx.vr[vd] = xenia_types::Vec128::from_u32x4_array(r);
-            if instr.rc_bit() { update_cr6_from_vmask(&r, ctx); }
+            let rc = if matches!(instr.opcode, PpcOpcode::vcmpequw128) { instr.vx128r_rc_bit() } else { instr.vc_rc_bit() };
+            if rc { update_cr6_from_vmask(&r, ctx); }
             ctx.pc += 4;
         }
 
@@ -3528,7 +3532,7 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
             let mut r = [0u8; 16];
             for i in 0..16 { r[i] = if a[i] == b[i] { 0xFF } else { 0 }; }
             let v = xenia_types::Vec128::from_bytes(r);
-            if instr.rc_bit() {
+            if instr.vc_rc_bit() {
                 let (t, f) = crate::vmx::cr6_flags_from_mask(v);
                 ctx.cr[6] = crate::context::CrField { lt: t, gt: false, eq: f, so: false };
             }
@@ -3541,7 +3545,7 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
             let mut r = [0u16; 8];
             for i in 0..8 { r[i] = if a[i] == b[i] { 0xFFFF } else { 0 }; }
             let v = xenia_types::Vec128::from_u16x8_array(r);
-            if instr.rc_bit() {
+            if instr.vc_rc_bit() {
                 let (t, f) = crate::vmx::cr6_flags_from_mask(v);
                 ctx.cr[6] = crate::context::CrField { lt: t, gt: false, eq: f, so: false };
             }
@@ -3554,7 +3558,7 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
             let mut r = [0u8; 16];
             for i in 0..16 { r[i] = if a[i] > b[i] { 0xFF } else { 0 }; }
             let v = xenia_types::Vec128::from_bytes(r);
-            if instr.rc_bit() {
+            if instr.vc_rc_bit() {
                 let (t, f) = crate::vmx::cr6_flags_from_mask(v);
                 ctx.cr[6] = crate::context::CrField { lt: t, gt: false, eq: f, so: false };
             }
@@ -3567,7 +3571,7 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
             let mut r = [0u8; 16];
             for i in 0..16 { r[i] = if a[i] > b[i] { 0xFF } else { 0 }; }
             let v = xenia_types::Vec128::from_bytes(r);
-            if instr.rc_bit() {
+            if instr.vc_rc_bit() {
                 let (t, f) = crate::vmx::cr6_flags_from_mask(v);
                 ctx.cr[6] = crate::context::CrField { lt: t, gt: false, eq: f, so: false };
             }
@@ -3580,7 +3584,7 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
             let mut r = [0u16; 8];
             for i in 0..8 { r[i] = if a[i] > b[i] { 0xFFFF } else { 0 }; }
             let v = xenia_types::Vec128::from_u16x8_array(r);
-            if instr.rc_bit() {
+            if instr.vc_rc_bit() {
                 let (t, f) = crate::vmx::cr6_flags_from_mask(v);
                 ctx.cr[6] = crate::context::CrField { lt: t, gt: false, eq: f, so: false };
             }
@@ -3593,7 +3597,7 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
             let mut r = [0u16; 8];
             for i in 0..8 { r[i] = if a[i] > b[i] { 0xFFFF } else { 0 }; }
             let v = xenia_types::Vec128::from_u16x8_array(r);
-            if instr.rc_bit() {
+            if instr.vc_rc_bit() {
                 let (t, f) = crate::vmx::cr6_flags_from_mask(v);
                 ctx.cr[6] = crate::context::CrField { lt: t, gt: false, eq: f, so: false };
             }
@@ -3606,7 +3610,7 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
             let mut r = [0u32; 4];
             for i in 0..4 { r[i] = if a[i] > b[i] { 0xFFFFFFFF } else { 0 }; }
             let v = xenia_types::Vec128::from_u32x4_array(r);
-            if instr.rc_bit() { update_cr6_from_vmask(&r, ctx); }
+            if instr.vc_rc_bit() { update_cr6_from_vmask(&r, ctx); }
             ctx.vr[instr.rd()] = v;
             ctx.pc += 4;
         }
@@ -3616,7 +3620,7 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
             let mut r = [0u32; 4];
             for i in 0..4 { r[i] = if a[i] > b[i] { 0xFFFFFFFF } else { 0 }; }
             let v = xenia_types::Vec128::from_u32x4_array(r);
-            if instr.rc_bit() { update_cr6_from_vmask(&r, ctx); }
+            if instr.vc_rc_bit() { update_cr6_from_vmask(&r, ctx); }
             ctx.vr[instr.rd()] = v;
             ctx.pc += 4;
         }
@@ -3638,7 +3642,8 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
                 if a[i].is_nan() || b[i].is_nan() || a[i] < -b[i] { lane |= 0x4000_0000; any_out = true; }
                 r[i] = lane;
             }
-            if instr.rc_bit() {
+            let rc = if is_128 { instr.vx128r_rc_bit() } else { instr.vc_rc_bit() };
+            if rc {
                 ctx.cr[6] = crate::context::CrField {
                     lt: false, gt: false, eq: !any_out, so: false,
                 };
@@ -6296,4 +6301,61 @@ mod tests {
         step(&mut ctx, &mem);
         assert_eq!(ctx.gpr[6], 0x0000_0000_1234_5678_u64);
     }
+
+    // ===== PPCBUG-275/276/562: vc_rc_bit fix for VC-form vcmpequb =====
+
+    /// VC-form: opcode=4 (VMX), vD at 6-10, vA at 11-15, vB at 16-20, Rc at PPC bit 21 = host bit 10, XO=6.
+    /// vcmpequb.: (4<<26)|(vD<<21)|(vA<<16)|(vB<<11)|(1<<10)|6
+    fn encode_vcmpequb_dot(vd: u32, va: u32, vb: u32) -> u32 {
+        (4 << 26) | (vd << 21) | (va << 16) | (vb << 11) | (1 << 10) | 6
+    }
+    /// vcmpequb (no dot form): same but Rc=0
+    fn encode_vcmpequb(vd: u32, va: u32, vb: u32) -> u32 {
+        (4 << 26) | (vd << 21) | (va << 16) | (vb << 11) | 6
+    }
+
+    #[test]
+    fn vcmpequb_dot_all_true_sets_cr6_lt() {
+        // All bytes equal → all lanes 0xFF → CR6.LT=1 (all-true), CR6.EQ=0
+        let mut ctx = PpcContext::new();
+        let mem = TestMem::new();
+        let v = xenia_types::Vec128::from_bytes([0xAAu8; 16]);
+        ctx.vr[1] = v;
+        ctx.vr[2] = v;
+        write_instr(&mem, 0x100, encode_vcmpequb_dot(0, 1, 2));
+        ctx.pc = 0x100;
+        step(&mut ctx, &mem);
+        assert!(ctx.cr[6].lt, "all-true: CR6.LT must be 1");
+        assert!(!ctx.cr[6].eq, "all-true: CR6.EQ must be 0");
+    }
+
+    #[test]
+    fn vcmpequb_no_dot_does_not_update_cr6() {
+        // Without dot form, CR6 must be unchanged
+        let mut ctx = PpcContext::new();
+        let mem = TestMem::new();
+        ctx.cr[6] = crate::context::CrField { lt: true, gt: false, eq: true, so: false };
+        let v = xenia_types::Vec128::from_bytes([0xAAu8; 16]);
+        ctx.vr[1] = v;
+        ctx.vr[2] = v;
+        write_instr(&mem, 0x100, encode_vcmpequb(0, 1, 2));
+        ctx.pc = 0x100;
+        step(&mut ctx, &mem);
+        // CR6 unchanged: no dot form
+        assert!(ctx.cr[6].lt && ctx.cr[6].eq, "CR6 must be unchanged without dot");
+    }
+
+    #[test]
+    fn vcmpequb_dot_all_false_sets_cr6_eq() {
+        // No bytes equal → all lanes 0x00 → CR6.LT=0, CR6.EQ=1 (all-false)
+        let mut ctx = PpcContext::new();
+        let mem = TestMem::new();
+        ctx.vr[1] = xenia_types::Vec128::from_bytes([0xAAu8; 16]);
+        ctx.vr[2] = xenia_types::Vec128::from_bytes([0xBBu8; 16]);
+        write_instr(&mem, 0x100, encode_vcmpequb_dot(0, 1, 2));
+        ctx.pc = 0x100;
+        step(&mut ctx, &mem);
+        assert!(!ctx.cr[6].lt, "all-false: CR6.LT must be 0");
+        assert!(ctx.cr[6].eq, "all-false: CR6.EQ must be 1");
+    }
 }

From 197d76c44e3155b8154b4937bca5165a24453c8f Mon Sep 17 00:00:00 2001
From: MechaCat02 <fabian@diekaulbachs.de>
Date: Fri, 1 May 2026 21:26:26 +0200
Subject: [PATCH 4/6] fix(cpu): PPCBUG-315 PPCBUG-563 fix vrlimi128 z and IMM
 field extraction

PPCBUG-563: Add vx128_4_imm() (PPC bits 11-15) and vx128_4_z() (PPC bits
24-25) accessors to decoder.rs for VX128_4-form instructions.

PPCBUG-315: vrlimi128 was reading z from host bits 16-17 (a subset of IMM)
and mask from host bits 2-5 (a reserved/XO region). Replace with the
correct accessors: z selects which word-lane to start the rotation from
(0-3); IMM is the 5-bit per-lane blend mask.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 crates/xenia-cpu/src/decoder.rs     | 107 ++++++++++++++++++++++++++++
 crates/xenia-cpu/src/interpreter.rs |  12 ++--
 2 files changed, 112 insertions(+), 7 deletions(-)

diff --git a/crates/xenia-cpu/src/decoder.rs b/crates/xenia-cpu/src/decoder.rs
index 8299447..718aa9b 100644
--- a/crates/xenia-cpu/src/decoder.rs
+++ b/crates/xenia-cpu/src/decoder.rs
@@ -79,6 +79,11 @@ impl DecodedInstr {
     /// Rc for VX128_R-form vector compare instructions — PPC bit 27 = host bit 4.
     #[inline] pub fn vx128r_rc_bit(&self) -> bool { (self.raw >> 4) & 1 != 0 }
 
+    /// IMM field for VX128_4-form instructions (vrlimi128) — 5-bit blend mask at PPC bits 11-15.
+    #[inline] pub fn vx128_4_imm(&self) -> u32 { extract_bits(self.raw, 11, 15) }
+    /// z field for VX128_4-form instructions (vrlimi128) — 2-bit rotation index at PPC bits 24-25.
+    #[inline] pub fn vx128_4_z(&self) -> u32 { extract_bits(self.raw, 24, 25) }
+
     /// OE bit (bit 21) - overflow enable
     #[inline] pub fn oe(&self) -> bool { extract_bits(self.raw, 21, 21) != 0 }
 
@@ -152,8 +157,20 @@ impl DecodedInstr {
     /// VS128 - same encoding as VD128
     #[inline] pub fn vs128(&self) -> usize { self.vd128() }
 
+    /// VC register for VX128_2-form instructions (vperm128) — 3-bit at PPC bits 23-25.
+    #[inline] pub fn vc128_2(&self) -> usize { extract_bits(self.raw, 23, 25) as usize }
+
     /// NB field (bits 16-20) for lswi/stswi
     #[inline] pub fn nb(&self) -> u32 { extract_bits(self.raw, 16, 20) }
+
+    /// PERM field for VX128_P-form instructions (vpermwi128) — 8-bit split encoding.
+    /// PERMl (5 bits) at PPC bits 11-15; PERMh (3 bits) at PPC bits 23-25.
+    #[inline] pub fn vx128_p_perm(&self) -> u32 {
+        extract_bits(self.raw, 11, 15) | (extract_bits(self.raw, 23, 25) << 5)
+    }
+
+    /// SH field for VX128_5-form instructions (vsldoi128) — 4-bit shift at PPC bits 22-25.
+    #[inline] pub fn vx128_5_sh(&self) -> u32 { extract_bits(self.raw, 22, 25) }
 }
 
 /// Extract the 5-bit `UIMM` (`VX128_3`) / `IMM` (`VX128_4`) field. Canary
@@ -1129,4 +1146,94 @@ mod tests {
         let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 };
         assert_eq!(d.vd128(), 5 | 32 | 64);
     }
+
+    #[test]
+    fn vx128_5_sh_bit_positions() {
+        // SH=8 (binary 1000): bit 3 = 1, bits 0-2 = 0.
+        // Host bit 9 = 1 (PPC bit 22), host bits 6-8 = 0.
+        // So raw bit 9 set = raw |= 1 << 9 = 0x200
+        let raw = 0x200u32; // host bit 9 set only
+        let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 };
+        assert_eq!(d.vx128_5_sh(), 8, "SH=8: MSB at PPC bit 22");
+
+        // SH=1 (binary 0001): host bit 6 set = raw |= 1 << 6 = 0x40
+        let raw = 0x40u32;
+        let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 };
+        assert_eq!(d.vx128_5_sh(), 1, "SH=1: LSB at PPC bit 25");
+
+        // SH=15 (binary 1111): host bits 6-9 all set = raw |= 0xF << 6 = 0x3C0
+        let raw = 0x3C0u32;
+        let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 };
+        assert_eq!(d.vx128_5_sh(), 15, "SH=15: all 4 bits set");
+
+        // SH=0: raw=0
+        let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw: 0, addr: 0 };
+        assert_eq!(d.vx128_5_sh(), 0, "SH=0");
+    }
+
+    #[test]
+    fn vx128_4_accessors_correct_bit_positions() {
+        // z=3 (binary 11) at PPC bits 24-25 = host bits 6-7
+        let raw = 0b11u32 << 6;
+        let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 };
+        assert_eq!(d.vx128_4_z(), 3, "z=3 from host bits 6-7");
+
+        // IMM=0x15 (binary 10101) at PPC bits 11-15 = host bits 16-20
+        let raw2 = 0x15u32 << 16;
+        let d2 = DecodedInstr { opcode: PpcOpcode::Invalid, raw: raw2, addr: 0 };
+        assert_eq!(d2.vx128_4_imm(), 0x15, "IMM=0x15 from host bits 16-20");
+
+        // Combined: z=1, IMM=0xA — fields must not bleed into each other
+        let raw3 = (0x1u32 << 6) | (0xAu32 << 16);
+        let d3 = DecodedInstr { opcode: PpcOpcode::Invalid, raw: raw3, addr: 0 };
+        assert_eq!(d3.vx128_4_z(), 1, "z=1 combined");
+        assert_eq!(d3.vx128_4_imm(), 0xA, "IMM=0xA combined");
+
+        // z=2, IMM=0xF — max 4-bit blend mask, exercises the full lower nibble
+        let raw4 = (0b10u32 << 6) | (0xFu32 << 16);
+        let d4 = DecodedInstr { opcode: PpcOpcode::Invalid, raw: raw4, addr: 0 };
+        assert_eq!(d4.vx128_4_z(), 2, "z=2 from binary 10");
+        assert_eq!(d4.vx128_4_imm(), 0xF, "IMM=0xF all-ones nibble");
+    }
+
+    #[test]
+    fn vc128_2_extracts_ppc_bits_23_25() {
+        // VC=5 (binary 101) at PPC bits 23-25 = host bits 6-8
+        // extract_bits(raw, 23, 25) = (raw >> (31-25)) & 0x7 = (raw >> 6) & 0x7
+        let raw = 5u32 << 6; // host bits 6-8 = 5
+        let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 };
+        assert_eq!(d.vc128_2(), 5);
+
+        let d0 = DecodedInstr { opcode: PpcOpcode::Invalid, raw: 0, addr: 0 };
+        assert_eq!(d0.vc128_2(), 0);
+
+        let d7 = DecodedInstr { opcode: PpcOpcode::Invalid, raw: 7u32 << 6, addr: 0 };
+        assert_eq!(d7.vc128_2(), 7);
+
+        let d1 = DecodedInstr { opcode: PpcOpcode::Invalid, raw: 1u32 << 6, addr: 0 };
+        assert_eq!(d1.vc128_2(), 1);
+    }
+
+    #[test]
+    fn vx128_p_perm_assembles_correctly() {
+        // PERMl=0x1F (all 5 bits set) at host bits 16-20: raw = 0x1F << 16
+        let raw = 0x1Fu32 << 16;
+        assert_eq!(DecodedInstr::from_raw(raw).vx128_p_perm(), 0x1F, "PERMl only");
+
+        // PERMh=0x7 (all 3 bits set) at host bits 6-8: raw = 0x7 << 6 = 0x1C0
+        let raw = 0x7u32 << 6;
+        assert_eq!(
+            DecodedInstr::from_raw(raw).vx128_p_perm(),
+            0x7 << 5,
+            "PERMh only: bits 5-7"
+        );
+
+        // PERMl=0xA, PERMh=0x5: raw = (0xA << 16) | (0x5 << 6)
+        let raw = (0xAu32 << 16) | (0x5u32 << 6);
+        assert_eq!(DecodedInstr::from_raw(raw).vx128_p_perm(), 0xA | (0x5 << 5));
+
+        // All-zero encoding: PERMl and PERMh are both 0, so the assembled PERM must be 0
+        let raw = 0u32;
+        assert_eq!(DecodedInstr::from_raw(raw).vx128_p_perm(), 0);
+    }
 }
diff --git a/crates/xenia-cpu/src/interpreter.rs b/crates/xenia-cpu/src/interpreter.rs
index 65c6a9d..fee3a07 100644
--- a/crates/xenia-cpu/src/interpreter.rs
+++ b/crates/xenia-cpu/src/interpreter.rs
@@ -2141,9 +2141,7 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
                 va = instr.va128();
                 vb = instr.vb128();
                 vd = instr.vd128();
-                // For vperm128, the permutation control is in vC (third source)
-                // which is typically encoded via a different field
-                vc = instr.vd128(); // vperm128 uses vD as permute mask
+                vc = instr.vc128_2();
             } else {
                 va = instr.ra();
                 vb = instr.rb();
@@ -2176,7 +2174,7 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
         PpcOpcode::vsldoi128 => {
             let a_bytes = ctx.vr[instr.va128()].as_bytes();
             let b_bytes = ctx.vr[instr.vb128()].as_bytes();
-            let sh = ((instr.raw >> 6) & 0x7) as usize | (((instr.raw >> 4) & 0x1) as usize) << 3; // extract shift
+            let sh = instr.vx128_5_sh() as usize;
             let mut concat = [0u8; 32];
             concat[..16].copy_from_slice(&a_bytes);
             concat[16..].copy_from_slice(&b_bytes);
@@ -3766,8 +3764,8 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
         // use rotated[N]). Titles generally use mask=0xF (copy-all) which
         // makes this behave like a plain word rotate.
         PpcOpcode::vrlimi128 => {
-            let shift = ((instr.raw >> 16) & 0x3) as usize;
-            let mask = (instr.raw >> 2) & 0xF; // VX128_4 "fmask"
+            let shift = instr.vx128_4_z() as usize;
+            let mask = instr.vx128_4_imm();
             let b = ctx.vr[instr.vb128()].as_u32x4();
             let d = ctx.vr[instr.vd128()].as_u32x4();
             let rot = [b[shift % 4], b[(shift + 1) % 4], b[(shift + 2) % 4], b[(shift + 3) % 4]];
@@ -4304,7 +4302,7 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
         }
         // vpermwi128: permute words of vB using an 8-bit immediate (2 bits per output lane).
         PpcOpcode::vpermwi128 => {
-            let imm = (instr.raw >> 16) & 0xFF;
+            let imm = instr.vx128_p_perm();
             let b = ctx.vr[instr.vb128()].as_u32x4();
             let mut r = [0u32; 4];
             // Output lane i ← b[(imm >> (2 * (3-i))) & 3]

From 64e8ecbfd0145cd5dc6c20f87f1e6e8769bdf91f Mon Sep 17 00:00:00 2001
From: MechaCat02 <fabian@diekaulbachs.de>
Date: Fri, 1 May 2026 21:29:12 +0200
Subject: [PATCH 5/6] fix(cpu): PPCBUG-361 PPCBUG-565 fix vsldoi128 SH field
 extraction
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PPCBUG-565: Add vx128_5_sh() to decoder.rs — 4-bit shift at PPC bits
22-25 (host bits 6-9). The correct MSB is at PPC bit 22 (host bit 9).

PPCBUG-361: vsldoi128 was reading the SH MSB from host bit 4 (PPC bit
27, reserved) instead of host bit 9 (PPC bit 22). All shift amounts >= 8
decoded incorrectly (e.g. shift=8 executed as shift=0). Replace the
inline bit-shuffle with instr.vx128_5_sh().

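Also fix vx128_p_perm_assembles_correctly test: replace nonexistent
DecodedInstr::from_raw() calls with struct literal construction. This
test fix is the only change carried by this patch's diff; the
interpreter.rs switch to instr.vx128_5_sh() appears in an earlier patch
of the series.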

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 crates/xenia-cpu/src/decoder.rs | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/crates/xenia-cpu/src/decoder.rs b/crates/xenia-cpu/src/decoder.rs
index 718aa9b..ef28270 100644
--- a/crates/xenia-cpu/src/decoder.rs
+++ b/crates/xenia-cpu/src/decoder.rs
@@ -1218,22 +1218,21 @@ mod tests {
     fn vx128_p_perm_assembles_correctly() {
         // PERMl=0x1F (all 5 bits set) at host bits 16-20: raw = 0x1F << 16
         let raw = 0x1Fu32 << 16;
-        assert_eq!(DecodedInstr::from_raw(raw).vx128_p_perm(), 0x1F, "PERMl only");
+        let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 };
+        assert_eq!(d.vx128_p_perm(), 0x1F, "PERMl only");
 
         // PERMh=0x7 (all 3 bits set) at host bits 6-8: raw = 0x7 << 6 = 0x1C0
         let raw = 0x7u32 << 6;
-        assert_eq!(
-            DecodedInstr::from_raw(raw).vx128_p_perm(),
-            0x7 << 5,
-            "PERMh only: bits 5-7"
-        );
+        let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 };
+        assert_eq!(d.vx128_p_perm(), 0x7 << 5, "PERMh only: bits 5-7");
 
         // PERMl=0xA, PERMh=0x5: raw = (0xA << 16) | (0x5 << 6)
         let raw = (0xAu32 << 16) | (0x5u32 << 6);
-        assert_eq!(DecodedInstr::from_raw(raw).vx128_p_perm(), 0xA | (0x5 << 5));
+        let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 };
+        assert_eq!(d.vx128_p_perm(), 0xA | (0x5 << 5));
 
         // PERMl and PERMh bits must not bleed into each other
-        let raw = 0u32;
-        assert_eq!(DecodedInstr::from_raw(raw).vx128_p_perm(), 0);
+        let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw: 0, addr: 0 };
+        assert_eq!(d.vx128_p_perm(), 0);
     }
 }

From 6b9de17925083374af771bd0335182d666fc4f44 Mon Sep 17 00:00:00 2001
From: MechaCat02 <fabian@diekaulbachs.de>
Date: Fri, 1 May 2026 22:06:00 +0200
Subject: [PATCH 6/6] fix(cpu): PPCBUG-363 PPCBUG-369 vpkd3d128 post-pack
 permutation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

vpkd3d128 was storing the pack codec output directly into vd128 without
applying the MakePermuteMask permutation that merges the packed scalar(s)
into the previous register value according to pack (slot layout) and shift
(destination lane offset).

PPCBUG-363: vpkd3d128 was missing the post-pack lane-placement step.
PPCBUG-369: the vpkd3d128 pack field was never extracted; pack=0 still
  worked (identity), but pack=1/2/3 always wrote the raw codec output
  instead of blending it into the previous vd128 value.

Fix: extract `pack = uimm & 3` and `shift = instr.vx128_4_z()` from the
VX128_4 IMM and z fields. For pack==0 (identity) store out directly as
before. For pack 1-3, read the existing vd128 value and select 4 u32
words from {prev, out} using the 3×4 static permutation tables from
canary ppc_emit_altivec.cc:2126-2188.

Tables derived from canary MakePermuteMask(r0,l0,…r3,l3):
  pack=1 (VPACK_32): out[3] placed at lane (3-shift), prev elsewhere
  pack=2 (64-bit):   out[2..3] placed at lanes (2-shift)..(3-shift);
                     at shift=3 only out[3] fits, landing at lane 0
  pack=3 (64-bit):   same as pack=2 except shift=3 → out[2] at lane 3

Tests: vpkd3d128_pack0_legacy_unchanged, vpkd3d128_pack1_shift0_d3d_vertex_pack,
       vpkd3d128_pack1_shift3_puts_out3_at_lane0

interpreter.rs: vpkd3d128 arm (~line 3999)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 crates/xenia-cpu/src/interpreter.rs | 119 ++++++++++++++++++++++++++--
 1 file changed, 113 insertions(+), 6 deletions(-)

diff --git a/crates/xenia-cpu/src/interpreter.rs b/crates/xenia-cpu/src/interpreter.rs
index fee3a07..3bb3d56 100644
--- a/crates/xenia-cpu/src/interpreter.rs
+++ b/crates/xenia-cpu/src/interpreter.rs
@@ -3991,14 +3991,12 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
         // position) and masked to only 3 bits. Canary extracts from the
         // VX128_3/4 `IMM` field at PPC bits 16-22 (MSB) and does
         // `type = IMM >> 2` to pick up the 5-bit type selector — the low
-        // 2 bits (`pack`) select output-slot layout for `vpkd3d128` and
-        // are ignored by `vupkd3d128`. Extracting the low 2 bits as
-        // `pack` (unused here — we hand back the codec output in its
-        // canonical lane position, the subsequent permute instruction
-        // handles placement) for completeness.
+        // 2 bits (`pack`) select output-slot layout for `vpkd3d128`.
         PpcOpcode::vpkd3d128 => {
             use crate::vmx::D3dPackType;
             let uimm = crate::decoder::extract_vx128_uimm5(instr.raw);
+            let pack = (uimm & 3) as usize;
+            let shift = instr.vx128_4_z() as usize;
             let ty = D3dPackType::from_immediate(uimm >> 2);
             let src = ctx.vr[instr.vb128()];
             let out = match ty {
@@ -4020,7 +4018,36 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
                     src
                 }
             };
-            ctx.vr[instr.vd128()] = out;
+            // Post-pack permutation: merge packed `out` into previous `vd`
+            // per canary ppc_emit_altivec.cc:2126-2188 MakePermuteMask tables.
+            // MakePermuteMask(r0,l0, r1,l1, r2,l2, r3,l3): result[i] = if ri==0 { prev[li] } else { out[li] }
+            let result = if pack == 0 {
+                out
+            } else {
+                // (source_reg, lane): 0=prev vd, 1=packed out
+                const PERM: [[[(u8, u8); 4]; 4]; 3] = [
+                    // pack=1 (VPACK_32): places out[3] at lane (3-shift)
+                    [[(0,0),(0,1),(0,2),(1,3)], [(0,0),(0,1),(1,3),(0,3)],
+                     [(0,0),(1,3),(0,2),(0,3)], [(1,3),(0,1),(0,2),(0,3)]],
+                    // pack=2 (64-bit): places out[2..3] at lanes (2-shift)..(3-shift)
+                    [[(0,0),(0,1),(1,2),(1,3)], [(0,0),(1,2),(1,3),(0,3)],
+                     [(1,2),(1,3),(0,2),(0,3)], [(1,3),(0,1),(0,2),(0,3)]],
+                    // pack=3 (64-bit): same as pack=2 except shift=3 selects out[2] at lane 3
+                    [[(0,0),(0,1),(1,2),(1,3)], [(0,0),(1,2),(1,3),(0,3)],
+                     [(1,2),(1,3),(0,2),(0,3)], [(0,0),(0,1),(0,2),(1,2)]],
+                ];
+                let prev = ctx.vr[instr.vd128()];
+                let pw = prev.as_u32x4();
+                let ow = out.as_u32x4();
+                let sel = PERM[pack - 1][shift];
+                xenia_types::Vec128::from_u32x4_array([
+                    if sel[0].0 == 0 { pw[sel[0].1 as usize] } else { ow[sel[0].1 as usize] },
+                    if sel[1].0 == 0 { pw[sel[1].1 as usize] } else { ow[sel[1].1 as usize] },
+                    if sel[2].0 == 0 { pw[sel[2].1 as usize] } else { ow[sel[2].1 as usize] },
+                    if sel[3].0 == 0 { pw[sel[3].1 as usize] } else { ow[sel[3].1 as usize] },
+                ])
+            };
+            ctx.vr[instr.vd128()] = result;
             ctx.pc += 4;
         }
         PpcOpcode::vupkd3d128 => {
@@ -6356,4 +6383,84 @@ mod tests {
         assert!(!ctx.cr[6].lt, "all-false: CR6.LT must be 0");
         assert!(ctx.cr[6].eq, "all-false: CR6.EQ must be 1");
     }
+
+    // ---- PPCBUG-363 + PPCBUG-369: vpkd3d128 post-pack permutation ----
+    //
+    // vpkd3d128 VD, VB, type, pack, shift: the low 2 bits of the IMM field
+    // select how the packed scalar/vector is merged back into the previous VD.
+    // pack=0 → identity (store out directly); pack=1 → 32-bit merge by shift;
+    // pack=2,3 → 64-bit merge by shift.
+    // Canary source: ppc_emit_altivec.cc:2126-2188.
+    //
+    // For vpkd3d128, PPC bits 21-22 are always 1 (key2 discriminant), so
+    // vd128 is always in range [96, 127] for vd_lo in [0, 31].
+
+    fn encode_vpkd3d128(vd_lo: u32, vb_lo: u32, imm: u32, z: u32) -> u32 {
+        // op6=6 (all VMX128 compute ops); VD[4:0] at host 25-21; IMM at host 20-16;
+        // VB[4:0] at host 15-11; host bits 10,9 = 1,1 (PPC bits 21,22, key2=0b110);
+        // z (2-bit) at host 7-6 (PPC bits 24-25); host bit 4 = 1 (PPC bit 27, key2 low=0b01).
+        // decode_op6 key2 = (bits21-23<<4)|(bits26-27) = (0b110<<4)|0b01 = 0b1100001.
+        (6u32 << 26) | (vd_lo << 21) | (imm << 16) | (vb_lo << 11)
+            | (1 << 10) | (1 << 9) | (z << 6) | (1 << 4)
+    }
+
+    #[test]
+    fn vpkd3d128_pack0_legacy_unchanged() {
+        // pack=0 → identity: result = out (packed value), no blend with prev vd.
+        // type=0 (D3dColor), pack=0 → IMM=0; z=0 (don't care for pack=0).
+        // vd=96 (vd_lo=0 | bits21=1,22=1→+96).
+        let mut ctx = PpcContext::new();
+        let mem = TestMem::new();
+        // vb=1: R=1.0, G=0, B=0, A=0 → D3dColor packs to word (0<<24)|(255<<16)|(0<<8)|0 = 0x00FF0000
+        ctx.vr[1] = xenia_types::Vec128::from_f32x4(1.0, 0.0, 0.0, 0.0);
+        // prev vd=96: sentinel values that should NOT appear in result
+        ctx.vr[96] = xenia_types::Vec128::from_u32x4(0xAAAA_AAAA, 0xBBBB_BBBB, 0xCCCC_CCCC, 0xDDDD_DDDD);
+        write_instr(&mem, 0, encode_vpkd3d128(0, 1, 0, 0));
+        ctx.pc = 0;
+        step(&mut ctx, &mem);
+        let r = ctx.vr[96].as_u32x4();
+        // out = [0, 0, 0, 0x00FF_0000]; pack=0 → result = out
+        assert_eq!(r[0], 0x0000_0000, "pack=0: lane 0 must be out[0]=0");
+        assert_eq!(r[1], 0x0000_0000, "pack=0: lane 1 must be out[1]=0");
+        assert_eq!(r[2], 0x0000_0000, "pack=0: lane 2 must be out[2]=0");
+        assert_eq!(r[3], 0x00FF_0000, "pack=0: lane 3 must be packed D3dColor");
+    }
+
+    #[test]
+    fn vpkd3d128_pack1_shift0_d3d_vertex_pack() {
+        // pack=1, shift=0 (VPACK_32): out[3] placed at lane 3; prev[0..2] preserved.
+        // MakePermuteMask(0,0, 0,1, 0,2, 1,3) → [prev[0], prev[1], prev[2], out[3]]
+        // IMM = (type=0 D3dColor << 2) | pack=1 = 1; z=0.
+        let mut ctx = PpcContext::new();
+        let mem = TestMem::new();
+        ctx.vr[1] = xenia_types::Vec128::from_f32x4(1.0, 0.0, 0.0, 0.0); // out[3]=0x00FF_0000
+        ctx.vr[96] = xenia_types::Vec128::from_u32x4(0x1111_1111, 0x2222_2222, 0x3333_3333, 0x4444_4444);
+        write_instr(&mem, 0, encode_vpkd3d128(0, 1, 1, 0));
+        ctx.pc = 0;
+        step(&mut ctx, &mem);
+        let r = ctx.vr[96].as_u32x4();
+        assert_eq!(r[0], 0x1111_1111, "pack=1 shift=0: lane 0 from prev");
+        assert_eq!(r[1], 0x2222_2222, "pack=1 shift=0: lane 1 from prev");
+        assert_eq!(r[2], 0x3333_3333, "pack=1 shift=0: lane 2 from prev");
+        assert_eq!(r[3], 0x00FF_0000, "pack=1 shift=0: lane 3 from out[3]");
+    }
+
+    #[test]
+    fn vpkd3d128_pack1_shift3_puts_out3_at_lane0() {
+        // pack=1, shift=3 (VPACK_32): out[3] placed at lane 0; prev[1..3] preserved.
+        // MakePermuteMask(1,3, 0,1, 0,2, 0,3) → [out[3], prev[1], prev[2], prev[3]]
+        // IMM = 1; z=3.
+        let mut ctx = PpcContext::new();
+        let mem = TestMem::new();
+        ctx.vr[1] = xenia_types::Vec128::from_f32x4(1.0, 0.0, 0.0, 0.0); // out[3]=0x00FF_0000
+        ctx.vr[96] = xenia_types::Vec128::from_u32x4(0x1111_1111, 0x2222_2222, 0x3333_3333, 0x4444_4444);
+        write_instr(&mem, 0, encode_vpkd3d128(0, 1, 1, 3));
+        ctx.pc = 0;
+        step(&mut ctx, &mem);
+        let r = ctx.vr[96].as_u32x4();
+        assert_eq!(r[0], 0x00FF_0000, "pack=1 shift=3: lane 0 from out[3]");
+        assert_eq!(r[1], 0x2222_2222, "pack=1 shift=3: lane 1 from prev");
+        assert_eq!(r[2], 0x3333_3333, "pack=1 shift=3: lane 2 from prev");
+        assert_eq!(r[3], 0x4444_4444, "pack=1 shift=3: lane 3 from prev");
+    }
 }