diff --git a/audit-findings.md b/audit-findings.md index 26cbb97..5bb09cf 100644 --- a/audit-findings.md +++ b/audit-findings.md @@ -3414,3 +3414,38 @@ has the wrong extraction. The disassembler was written independently and got the degenerate cases. IDs PPCBUG-655 through PPCBUG-679 are unallocated — no further bugs found in Phase C3. + +--- + +## Phase C4 — Post-merge audit corrections (2026-05-02) + +### PPCBUG-700 — VMX128 register accessors disagreed with canary's bitfield layout (HIGH) + +- **Severity**: HIGH (silent mis-decoding of any VMX128 instruction with a register >= 32) +- **Status**: applied +- **Locations**: `decoder.rs:138-160` (`va128`/`vb128`/`vd128`), `decoder.rs:80` (`vx128r_rc_bit`) +- **Discovery**: independent reviewer of the P3 phase merge, comparing our rust accessors + against canary's `FormatVX128`/`VX128_2`/`VX128_4`/`VX128_5`/`VX128_R` bitfield struct + in `xenia-canary/src/xenia/cpu/ppc/ppc_decode_data.h:484-663`. +- **Symptom**: this entry contradicts the audit's own line 2958 ("confirmed-clean") + assessment. The previous audit miscounted bit-field offsets — under x86_64 LSB-first + C++ bitfield packing, the canary fields land at: + - `VA128 = VA128l(5) | VA128h(1)<<5 | VA128H(1)<<6` = PPC[11-15] | PPC[26]<<5 | PPC[21]<<6 (3 fields, 7 bits) + - `VB128 = VB128l(5) | VB128h(2)<<5` = PPC[16-20] | PPC[30-31]<<5 (2 fields, 7 bits) + - `VD128 = VD128l(5) | VD128h(2)<<5` = PPC[6-10] | PPC[28-29]<<5 (2 fields, 7 bits) + - `Rc` (VX128_R only) = PPC[25] (host bit 6) — not PPC[27] as PPCBUG-422/562 prescribed. + Rust code instead used va128: PPC[11-15] | PPC[29]<<5 (one bit, wrong position); vb128: + PPC[16-20] | PPC[28]<<5 | PPC[30]<<6 (wrong positions); vd128: PPC[6-10] | PPC[21]<<5 | + PPC[22]<<6 (wrong positions); vx128r_rc_bit at PPC[27]. +- **Why it lurked**: the buggy convention was internally consistent with hand-crafted + test fixtures (which set bit 29 / 21 / 22 to encode "high" registers, matching the + buggy accessor). 
Real Xbox 360 game code follows canary's convention, so any production + encoding with VR >= 32 was silently mis-decoded — but no unit test exercised that path. +- **Fix**: rewrite the four accessors to canary's bit positions; rewrite the + `vmx128_test_word` helper and unit tests; re-encode the goldens for vmaddfp128/ + vmaddcfp128/vnmsubfp128/vperm128/vsrw128/vpermwi128/vrlimi128. Drop the speculative + `key4_dt` dot-form dispatch in `decode_op6` (canary has no separate dot-form opcodes + for VX128_R compute ops; Rc is a runtime modifier). Update `encode_vpkd3d128` test + helper for canary's VD128h placement. +- **Cross-reference**: invalidates the audit's confirmed-clean note at line 2958. + Subsumes the partial fix-shape proposed in PPCBUG-422 (Rc-bit position). diff --git a/crates/xenia-cpu/src/decoder.rs b/crates/xenia-cpu/src/decoder.rs index ef28270..9790f47 100644 --- a/crates/xenia-cpu/src/decoder.rs +++ b/crates/xenia-cpu/src/decoder.rs @@ -77,7 +77,9 @@ impl DecodedInstr { /// Rc for VC-form vector compare instructions — PPC bit 21 = host bit 10. #[inline] pub fn vc_rc_bit(&self) -> bool { (self.raw >> 10) & 1 != 0 } /// Rc for VX128_R-form vector compare instructions — PPC bit 27 = host bit 4. - #[inline] pub fn vx128r_rc_bit(&self) -> bool { (self.raw >> 4) & 1 != 0 } + /// VX128_R Rc bit — PPC bit 25 (host bit 6) per canary's FormatVX128_R + /// bitfield layout. PPCBUG-700. + #[inline] pub fn vx128r_rc_bit(&self) -> bool { (self.raw >> 6) & 1 != 0 } /// IMM field for VX128_4-form instructions (vrlimi128) — 5-bit blend mask at PPC bits 11-15. 
#[inline] pub fn vx128_4_imm(&self) -> u32 { extract_bits(self.raw, 11, 15) } @@ -133,25 +135,30 @@ impl DecodedInstr { /// crbB (bits 16-20) #[inline] pub fn crbb(&self) -> u32 { extract_bits(self.raw, 16, 20) } - // VMX128 field extractors + // VMX128 field extractors — bit positions match canary's + // FormatVX128/VX128_2/VX128_4/VX128_5/VX128_R bitfield layout + // (xenia-canary `ppc_decode_data.h:484-663`, LSB-first packed). PPCBUG-700. - /// VA128 (bits 6-10, plus bit from 29) + /// VA128 = VA128l(5) | VA128h(1) << 5 | VA128H(1) << 6. + /// Canonical 7-bit register selector: PPC 11-15 (low), PPC 26 (mid), PPC 21 (high). #[inline] pub fn va128(&self) -> usize { - (extract_bits(self.raw, 6, 10) | (extract_bits(self.raw, 29, 29) << 5)) as usize + (extract_bits(self.raw, 11, 15) + | (extract_bits(self.raw, 26, 26) << 5) + | (extract_bits(self.raw, 21, 21) << 6)) as usize } - /// VB128 (bits 16-20, plus bits from 28, 30) + /// VB128 = VB128l(5) | VB128h(2) << 5. Canary's VB128h is a 2-bit + /// contiguous field at PPC 30-31 (host bits 0-1). #[inline] pub fn vb128(&self) -> usize { (extract_bits(self.raw, 16, 20) - | (extract_bits(self.raw, 28, 28) << 5) - | (extract_bits(self.raw, 30, 30) << 6)) as usize + | (extract_bits(self.raw, 30, 31) << 5)) as usize } - /// VD128 (bits 6-10, plus bits from 21, 22) + /// VD128 = VD128l(5) | VD128h(2) << 5. Canary's VD128h is a 2-bit + /// contiguous field at PPC 28-29 (host bits 2-3). #[inline] pub fn vd128(&self) -> usize { (extract_bits(self.raw, 6, 10) - | (extract_bits(self.raw, 21, 21) << 5) - | (extract_bits(self.raw, 22, 22) << 6)) as usize + | (extract_bits(self.raw, 28, 29) << 5)) as usize } /// VS128 - same encoding as VD128 @@ -664,11 +671,11 @@ fn decode_op6(code: u32) -> PpcOpcode { _ => {} } - // VMX128 compare (non-dot and dot forms). - // Non-dot: bit 27 = 0. Dot: bit 27 = 1, but bit 25 must also be 0 to - // distinguish from the shift/merge group (which has bit 25 = 1 when bit 27 = 1). 
- // key4_nd uses bits 22-24 + bit 27 (same as original, covers non-dot). - // key4_dt uses bits 22-24 + bit 25 + bit 27 (narrower, covers dot-only). + // VMX128 compare (VX128_R form). Single dispatch path: bit 27 = 0 always + // for these opcodes per canary's table (`ppc_opcode_table_gen.cc:295-305`). + // The Rc bit is at PPC 25 (host bit 6) per the FormatVX128_R bitfield — + // it's a runtime modifier read by the interpreter, NOT part of the + // secondary-opcode discrimination. PPCBUG-700. let key4_nd = (extract_bits(code, 22, 24) << 3) | extract_bits(code, 27, 27); match key4_nd { 0b000000 => return PpcOpcode::vcmpeqfp128, @@ -678,16 +685,6 @@ fn decode_op6(code: u32) -> PpcOpcode { 0b100000 => return PpcOpcode::vcmpequw128, _ => {} } - // Dot forms: bit 27 = 1, bit 25 = 0 (key = bits22-24 + bit25 + bit27, low 3 bits) - let key4_dt = (extract_bits(code, 22, 24) << 2) | (extract_bits(code, 25, 25) << 1) | extract_bits(code, 27, 27); - match key4_dt { - 0b00001 => return PpcOpcode::vcmpeqfp128, // bits22-24=000, bit25=0, bit27=1 - 0b00101 => return PpcOpcode::vcmpgefp128, // bits22-24=001, bit25=0, bit27=1 - 0b01001 => return PpcOpcode::vcmpgtfp128, // bits22-24=010, bit25=0, bit27=1 - 0b01101 => return PpcOpcode::vcmpbfp128, // bits22-24=011, bit25=0, bit27=1 - 0b10001 => return PpcOpcode::vcmpequw128, // bits22-24=100, bit25=0, bit27=1 - _ => {} - } // VMX128 shift/merge let key5 = (extract_bits(code, 22, 25) << 2) | extract_bits(code, 27, 27); @@ -1055,21 +1052,26 @@ mod tests { // used those extractors). Each test poke-bits exactly the slots the // accessor reads and asserts the assembled register number. - fn vmx128_test_word(va21: u32, vd6_10: u32, vd21: u32, vd22: u32, - vb16_20: u32, vb28: u32, vb30: u32) -> u32 { - // PPC bit i -> LSB position 31-i. 
- (vd6_10 << (31 - 10)) - | (va21 << (31 - 21)) // va128 high bit at PPC 29 in some forms — kept 0 here - | (vd21 << (31 - 21)) - | (vd22 << (31 - 22)) - | (vb16_20 << (31 - 20)) - | (vb28 << (31 - 28)) - | (vb30 << (31 - 30)) + /// Build a VMX128 test word for the canary-compliant register layout. + /// `vd128 = vd_lo | (vd_hi << 5)` where vd_lo is 5 bits (PPC 6-10) and + /// vd_hi is 2 bits (PPC 28-29). Same shape for vb128 (vb_lo at PPC 16-20, + /// vb_hi 2 bits at PPC 30-31). va128 = va_lo | (va_h26<<5) | (va_h21<<6) + /// per canary's 7-bit VA selector. + fn vmx128_test_word(vd_lo: u32, vd_hi: u32, va_lo: u32, va_h26: u32, va_h21: u32, + vb_lo: u32, vb_hi: u32) -> u32 { + // PPC bit i -> host bit (31-i). + (vd_lo << (31 - 10)) // VD128l: PPC 6-10 = host 21-25 + | (vd_hi << (31 - 29)) // VD128h: PPC 28-29 = host 2-3 (LSB at host 2) + | (va_lo << (31 - 15)) // VA128l: PPC 11-15 = host 16-20 + | (va_h26 << (31 - 26)) // VA128h: PPC 26 = host 5 + | (va_h21 << (31 - 21)) // VA128H: PPC 21 = host 10 + | (vb_lo << (31 - 20)) // VB128l: PPC 16-20 = host 11-15 + | (vb_hi << (31 - 31)) // VB128h: PPC 30-31 = host 0-1 (LSB at host 0) } #[test] fn vmx128_vd128_low_5_bits_only() { - // vd_lo = 0..31, vd_b21 = 0, vd_b22 = 0 → vd128 = vd_lo + // vd_lo = 0..31, vd_hi = 0 → vd128 = vd_lo for r in 0..32u32 { let raw = (r as u32) << (31 - 10); let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 }; @@ -1078,45 +1080,51 @@ mod tests { } #[test] - fn vmx128_vd128_bit21_adds_32() { - // vd_lo = 0, vd_b21 = 1, vd_b22 = 0 → vd128 = 32 - let raw = (0u32 << (31 - 10)) | (1u32 << (31 - 21)); + fn vmx128_vd128_high_low_bit_adds_32() { + // vd_lo = 0, VD128h = 0b01 (LSB only at host bit 2 = PPC 29) → vd128 = 32 + let raw = (1u32 << (31 - 29)); let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 }; assert_eq!(d.vd128(), 32); } #[test] - fn vmx128_vd128_bit22_adds_64() { - // vd_lo = 0, vd_b21 = 0, vd_b22 = 1 → vd128 = 64 - let raw = (0u32 << (31 - 10)) | (1u32 << (31 
- 22)); + fn vmx128_vd128_high_high_bit_adds_64() { + // vd_lo = 0, VD128h = 0b10 (MSB only at host bit 3 = PPC 28) → vd128 = 64 + let raw = (1u32 << (31 - 28)); let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 }; assert_eq!(d.vd128(), 64); } #[test] fn vmx128_vd128_full_127() { - // vd_lo = 31, vd_b21 = 1, vd_b22 = 1 → vd128 = 127 + // vd_lo = 31, VD128h = 0b11 → vd128 = 127 let raw = (31u32 << (31 - 10)) - | (1u32 << (31 - 21)) - | (1u32 << (31 - 22)); + | (1u32 << (31 - 28)) + | (1u32 << (31 - 29)); let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 }; assert_eq!(d.vd128(), 127); } #[test] - fn vmx128_va128_uses_bit29() { - // va128 = bits 6-10 + bit 29. va_lo = 7, bit 29 = 1 → va128 = 7 | 32 = 39. - let raw = (7u32 << (31 - 10)) | (1u32 << (31 - 29)); + fn vmx128_va128_canary_layout() { + // va_lo = 7 at PPC 11-15, VA128h = 1 at PPC 26 → va128 = 7 | 32 = 39 + let raw = (7u32 << (31 - 15)) | (1u32 << (31 - 26)); let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 }; assert_eq!(d.va128(), 39); + // VA128H = 1 at PPC 21 → va128 += 64 = 103 + let raw = raw | (1u32 << (31 - 21)); + let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 }; + assert_eq!(d.va128(), 7 | 32 | 64); } #[test] - fn vmx128_vb128_uses_bits28_and_30() { - // vb128 = bits 16-20 + bit 28 + bit 30. Low 5 = 5, bit 28 = 1 → +32, bit 30 = 1 → +64. - let raw = (5u32 << (31 - 20)) - | (1u32 << (31 - 28)) - | (1u32 << (31 - 30)); + fn vmx128_vb128_uses_bits30_31() { + // vb_lo = 5 at PPC 16-20. VB128h = 0b01 (LSB at PPC 31 = host 0) → +32. + // VB128h = 0b11 → +96. + let raw = (5u32 << (31 - 20)) | (1u32 << (31 - 31)); + let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 }; + assert_eq!(d.vb128(), 5 | 32); + let raw = raw | (1u32 << (31 - 30)); let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 }; assert_eq!(d.vb128(), 5 | 32 | 64); } @@ -1126,11 +1134,9 @@ mod tests { // vs128 must always equal vd128. 
for r in [0u32, 31, 32, 64, 96, 127] { let lo = r & 0x1F; - let b21 = (r >> 5) & 1; - let b22 = (r >> 6) & 1; + let hi = (r >> 5) & 0x3; let raw = (lo << (31 - 10)) - | (b21 << (31 - 21)) - | (b22 << (31 - 22)); + | (hi << (31 - 29)); let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 }; assert_eq!(d.vd128(), r as usize, "vd128 mismatch for r={r}"); assert_eq!(d.vs128(), r as usize, "vs128 mismatch for r={r}"); @@ -1142,7 +1148,8 @@ mod tests { #[allow(dead_code)] fn _vmx128_test_word_helper_compiles() { // Keep the helper validated against the real accessor. - let raw = vmx128_test_word(0, 5, 1, 1, 0, 0, 0); + // vd_lo=5, vd_hi=0b11 → vd128 = 5 | 96 = 101 + let raw = vmx128_test_word(5, 3, 0, 0, 0, 0, 0); let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 }; assert_eq!(d.vd128(), 5 | 32 | 64); } diff --git a/crates/xenia-cpu/src/disasm.rs b/crates/xenia-cpu/src/disasm.rs index 6edbce1..352219c 100644 --- a/crates/xenia-cpu/src/disasm.rs +++ b/crates/xenia-cpu/src/disasm.rs @@ -361,7 +361,15 @@ pub fn format(instr: &DecodedInstr) -> DisasmText { PpcOpcode::dcbz => fmt_cache(instr, "dcbz"), PpcOpcode::dcbz128 => fmt_cache(instr, "dcbz128"), PpcOpcode::icbi => fmt_cache(instr, "icbi"), - PpcOpcode::sync => base("sync", String::new(), 0), + PpcOpcode::sync => { + // L-field at PPC bit 10 (host bit 21) selects lwsync (L=1), the + // acquire barrier in every Xbox 360 spinlock. PPCBUG-641. + if (instr.raw >> 21) & 1 == 1 { + with_ext("sync", String::new(), 0, "lwsync", String::new(), 0) + } else { + base("sync", String::new(), 0) + } + } PpcOpcode::eieio => base("eieio", String::new(), 0), PpcOpcode::isync => base("isync", String::new(), 0), @@ -828,7 +836,10 @@ fn fmt_bc(instr: &DecodedInstr) -> DisasmText { if decr { let z = if bo & 0x02 != 0 { "z" } else { "nz" }; - let cond_str = cond_name_opt.unwrap_or(""); + // BO bit 4 (uncond) means CR is ignored — pure CTR-decrement branch. 
+ // Without this guard, bdnz/bdz would emit a spurious `ge` suffix derived + // from the don't-care BI=0 / cond_true=false pair (PPCBUG-640). + let cond_str = if uncond { "" } else { cond_name_opt.unwrap_or("") }; let ext_mnem = format!("bd{z}{cond_str}{a}{l}"); let ext_ops = format!("{cr}0x{target:08X}"); with_ext(&base_mnem, base_ops, 8, &ext_mnem, ext_ops, 8) diff --git a/crates/xenia-cpu/src/interpreter.rs b/crates/xenia-cpu/src/interpreter.rs index 3bb3d56..19fa865 100644 --- a/crates/xenia-cpu/src/interpreter.rs +++ b/crates/xenia-cpu/src/interpreter.rs @@ -846,7 +846,7 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) - } let ctr_ok = (bo & 0b00100) != 0 - || ((ctx.ctr != 0) ^ ((bo & 0b00010) != 0)); + || (((ctx.ctr as u32) != 0) ^ ((bo & 0b00010) != 0)); let cond_ok = (bo & 0b10000) != 0 || (ctx.get_cr_bit(bi) == ((bo & 0b01000) != 0)); @@ -876,7 +876,7 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) - } let ctr_ok = (bo & 0b00100) != 0 - || ((ctx.ctr != 0) ^ ((bo & 0b00010) != 0)); + || (((ctx.ctr as u32) != 0) ^ ((bo & 0b00010) != 0)); let cond_ok = (bo & 0b10000) != 0 || (ctx.get_cr_bit(bi) == ((bo & 0b01000) != 0)); @@ -1520,7 +1520,7 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) - match spr { crate::context::spr::XER => ctx.set_xer(val as u32), crate::context::spr::LR => ctx.lr = val, - crate::context::spr::CTR => ctx.ctr = val, + crate::context::spr::CTR => ctx.ctr = val as u32 as u64, crate::context::spr::DEC => ctx.dec = val as u32, crate::context::spr::TBL_WRITE => { ctx.timebase = (ctx.timebase & 0xFFFF_FFFF_0000_0000) | (val & 0xFFFF_FFFF); @@ -1919,11 +1919,10 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) - ctx.pc += 4; } PpcOpcode::vmaddfp128 => { - // VMX128 form: vD <- (vA * vB) + vD (vD reused as accumulator; - // Canary `InstrEmit_vmaddfp128` routes guest VA/VB/VD through - // `InstrEmit_vmaddfp_` with 
arg order swapped so the resulting - // HIR computation is `VA * VB + VD`). Same unconditional denorm - // flush of all three inputs as scalar `vmaddfp`. + // ISA: (VD) <- (VA × VD) + VB. VD is both the second multiplicand and destination. + // Canary InstrEmit_vmaddfp128 (ppc_emit_altivec.cc:806-809): MulAdd(VA, VD, VB). + // Previous code computed ai.mul_add(bi, di) = VA×VB+VD — VB and VD roles swapped + // (PPCBUG-424). Fix: ai.mul_add(di, bi) = VA×VD+VB. let a = ctx.vr[instr.va128()].as_f32x4(); let b = ctx.vr[instr.vb128()].as_f32x4(); let d = ctx.vr[instr.vd128()].as_f32x4(); @@ -1932,7 +1931,7 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) - let ai = vmx::flush_denorm(a[i]); let bi = vmx::flush_denorm(b[i]); let di = vmx::flush_denorm(d[i]); - r[i] = ai.mul_add(bi, di); + r[i] = ai.mul_add(di, bi); } ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r); ctx.pc += 4; @@ -2982,16 +2981,21 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) - ctx.pc += 4; } PpcOpcode::stvewx128 => { - let ea = ea_indexed(ctx, instr) & !0xF; - // TODO PPCBUG-510: stvewx128 currently writes 16 bytes at ea & !0xF; the EA scope is - // wrong (should be word-aligned, 4 bytes only). When P3 fixes EA, this invalidate's - // range narrows automatically. - // PPCBUG-512: stvewx128 was missing invalidate_for_write. + // Mirror of stvewx: word-align EA, extract one 32-bit lane, write 4 bytes only. + // Previous code used & !0xF (16-byte) and wrote all 16 bytes, corrupting 12 + // adjacent bytes on every execution (PPCBUG-510). 
+ let ea_unaligned = ea_indexed(ctx, instr); + let ea = ea_unaligned & !0x3u32; if let Some(t) = ctx.reservation_table.as_ref().filter(|t| t.is_enabled()) { if t.has_active_reservers() { t.invalidate_for_write(ea); } } + let slot = ((ea_unaligned & 0xF) >> 2) as usize; let bytes = ctx.vr[instr.vs128()].as_bytes(); - for i in 0..16 { mem.write_u8(ea + i as u32, bytes[i]); } + let w = ((bytes[slot * 4] as u32) << 24) + | ((bytes[slot * 4 + 1] as u32) << 16) + | ((bytes[slot * 4 + 2] as u32) << 8) + | (bytes[slot * 4 + 3] as u32); + mem.write_u32(ea, w); ctx.pc += 4; } @@ -4292,11 +4296,11 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) - // ═════════════════════════════════════════════════════════════════ // §4j — VMX128 FMA / permute // ═════════════════════════════════════════════════════════════════ - // vmaddcfp128: vD = vD * vB + vA (using vD's current value as accumulator) + // vmaddcfp128: ISA (VD) <- (VA × VD) + VB — same operation as vmaddfp128 PpcOpcode::vmaddcfp128 => { - // Xbox-360-specific: vD = (vD * vB) + vA. Note the VD-reuse: VD is both - // a source operand (as multiplicand) and the destination. Canary & - // POWER8 hardware confirm denormal inputs are flushed regardless of NJ. + // ISA: (VD) <- (VA × VD) + VB. Canary InstrEmit_vmaddcfp128 (cc:819): MulAdd(VA, VD, VB). + // Previous code computed di.mul_add(bi, ai) = VD×VB+VA — both operands wrong + // (PPCBUG-425). Fix: ai.mul_add(di, bi) = VA×VD+VB. 
let a = ctx.vr[instr.va128()].as_f32x4(); let b = ctx.vr[instr.vb128()].as_f32x4(); let d = ctx.vr[instr.vd128()].as_f32x4(); @@ -4305,7 +4309,7 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) - let ai = vmx::flush_denorm(a[i]); let bi = vmx::flush_denorm(b[i]); let di = vmx::flush_denorm(d[i]); - r[i] = di.mul_add(bi, ai); + r[i] = ai.mul_add(di, bi); } ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r); ctx.pc += 4; @@ -5319,32 +5323,64 @@ mod tests { } /// VMX128 variant `vmaddfp128 vD, vA, vB` (primary op 5, key2 = 0b001101) - /// reuses vD as the accumulator: `vD <- (vA * vB) + vD`. Canary + /// reuses vD as the second multiplicand: `vD <- (vA × vD) + vB`. Canary /// `ppc_emit_altivec.cc:786-810` flushes *all three* inputs - /// unconditionally before the fused multiply-add — the 128-bit form - /// must match the scalar `vmaddfp` behaviour. Prior to this fix the - /// interpreter skipped the flush, leaving subnormal noise in math- - /// heavy game code. + /// unconditionally before the fused multiply-add. #[test] fn vmaddfp128_flushes_denormal_inputs() { let mut ctx = PpcContext::new(); let mut mem = TestMem::new(); let denorm = f32::from_bits(1); - // vA=v2 carries denorms, which is also vD's accumulator input. + // VA=v1, VD=v2, VB=v3 — all carry denormals. + ctx.vr[1] = xenia_types::Vec128::from_f32x4_array([denorm; 4]); ctx.vr[2] = xenia_types::Vec128::from_f32x4_array([denorm; 4]); - // vB=v3 = 1.0 — denormal input survives only if not flushed. - ctx.vr[3] = xenia_types::Vec128::from_f32x4_array([1.0f32; 4]); - // vmaddfp128 vD=v2, vA=v2, vB=v3: low 5 bits 00010 shared - // between vA and vD, vB=3 at PPC bits 16-20, key2=0b001101. - let raw: u32 = 0x1440_18D0; + ctx.vr[3] = xenia_types::Vec128::from_f32x4_array([denorm; 4]); + // vmaddfp128 vD=v2, vA=v1, vB=v3: op6=5, vd_lo=2, va_lo=1, vb_lo=3, key2=0b001101. + // VA×VD+VB: all three flushed → 0*0+0 = 0. 
+ let raw: u32 = (5u32 << 26) | (2 << 21) | (1 << 16) | (3 << 11) | (3 << 6) | (1 << 4); write_instr(&mut mem, 0, raw); ctx.pc = 0; step(&mut ctx, &mut mem); - // Without flush: denorm*1.0 + denorm = 2*denorm ≠ 0. - // With flush: 0*0 + 0 = 0. assert_eq!(ctx.vr[2].as_f32x4(), [0.0f32; 4]); } + // ---- PPCBUG-424+425: vmaddfp128/vmaddcfp128 operand swap ---- + // ISA for both: (VD) <- (VA × VD) + VB. Previous code computed VA×VB+VD and VD×VB+VA. + // Test uses distinct VA, VB, VD registers so the swap is visible. + // Encoding: op6=5, key2=0b001101 (vmaddfp128) / 0b010001 (vmaddcfp128). + // VA=v1=[2.0], VB=v2=[10.0], VD=v3=[3.0] → expected 2.0×3.0+10.0 = 16.0. + // Buggy vmaddfp128: 2.0×10.0+3.0 = 23.0. Buggy vmaddcfp128: 3.0×10.0+2.0 = 32.0. + + #[test] + fn vmaddfp128_operand_order_va_times_vd_plus_vb() { + let mut ctx = PpcContext::new(); + let mut mem = TestMem::new(); + ctx.vr[1] = xenia_types::Vec128::from_f32x4_array([2.0f32; 4]); // VA=v1 + ctx.vr[2] = xenia_types::Vec128::from_f32x4_array([10.0f32; 4]); // VB=v2 + ctx.vr[3] = xenia_types::Vec128::from_f32x4_array([3.0f32; 4]); // VD=v3 (also destination) + // vmaddfp128 vD=v3, vA=v1, vB=v2 — op5, key2=0b001101 (bits22-25=3, bit27=1) + let raw: u32 = (5u32 << 26) | (3 << 21) | (1 << 16) | (2 << 11) | (3 << 6) | (1 << 4); + write_instr(&mut mem, 0, raw); + ctx.pc = 0; + step(&mut ctx, &mut mem); + assert_eq!(ctx.vr[3].as_f32x4(), [16.0f32; 4], "VA×VD+VB = 2*3+10 = 16"); + } + + #[test] + fn vmaddcfp128_operand_order_va_times_vd_plus_vb() { + let mut ctx = PpcContext::new(); + let mut mem = TestMem::new(); + ctx.vr[1] = xenia_types::Vec128::from_f32x4_array([2.0f32; 4]); // VA=v1 + ctx.vr[2] = xenia_types::Vec128::from_f32x4_array([10.0f32; 4]); // VB=v2 + ctx.vr[3] = xenia_types::Vec128::from_f32x4_array([3.0f32; 4]); // VD=v3 + // vmaddcfp128 vD=v3, vA=v1, vB=v2 — op5, key2=0b010001 (bits22-25=4, bit27=1) + let raw: u32 = (5u32 << 26) | (3 << 21) | (1 << 16) | (2 << 11) | (4 << 6) | (1 << 4); + 
write_instr(&mut mem, 0, raw); + ctx.pc = 0; + step(&mut ctx, &mut mem); + assert_eq!(ctx.vr[3].as_f32x4(), [16.0f32; 4], "VA×VD+VB = 2*3+10 = 16"); + } + /// VMX128 `vnmsubfp128 vD, vA, vB` (key2 = 0b010101). Canary /// `ppc_emit_altivec.cc:1133-1160` flushes all three inputs in the /// helper. Semantics: `vD <- -((vA * vB) - vD) = vD - vA*vB`. @@ -5798,6 +5834,59 @@ mod tests { assert_eq!(ctx.timebase >> 32, 0xAAAA_BBBB); } + // PPCBUG-053: bcx CTR zero-test must use 32-bit comparison. When prior + // 64-bit pollution (e.g. via negx → mtctr) leaves CTR upper 32 bits + // non-zero, the 64-bit `ctx.ctr != 0` would loop forever even when the + // 32-bit counter has decremented to zero. + #[test] + fn bcx_bdnz_uses_32bit_ctr_compare() { + let mut ctx = PpcContext::new(); + let mut mem = TestMem::new(); + ctx.ctr = 0x0000_0001_0000_0001; + // bdnz +8: BO=16 (decrement, branch if CTR!=0, ignore CR), BI=0, BD/4=2 + let raw = (16u32 << 26) | (16 << 21) | (0 << 16) | (2 << 2); + write_instr(&mut mem, 0, raw); + ctx.pc = 0; + step(&mut ctx, &mut mem); + // After decrement: low 32 = 0, high 32 = 1. 32-bit test says zero → no branch. + assert_eq!(ctx.ctr, 0x0000_0001_0000_0000); + assert_eq!(ctx.pc, 4); + } + + #[test] + fn bclrx_uses_32bit_ctr_compare() { + let mut ctx = PpcContext::new(); + let mut mem = TestMem::new(); + ctx.ctr = 0x0000_0001_0000_0001; + ctx.lr = 0x100; + // bdnzlr: opcode 19, BO=16 (decrement, branch if CTR!=0), BI=0, XO=16 + let raw = (19u32 << 26) | (16 << 21) | (0 << 16) | (16 << 1); + write_instr(&mut mem, 0, raw); + ctx.pc = 0; + step(&mut ctx, &mut mem); + // 32-bit CTR=0 after decrement → don't branch to LR. + assert_eq!(ctx.ctr, 0x0000_0001_0000_0000); + assert_eq!(ctx.pc, 4); + } + + // PPCBUG-054: mtspr CTR must truncate the source GPR to 32 bits, matching + // canary's `f.Truncate(ctr, INT32_TYPE)`. Prevents upstream 64-bit GPR + // pollution from poisoning the 32-bit CTR counter independently of the + // bcx zero-test fix. 
+ #[test] + fn mtspr_ctr_truncates_to_32_bits() { + let mut ctx = PpcContext::new(); + let mut mem = TestMem::new(); + ctx.gpr[3] = 0xFFFF_FFFF_8000_0001; + // mtspr CTR (9), r3 + let spr_swapped = ((9u32 & 0x1F) << 5) | ((9u32 >> 5) & 0x1F); + let raw = (31u32 << 26) | (3 << 21) | (spr_swapped << 11) | (467 << 1); + write_instr(&mut mem, 0, raw); + ctx.pc = 0; + step(&mut ctx, &mut mem); + assert_eq!(ctx.ctr, 0x8000_0001); + } + // ---------- Block-cache parity tests ---------- // // These confirm that running a program through the basic-block @@ -6395,13 +6484,25 @@ mod tests { // For vpkd3d128, PPC bits 21-22 are always 1 (key2 discriminant), so // vd128 is always in range [96, 127] for vd_lo in [0, 31]. - fn encode_vpkd3d128(vd_lo: u32, vb_lo: u32, imm: u32, z: u32) -> u32 { - // op6=6 (all VMX128 compute ops); VD[4:0] at host 25-21; IMM at host 20-16; - // VB[4:0] at host 15-11; host bits 10,9 = 1,1 (PPC bits 21,22, key2=0b110); - // z (2-bit) at host 7-6 (PPC bits 24-25); host bit 4 = 1 (PPC bit 27, key2 low=0b01). - // decode_op6 key2 = (bits21-23<<4)|(bits26-27) = (0b110<<4)|0b01 = 0b1100001. - (6u32 << 26) | (vd_lo << 21) | (imm << 16) | (vb_lo << 11) - | (1 << 10) | (1 << 9) | (z << 6) | (1 << 4) + fn encode_vpkd3d128(vd: u32, vb_lo: u32, imm: u32, z: u32) -> u32 { + // op6=6, FormatVX128_4 layout (canary): + // VD low at PPC 6-10 (host 21-25); VD high (2 bits) at PPC 28-29 (host 2-3). + // IMM at PPC 11-15; VB low at PPC 16-20. + // z (2-bit) at PPC 24-25 (host 6-7). 
+ // key2 = 0b1100001 over bits 21-23 + 26-27: + // bits 21-23 = 0b110 → bit 21=1, bit 22=1, bit 23=0 + // bits 26-27 = 0b01 → bit 26=0, bit 27=1 + let vd_lo = vd & 0x1F; + let vd_hi = (vd >> 5) & 0x3; + (6u32 << 26) + | (vd_lo << 21) + | (vd_hi << 2) + | (imm << 16) + | (vb_lo << 11) + | (1 << 10) // bit 21 (key2) + | (1 << 9) // bit 22 (key2) + | (z << 6) // z at PPC 24-25 + | (1 << 4) // bit 27 (key2) } #[test] @@ -6415,7 +6516,7 @@ mod tests { ctx.vr[1] = xenia_types::Vec128::from_f32x4(1.0, 0.0, 0.0, 0.0); // prev vd=96: sentinel values that should NOT appear in result ctx.vr[96] = xenia_types::Vec128::from_u32x4(0xAAAA_AAAA, 0xBBBB_BBBB, 0xCCCC_CCCC, 0xDDDD_DDDD); - write_instr(&mem, 0, encode_vpkd3d128(0, 1, 0, 0)); + write_instr(&mem, 0, encode_vpkd3d128(96, 1, 0, 0)); ctx.pc = 0; step(&mut ctx, &mem); let r = ctx.vr[96].as_u32x4(); @@ -6435,7 +6536,7 @@ mod tests { let mem = TestMem::new(); ctx.vr[1] = xenia_types::Vec128::from_f32x4(1.0, 0.0, 0.0, 0.0); // out[3]=0x00FF_0000 ctx.vr[96] = xenia_types::Vec128::from_u32x4(0x1111_1111, 0x2222_2222, 0x3333_3333, 0x4444_4444); - write_instr(&mem, 0, encode_vpkd3d128(0, 1, 1, 0)); + write_instr(&mem, 0, encode_vpkd3d128(96, 1, 1, 0)); ctx.pc = 0; step(&mut ctx, &mem); let r = ctx.vr[96].as_u32x4(); @@ -6454,7 +6555,7 @@ mod tests { let mem = TestMem::new(); ctx.vr[1] = xenia_types::Vec128::from_f32x4(1.0, 0.0, 0.0, 0.0); // out[3]=0x00FF_0000 ctx.vr[96] = xenia_types::Vec128::from_u32x4(0x1111_1111, 0x2222_2222, 0x3333_3333, 0x4444_4444); - write_instr(&mem, 0, encode_vpkd3d128(0, 1, 1, 3)); + write_instr(&mem, 0, encode_vpkd3d128(96, 1, 1, 3)); ctx.pc = 0; step(&mut ctx, &mem); let r = ctx.vr[96].as_u32x4(); @@ -6463,4 +6564,47 @@ mod tests { assert_eq!(r[2], 0x3333_3333, "pack=1 shift=3: lane 2 from prev"); assert_eq!(r[3], 0x4444_4444, "pack=1 shift=3: lane 3 from prev"); } + + // ---- PPCBUG-510: stvewx128 should write one word (4 bytes), not 16 ---- + + fn encode_stvewx128(vs_lo: u32, ra: u32, rb: u32) -> 
u32 { + // stvewx128 is a VMX128 load/store at op6=4. + // decode_op4 key1 = (bits21-27 << 4) | bits30-31 = 0b00110000011 for stvewx128. + // bits21-27 = 0b0011000 (host bits 10-4), bits30-31 = 0b11 (host bits 1-0). + // VS128[4:0] at host bits 25-21; RA at host bits 20-16; RB at host bits 15-11. + // VS128[6:5] at host bits 3-2 (PPC bits 28-29); left zero here, so vs128 = vs_lo. + (4u32 << 26) + | (vs_lo << 21) // VS128[4:0] + | (ra << 16) // RA + | (rb << 11) // RB + | (0b0011000 << 4) // bits 21-27 of key1 pattern + | 0b11 // bits 30-31 of key1 pattern + } + + #[test] + fn stvewx128_writes_one_word_at_word_aligned_ea() { + // PPCBUG-510: old code wrote all 16 bytes at ea & !0xF, corrupting 12 adjacent bytes. + // Fix: word-align EA, extract lane from (ea & 0xF) >> 2, write 4 bytes only. + let mut ctx = PpcContext::new(); + let mem = TestMem::new(); + // vs128 shares vd128's canary layout (PPCBUG-700): the low 5 bits sit at PPC 6-10 + // (host 25-21) and the high 2 bits at PPC 28-29 (host 3-2). + // encode_stvewx128 puts vs_lo at host 25-21 and the key fields at host 10-4 and 1-0, + // leaving host 3-2 clear. + // vs128 = bits6-10 | (bits28-29 << 5) = vs_lo | 0 = vs_lo. + // So vs128 = vs_lo. We'll use vs_lo=3 → vs128=3. 
+ let raw = encode_stvewx128(3, 1, 2); + ctx.vr[3] = xenia_types::Vec128::from_u32x4(0x1111_1111, 0x2222_2222, 0x3333_3333, 0x4444_4444); + ctx.gpr[1] = 0x1000; // base + ctx.gpr[2] = 0x008; // offset → EA = 0x1008 → word-aligned EA = 0x1008, slot = (0x8 & 0xF)>>2 = 2 + write_instr(&mem, 0, raw); + ctx.pc = 0; + step(&mut ctx, &mem); + assert_eq!(ctx.pc, 4, "PC must advance"); + // Slot 2 → lane 2 = 0x3333_3333 + assert_eq!(mem.read_u32(0x1008), 0x3333_3333, "only lane 2 word at ea"); + // Adjacent words must be untouched (mem is zero-init) + assert_eq!(mem.read_u32(0x1000), 0x0000_0000, "byte below must be untouched"); + assert_eq!(mem.read_u32(0x100C), 0x0000_0000, "byte above must be untouched"); + } } diff --git a/crates/xenia-cpu/tests/disasm_goldens.rs b/crates/xenia-cpu/tests/disasm_goldens.rs index 97b3825..ec617a0 100644 --- a/crates/xenia-cpu/tests/disasm_goldens.rs +++ b/crates/xenia-cpu/tests/disasm_goldens.rs @@ -50,6 +50,23 @@ fn fixture_path(name: &str) -> PathBuf { .join(name) } +/// Encode a VMX128 VX128-form (or VX128_R/_2) instruction with canary's +/// 7-bit register layout: VD low at PPC 6-10, high 2 bits at PPC 28-29; +/// VA low at PPC 11-15, mid bit at PPC 26, high bit at PPC 21; VB low at +/// PPC 16-20, high 2 bits at PPC 30-31. `secondary_bits` carries any +/// secondary opcode + VC + Rc + key bits the caller needs. +fn encode_vx128(op6: u32, vd: u32, va: u32, vb: u32, secondary_bits: u32) -> u32 { + ((op6 & 0x3F) << 26) + | ((vd & 0x1F) << 21) + | (((vd >> 5) & 0x3) << 2) + | ((va & 0x1F) << 16) + | (((va >> 5) & 0x1) << 5) + | (((va >> 6) & 0x1) << 10) + | ((vb & 0x1F) << 11) + | (((vb >> 5) & 0x3) << 0) + | secondary_bits +} + fn build_rows(cases: &[(u32, u32, &str)]) -> Vec { cases .iter() @@ -428,77 +445,57 @@ fn vmx128_registers() { ((4u32 << 26) | (5 << 11) | 1604, 0x82000000, "mtvscr v5"), ]; - // VMX128 op=5 — uses vd128/va128/vb128 (7-bit registers, high bits at - // 21+22). 
These are the silent-bug-area encodings; we exercise low - // register indices here because the secondary-opcode key for op=5 - // includes bits 21-22, constraining vd128 high bits to 0 in this form. - // High-index examples for vd128 live in the op=6 series below. + // VMX128 op=5: vperm128 v3, v4, v5, vc=0. Canary FormatVX128: VD low + // at PPC 6-10, VA low at PPC 11-15, VB low at PPC 16-20, VC at PPC 23-25. + // key1 = (bit22<<5)|bit27 = 0 selects vperm128. let vmx128_op5 = [ - // vaddfp128 v3, v4, v5 : op=5, key2=0b000001 - ((5u32 << 26) | (3 << 21) | (4 << 16) | (5 << 11) | (0 << 6) | (1 << 0), 0x82000000, "vaddfp128 (encoded sloppily)"), + (encode_vx128(5, 3, 4, 5, 0), 0x82000000, "vperm128 v3, v4, v5, 0 (canary)"), ]; - // VMX128 op=6 — vrlimi128 has secondary key in bits 23-25 + 26-27, so - // bits 21-22 ARE the high bits of vd128 (canonical silent-bug-area). - // These instructions exercise vd128 = 32, 64, 96 — covering the bit-21 - // and bit-22 split that ppc.rs's old extractor (now deleted) miscoded. - let vrlimi128 = |vd: u32, vb: u32, imm: u32, z: u32| -> u32 { - // op=6, vd128 = bits 6-10 + bit 21 + bit 22, vb128 = bits 16-20 + bits 30+31, - // IMM = bits 11-15, Z = bits 24-25, key2 = (bits 23-25 << 4) | bits 26-27 = 0b1110001 - let vd_lo = vd & 0x1F; - let vd_b21 = (vd >> 5) & 1; - let vd_b22 = (vd >> 6) & 1; - let vb_lo = vb & 0x1F; - let vb_b30 = (vb >> 5) & 1; - let vb_b31 = (vb >> 6) & 1; - // bits 23-25 = 111, bits 26-27 = 00, bit 27 = 1 → key2 lower 4 bits = 0001 - // Encoded: bits 23-25 = 111, bits 26-27 = 00 are actually overlapping with z field (bits 24-25) - // The plan view: (bits 23 << 6) | (bits 24-25 << 4) | (bits 26-27 << 2) but the table uses different. - // Easiest: hand-encode known bit pattern matching decoder.rs's match: - // key2 = (extract_bits(code, 23, 25) << 4) | extract_bits(code, 26, 27) = 0b1110001 - // bits 23-25 = 111, bits 26-27 = 01 - // Bit positions 23-27 = 11101 (5 bits, MSB at 23). 
- // PPC bit 23 (LSB index 8): set - // PPC bit 24 (LSB index 7): set -- this is z bit 0 - // PPC bit 25 (LSB index 6): set -- this is z bit 1 - // PPC bit 26 (LSB index 5): unset - // PPC bit 27 (LSB index 4): set - // We let z = bits 24-25 stored with vd128 bits at 21-22. - // To preserve key2 = 0b1110001, we need bits 24-25 = 11, bit 26 = 0, bit 27 = 1. - // BUT bits 24-25 ARE the z field; if we set them = 11 the z value is 3. - // So Z is constrained for vrlimi128. Choose Z = 3 (matches Sylpheed examples). - let z3 = z & 0x3; - (6u32 << 26) - | (vd_lo << 21) - | (imm << 16) - | (vb_lo << 11) - | (vd_b21 << 10) // bit 21 (LSB pos 10) - | (vd_b22 << 9) // bit 22 (LSB pos 9) - | (1 << 8) // bit 23 - | (z3 << 6) // bits 24-25 - | (0 << 5) // bit 26 - | (1 << 4) // bit 27 - | (vb_b30 << 1) // bit 30 - | vb_b31 // bit 31 + // VMX128 op=6 — exercise vd128 = 0, 32, 64, 96 and 127 (every VD128h value) under canary's layout. + // VD128h is at PPC 28-29 (host 2-3): no overlap with secondary opcode key, + // so vd can be freely 0-127 for any op6 instruction. + let vsrw128 = |vd: u32, vb: u32| -> u32 { + // vsrw128 secondary: 0x000001D0 (decode_op6 key5 = 0b011101). + encode_vx128(6, vd, 0, vb, 0x000001D0) + }; + let vpermwi128 = |vd: u32, vb: u32, perm: u32| -> u32 { + // vpermwi128: PERMl at PPC 11-15, PERMh at PPC 23-25, key1 sets bit 22 + bit 27. + let perml = perm & 0x1F; + let permh = (perm >> 5) & 0x7; + let mut raw = (6u32 << 26) + | ((vd & 0x1F) << 21) + | (((vd >> 5) & 0x3) << 2) // VD128h + | (perml << 16) + | ((vb & 0x1F) << 11) + | (((vb >> 5) & 0x3) << 0) // VB128h + | (permh << 6) // PERMh at PPC 23-25 + | (1 << 9) // bit 22 (key1 high) + | (1 << 4); // bit 27 (key1 low) + raw &= !(1 << 10); // PPC 21 = 0 for vpermwi128 + raw + }; + let vrlimi128 = |vd: u32, vb: u32, imm: u32, z: u32| -> u32 { + // vrlimi128: IMM at PPC 11-15, z at PPC 24-25, key2 = 0b1110001 over + // bits 21-23 + 26-27 → bits 21,22,23 = 1, bit 26 = 0, bit 27 = 1.
+ (6u32 << 26) + | ((vd & 0x1F) << 21) + | (((vd >> 5) & 0x3) << 2) // VD128h + | ((imm & 0x1F) << 16) + | ((vb & 0x1F) << 11) + | (((vb >> 5) & 0x3) << 0) // VB128h + | ((z & 0x3) << 6) // z at PPC 24-25 = host 6-7 + | (1 << 8) // bit 23 (key2) + | (1 << 9) // bit 22 (key2) + | (1 << 10) // bit 21 (key2) + | (1 << 4) // bit 27 (key2) }; - // Note: VMX128 op6 secondary keys constrain bits 21-23. For - // vrlimi128 (key2 = 0b1110001 over bits 21-23 + 26-27) the only - // valid vd128 range is 96..=127 — lower values change the secondary - // key into some other instruction. The cases below record what the - // disassembler emits for the borderline encodings, so a regression - // in either the lookup table or the formatter would surface here. let vmx128_high = [ - // bits 21-22 = 00 → key2 ≠ vrlimi128 → decodes to vsrw128 (key5 - // branch). Locks current behavior; shows the silent-bug-area - // encoding constraint. - (vrlimi128(0, 12, 4, 3), 0x82000000, "encoding vd_hi=00: actually vsrw128"), - // bits 21-22 = 10 → still not vrlimi128. - (vrlimi128(32, 12, 4, 3), 0x82000000, "encoding vd_hi=10: actually vsrw128 v32"), - // bits 21-22 = 01 → key1 matches vpermwi128. - (vrlimi128(64, 12, 4, 3), 0x82000000, "encoding vd_hi=01: actually vpermwi128"), - // bits 21-22 = 11 → key2 matches vrlimi128 with vd128=96. - (vrlimi128(96, 12, 4, 3), 0x82000000, "vrlimi128 v96, v12, 4, 3 (real)"), - (vrlimi128(127, 127, 4, 3), 0x82000000, "vrlimi128 v127, v127, 4, 3 (real)"), + (vsrw128(0, 12), 0x82000000, "vsrw128 v0, v0, v12 (canary, vd_hi=00)"), + (vsrw128(32, 12), 0x82000000, "vsrw128 v32, v0, v12 (canary, VD128h=01)"), + (vpermwi128(64, 12, 0xE4), 0x82000000, "vpermwi128 v64, v12, 0xE4 (canary, VD128h=10)"), + (vrlimi128(96, 12, 4, 3), 0x82000000, "vrlimi128 v96, v12, 4, 3 (canary, VD128h=11)"), + (vrlimi128(127, 95, 4, 3), 0x82000000, "vrlimi128 v127, v95, 4, 3 (canary)"), ]; // Fix 4: VMX128 multiply-add 4-operand layouts. 
Per canary, the addend @@ -514,12 +511,11 @@ fn vmx128_registers() { // vmaddcfp128 VD, VA, VD, VB → "v3, v35, v3, v5" // vnmsubfp128 VD, VA, VD, VB → "v3, v35, v3, v5" let vmx128_4op = [ - // vmaddfp128: bits 24=1, 25=1, 27=1, bit 29=1 (VA high), VB=5 - (0x146028D4u32, 0x82000000, "vmaddfp128 v3, v35, v5, v3"), - // vmaddcfp128: bits 23=1, 27=1, bit 29=1, VB=5 - (0x14602914u32, 0x82000000, "vmaddcfp128 v3, v35, v3, v5"), - // vnmsubfp128: bits 23=1, 25=1, 27=1, bit 29=1, VB=5 - (0x14602954u32, 0x82000000, "vnmsubfp128 v3, v35, v3, v5"), + // Canary FormatVX128 layout: vd=3 (PPC 6-10), va=35 (low five bits = 3 at PPC 11-15, plus VA128h=1 at PPC 26), + // vb=5 (PPC 16-20), key2 at PPC 22-25 + bit 27. + (0x146328F0u32, 0x82000000, "vmaddfp128 v3, v35, v5, v3"), + (0x14632930u32, 0x82000000, "vmaddcfp128 v3, v35, v3, v5"), + (0x14632970u32, 0x82000000, "vnmsubfp128 v3, v35, v3, v5"), ]; let mut all = Vec::new(); diff --git a/crates/xenia-cpu/tests/golden/extended_mnemonics.json b/crates/xenia-cpu/tests/golden/extended_mnemonics.json index efb251b..0c442dd 100644 --- a/crates/xenia-cpu/tests/golden/extended_mnemonics.json +++ b/crates/xenia-cpu/tests/golden/extended_mnemonics.json @@ -366,7 +366,7 @@ "addr": "0x82000000", "mnemonic": "bc", "operands": "16, lt, 0x82000040", - "ext_mnemonic": "bdnzge", + "ext_mnemonic": "bdnz", "ext_operands": "0x82000040", "branch_target": "0x82000040" }, @@ -376,7 +376,7 @@ "addr": "0x82000000", "mnemonic": "bc", "operands": "18, lt, 0x82000040", - "ext_mnemonic": "bdzge", + "ext_mnemonic": "bdz", "ext_operands": "0x82000040", "branch_target": "0x82000040" }, @@ -525,7 +525,9 @@ "raw": "0x7C2004AC", "addr": "0x82000000", "mnemonic": "sync", - "operands": "" + "operands": "", + "ext_mnemonic": "lwsync", + "ext_operands": "" }, { "label": "trap", diff --git a/crates/xenia-cpu/tests/golden/vmx128_registers.json b/crates/xenia-cpu/tests/golden/vmx128_registers.json index 1d072cd..12cf44d 100644 --- a/crates/xenia-cpu/tests/golden/vmx128_registers.json +++
b/crates/xenia-cpu/tests/golden/vmx128_registers.json @@ -71,64 +71,64 @@ "operands": "v5" }, { - "label": "vaddfp128 (encoded sloppily)", - "raw": "0x14642801", + "label": "vperm128 v3, v4, v5, 0 (canary)", + "raw": "0x14642800", "addr": "0x82000000", "mnemonic": "vperm128", - "operands": "v3, v3, v5, 0" + "operands": "v3, v4, v5, 0" }, { - "label": "encoding vd_hi=00: actually vsrw128", - "raw": "0x180461D0", + "label": "vsrw128 v0, v0, v12 (canary, vd_hi=00)", + "raw": "0x180061D0", "addr": "0x82000000", "mnemonic": "vsrw128", "operands": "v0, v0, v12" }, { - "label": "encoding vd_hi=10: actually vsrw128 v32", - "raw": "0x180465D0", + "label": "vsrw128 v32, v0, v12 (canary, VD128h=01)", + "raw": "0x180061D4", "addr": "0x82000000", "mnemonic": "vsrw128", "operands": "v32, v0, v12" }, { - "label": "encoding vd_hi=01: actually vpermwi128", - "raw": "0x180463D0", + "label": "vpermwi128 v64, v12, 0xE4 (canary, VD128h=10)", + "raw": "0x180463D8", "addr": "0x82000000", "mnemonic": "vpermwi128", "operands": "v64, v12, 0xE4" }, { - "label": "vrlimi128 v96, v12, 4, 3 (real)", - "raw": "0x180467D0", + "label": "vrlimi128 v96, v12, 4, 3 (canary, VD128h=11)", + "raw": "0x180467DC", "addr": "0x82000000", "mnemonic": "vrlimi128", "operands": "v96, v12, 4, 3" }, { - "label": "vrlimi128 v127, v127, 4, 3 (real)", - "raw": "0x1BE4FFD3", + "label": "vrlimi128 v127, v95, 4, 3 (canary)", + "raw": "0x1BE4FFDE", "addr": "0x82000000", "mnemonic": "vrlimi128", "operands": "v127, v95, 4, 3" }, { "label": "vmaddfp128 v3, v35, v5, v3", - "raw": "0x146028D4", + "raw": "0x146328F0", "addr": "0x82000000", "mnemonic": "vmaddfp128", "operands": "v3, v35, v5, v3" }, { "label": "vmaddcfp128 v3, v35, v3, v5", - "raw": "0x14602914", + "raw": "0x14632930", "addr": "0x82000000", "mnemonic": "vmaddcfp128", "operands": "v3, v35, v3, v5" }, { "label": "vnmsubfp128 v3, v35, v3, v5", - "raw": "0x14602954", + "raw": "0x14632970", "addr": "0x82000000", "mnemonic": "vnmsubfp128", "operands": "v3, v35, v3, v5"