From f6a444b9d14a040db9421bb2860bcedb343d9498 Mon Sep 17 00:00:00 2001
From: MechaCat02 <fabian@diekaulbachs.de>
Date: Sat, 2 May 2026 12:13:08 +0200
Subject: [PATCH 1/7] fix(cpu): PPCBUG-221+227 round_to_i64 + PPCBUG-432 vrfin
 round-to-even
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Phase 5 batch 1 (5a): round-to-int correctness.

PPCBUG-221+227 (coupled): round_to_i64 NearestEven tie-breaking used
`(diff - 0.5).abs() < f64::EPSILON` to detect half-integers. That test
is approximate: a non-tie within EPSILON of a half (e.g. 0.5 + 2^-53)
was rounded as if it were a tie, and everything else fell through to
v.round() (round-half-away-from-zero). For |v| >= 2^52 every f64 value
is an exact integer (v.trunc() == v), so diff == 0 and no tie exists
there. Replaced with a fractional-part-only comparison that's exact for
|v| < 2^52 and degenerates to truncation above.

PPCBUG-432: vrfin/vrfin128 used Rust's `f32::round()` which is round-
half-away-from-zero. ISA requires round-to-nearest-even (banker's
rounding). Implemented inline.

PPCBUG-201 (FPSCR.RN for double arithmetic) deferred — requires
MXCSR-set/restore wrappers around 10+ FPU arms; will land in a focused
sub-batch after the remaining 5a-5f fixes.

Tests:
- round_to_i64_nearest_even_on_tie: extended with 0.5, 1.5, -0.5, -1.5.
- round_to_i64_non_tie_cases: 0.4/0.6 (non-tie sanity).
- round_to_i32_nearest_even_on_tie: PPCBUG-227 coverage.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 crates/xenia-cpu/src/fpscr.rs       | 47 ++++++++++++++++++++++++-----
 crates/xenia-cpu/src/interpreter.rs | 16 +++++++++-
 2 files changed, 54 insertions(+), 9 deletions(-)

diff --git a/crates/xenia-cpu/src/fpscr.rs b/crates/xenia-cpu/src/fpscr.rs
index 1d05e67..366e4f7 100644
--- a/crates/xenia-cpu/src/fpscr.rs
+++ b/crates/xenia-cpu/src/fpscr.rs
@@ -220,15 +220,22 @@ pub fn round_to_single(ctx: &PpcContext, v: f64) -> f64 {
 pub fn round_to_i64(ctx: &PpcContext, v: f64) -> i64 {
     match rounding_mode(ctx) {
         RoundingMode::NearestEven => {
-            // Round-half-to-even (banker's rounding).
-            let r = v.round();
-            // Rust's f64::round is round-half-away-from-zero. Correct ties to even:
-            let diff = (v - v.trunc()).abs();
-            if (diff - 0.5).abs() < f64::EPSILON {
-                let floor = v.floor();
-                if (floor as i64) & 1 == 0 { floor as i64 } else { v.ceil() as i64 }
+            // PPCBUG-221: round-half-to-even (banker's rounding). Classify on the
+            // fractional part alone: it is exact for |v| < 2^52 and is 0 above
+            // that, where every f64 is already an integer.
+            let t = v.trunc();
+            let fa = (v - t).abs();
+            let away = if v >= 0.0 { 1 } else { -1 };
+            if fa > 0.5 {
+                t as i64 + away
+            } else if fa == 0.5 {
+                // Exact tie: keep an even truncation, else step away from zero.
+                let fi = t as i64;
+                if fi & 1 == 0 { fi } else { fi + away }
             } else {
-                r as i64
+                // fa < 0.5, or fa is NaN because v is ±∞/NaN: the saturating cast
+                // is already the answer, so +∞ never reaches `i64::MAX + 1`.
+                t as i64
             }
         }
         RoundingMode::TowardZero => v.trunc() as i64,
@@ -355,11 +362,35 @@ mod tests {
     #[test]
     fn round_to_i64_nearest_even_on_tie() {
         let c = ctx();
+        assert_eq!(round_to_i64(&c, 0.5_f64), 0);
+        assert_eq!(round_to_i64(&c, 1.5_f64), 2);
         assert_eq!(round_to_i64(&c, 2.5_f64), 2);
         assert_eq!(round_to_i64(&c, 3.5_f64), 4);
+        assert_eq!(round_to_i64(&c, -0.5_f64), 0);
+        assert_eq!(round_to_i64(&c, -1.5_f64), -2);
         assert_eq!(round_to_i64(&c, -2.5_f64), -2);
     }
 
+    #[test]
+    fn round_to_i64_non_tie_cases() {
+        // PPCBUG-221 regression: non-tie fractions must round to nearest.
+        let c = ctx();
+        assert_eq!(round_to_i64(&c, 0.4_f64), 0);
+        assert_eq!(round_to_i64(&c, 0.6_f64), 1);
+        assert_eq!(round_to_i64(&c, -0.4_f64), 0);
+        assert_eq!(round_to_i64(&c, -0.6_f64), -1);
+    }
+
+    #[test]
+    fn round_to_i32_nearest_even_on_tie() {
+        // PPCBUG-227: round_to_i32 inherits round_to_i64's tie semantics.
+        let c = ctx();
+        assert_eq!(round_to_i32(&c, 0.5_f64), 0);
+        assert_eq!(round_to_i32(&c, 1.5_f64), 2);
+        assert_eq!(round_to_i32(&c, 2.5_f64), 2);
+        assert_eq!(round_to_i32(&c, -1.5_f64), -2);
+    }
+
     #[test]
     fn check_invalid_add_detects_inf_minus_inf() {
         let mut c = ctx();
diff --git a/crates/xenia-cpu/src/interpreter.rs b/crates/xenia-cpu/src/interpreter.rs
index aad2485..9ff2b6b 100644
--- a/crates/xenia-cpu/src/interpreter.rs
+++ b/crates/xenia-cpu/src/interpreter.rs
@@ -2398,11 +2398,25 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
             ctx.pc += 4;
         }
         PpcOpcode::vrfin | PpcOpcode::vrfin128 => {
+            // PPCBUG-432: ISA round-to-nearest-even, NOT Rust's round-half-away-from-zero.
             let vb = if matches!(instr.opcode, PpcOpcode::vrfin128) { instr.vb128() } else { instr.rb() };
             let vd = if matches!(instr.opcode, PpcOpcode::vrfin128) { instr.vd128() } else { instr.rd() };
             let b = ctx.vr[vb].as_f32x4();
             let mut r = [0f32; 4];
-            for i in 0..4 { r[i] = b[i].round(); }
+            for i in 0..4 {
+                let x = b[i];
+                let t = x.trunc();
+                let frac = (x - t).abs();
+                r[i] = if frac > 0.5 {
+                    t + if x >= 0.0 { 1.0 } else { -1.0 }
+                } else if frac < 0.5 {
+                    t
+                } else {
+                    // Tie — round to even.
+                    let ti = t as i64;
+                    if ti & 1 == 0 { t } else { t + if x >= 0.0 { 1.0 } else { -1.0 } }
+                };
+            }
             ctx.vr[vd] = xenia_types::Vec128::from_f32x4_array(r);
             ctx.pc += 4;
         }

From 26b98975c3f5cfb2986aef0f6c372731199ec771 Mon Sep 17 00:00:00 2001
From: MechaCat02 <fabian@diekaulbachs.de>
Date: Sat, 2 May 2026 12:20:02 +0200
Subject: [PATCH 2/7] fix(cpu): PPCBUG-181/182/183/202/203/205 FMA VXISI + NaN
 sign preservation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Phase 5 batch 2 (5b): VXISI / NaN handling for the FMA family.

The 8 FMA opcodes (fmaddx/fmaddsx/fmsubx/fmsubsx/fnmaddx/fnmaddsx/fnmsubx/
fnmsubsx) all share two fix shapes:

1. VXISI on the add/sub step. The previous code passed `a*c` to
   check_invalid_add, which has separate rounding from the FMA. In
   extreme cases this gives the wrong sign (PPCBUG-202) or wrong infinity
   status. Worse, fmadds and fmsub[s]/fnmadd[s]/fnmsub[s] had NO
   add-step VXISI check at all (PPCBUG-181/182/203). The fnmsub pattern
   is the canonical Newton-Raphson step — the most common FPU path in
   Xbox 360 graphics code.

2. NaN sign preservation in fnmadd/fnmsub. ISA Book I §4.3.4 forbids
   negation of a NaN FMA result; Rust's unary `-` flips the IEEE-754
   sign bit (PPCBUG-183/205).

Fixes:
- fpscr.rs: new helper `check_invalid_fma_add(ctx, a, c, b, sub)` that
  derives VXISI from input properties (mathematical-product sign +
  b sign) instead of from the lossy `a*c` value. Also covers SNaN.
- interpreter.rs: all 8 FMA arms now use the new helper; fnmadd[s]/
  fnmsub[s] gate the negation on `!fma.is_nan()`.

Tests:
- fmsub_inf_minus_inf_sets_vxisi: regression for PPCBUG-203.
- fnmadd_nan_input_preserves_nan_sign: regression for PPCBUG-205.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 crates/xenia-cpu/src/fpscr.rs       | 27 +++++++++++
 crates/xenia-cpu/src/interpreter.rs | 71 +++++++++++++++++++++++++++--
 2 files changed, 93 insertions(+), 5 deletions(-)

diff --git a/crates/xenia-cpu/src/fpscr.rs b/crates/xenia-cpu/src/fpscr.rs
index 366e4f7..535a500 100644
--- a/crates/xenia-cpu/src/fpscr.rs
+++ b/crates/xenia-cpu/src/fpscr.rs
@@ -152,6 +152,33 @@ pub fn check_invalid_add(ctx: &mut PpcContext, a: f64, b: f64, sub: bool) -> boo
     false
 }
 
+/// FMA-aware add/sub VXISI check. Per PPCBUG-202+203: the previous code
+/// passed `a*c` as `lhs` to `check_invalid_add`, which suffers from two
+/// rounding errors and can spuriously raise/miss VXISI in extreme cases.
+/// This helper derives the mathematical product's sign and infinity status
+/// from the inputs directly.
+///
+/// `sub` follows the same semantics as `check_invalid_add`:
+///   - false (add): VXISI when product and b have opposite signs at infinity
+///   - true  (sub): VXISI when product and b have same sign at infinity
+pub fn check_invalid_fma_add(ctx: &mut PpcContext, a: f64, c: f64, b: f64, sub: bool) -> bool {
+    let mut bits = 0u32;
+    if is_snan(a) || is_snan(c) || is_snan(b) { bits |= VXSNAN; }
+    let product_is_inf = (a.is_infinite() || c.is_infinite())
+        && a != 0.0 && c != 0.0
+        && !a.is_nan() && !c.is_nan();
+    if product_is_inf && b.is_infinite() {
+        let p_neg = a.is_sign_negative() != c.is_sign_negative();
+        let b_neg = b.is_sign_negative();
+        let same_sign = p_neg == b_neg;
+        if (sub && same_sign) || (!sub && !same_sign) {
+            bits |= VXISI;
+        }
+    }
+    if bits != 0 { set_exception(ctx, bits); return true; }
+    false
+}
+
 pub fn check_invalid_mul(ctx: &mut PpcContext, a: f64, b: f64) -> bool {
     let mut bits = 0u32;
     if is_snan(a) || is_snan(b) { bits |= VXSNAN; }
diff --git a/crates/xenia-cpu/src/interpreter.rs b/crates/xenia-cpu/src/interpreter.rs
index 9ff2b6b..6395b25 100644
--- a/crates/xenia-cpu/src/interpreter.rs
+++ b/crates/xenia-cpu/src/interpreter.rs
@@ -2573,11 +2573,12 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
 
         // ===== FPU: Multiply-Add =====
         PpcOpcode::fmaddx => {
+            // PPCBUG-202: VXISI from input properties (not from `a*c` which has wrong sign on overflow).
             let a = ctx.fpr[instr.ra()];
             let c = ctx.fpr[instr.rc()];
             let b = ctx.fpr[instr.rb()];
             fpscr::check_invalid_mul(ctx, a, c);
-            fpscr::check_invalid_add(ctx, a * c, b, false);
+            fpscr::check_invalid_fma_add(ctx, a, c, b, false);
             let result = a.mul_add(c, b);
             ctx.fpr[instr.rd()] = result;
             fpscr::update_after_op(ctx, result, a.is_finite() && b.is_finite() && c.is_finite());
@@ -2585,10 +2586,12 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
             ctx.pc += 4;
         }
         PpcOpcode::fmaddsx => {
+            // PPCBUG-181: missing VXISI on add step.
             let a = ctx.fpr[instr.ra()];
             let c = ctx.fpr[instr.rc()];
             let b = ctx.fpr[instr.rb()];
             fpscr::check_invalid_mul(ctx, a, c);
+            fpscr::check_invalid_fma_add(ctx, a, c, b, false);
             let result = to_single(ctx, a.mul_add(c, b));
             ctx.fpr[instr.rd()] = result;
             fpscr::update_after_op(ctx, result, a.is_finite() && b.is_finite() && c.is_finite());
@@ -2596,10 +2599,12 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
             ctx.pc += 4;
         }
         PpcOpcode::fmsubx => {
+            // PPCBUG-203: missing VXISI on sub step.
             let a = ctx.fpr[instr.ra()];
             let c = ctx.fpr[instr.rc()];
             let b = ctx.fpr[instr.rb()];
             fpscr::check_invalid_mul(ctx, a, c);
+            fpscr::check_invalid_fma_add(ctx, a, c, b, true);
             let result = a.mul_add(c, -b);
             ctx.fpr[instr.rd()] = result;
             fpscr::update_after_op(ctx, result, a.is_finite() && b.is_finite() && c.is_finite());
@@ -2607,10 +2612,12 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
             ctx.pc += 4;
         }
         PpcOpcode::fmsubsx => {
+            // PPCBUG-182: missing VXISI on sub step.
             let a = ctx.fpr[instr.ra()];
             let c = ctx.fpr[instr.rc()];
             let b = ctx.fpr[instr.rb()];
             fpscr::check_invalid_mul(ctx, a, c);
+            fpscr::check_invalid_fma_add(ctx, a, c, b, true);
             let result = to_single(ctx, a.mul_add(c, -b));
             ctx.fpr[instr.rd()] = result;
             fpscr::update_after_op(ctx, result, a.is_finite() && b.is_finite() && c.is_finite());
@@ -2618,44 +2625,58 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
             ctx.pc += 4;
         }
         PpcOpcode::fnmaddx => {
+            // PPCBUG-203: missing VXISI. PPCBUG-205: NaN sign preserved (no negation on NaN).
             let a = ctx.fpr[instr.ra()];
             let c = ctx.fpr[instr.rc()];
             let b = ctx.fpr[instr.rb()];
             fpscr::check_invalid_mul(ctx, a, c);
-            let result = -(a.mul_add(c, b));
+            fpscr::check_invalid_fma_add(ctx, a, c, b, false);
+            let fma = a.mul_add(c, b);
+            let result = if fma.is_nan() { fma } else { -fma };
             ctx.fpr[instr.rd()] = result;
             fpscr::update_after_op(ctx, result, a.is_finite() && b.is_finite() && c.is_finite());
             if instr.rc_bit() { update_cr1_from_fpscr(ctx); }
             ctx.pc += 4;
         }
         PpcOpcode::fnmaddsx => {
+            // PPCBUG-181 + PPCBUG-183: VXISI + NaN sign preservation.
             let a = ctx.fpr[instr.ra()];
             let c = ctx.fpr[instr.rc()];
             let b = ctx.fpr[instr.rb()];
             fpscr::check_invalid_mul(ctx, a, c);
-            let result = to_single(ctx, -(a.mul_add(c, b)));
+            fpscr::check_invalid_fma_add(ctx, a, c, b, false);
+            let fma = a.mul_add(c, b);
+            let neg = if fma.is_nan() { fma } else { -fma };
+            let result = to_single(ctx, neg);
             ctx.fpr[instr.rd()] = result;
             fpscr::update_after_op(ctx, result, a.is_finite() && b.is_finite() && c.is_finite());
             if instr.rc_bit() { update_cr1_from_fpscr(ctx); }
             ctx.pc += 4;
         }
         PpcOpcode::fnmsubx => {
+            // PPCBUG-203: VXISI. PPCBUG-205: NaN sign preservation.
             let a = ctx.fpr[instr.ra()];
             let c = ctx.fpr[instr.rc()];
             let b = ctx.fpr[instr.rb()];
             fpscr::check_invalid_mul(ctx, a, c);
-            let result = -(a.mul_add(c, -b));
+            fpscr::check_invalid_fma_add(ctx, a, c, b, true);
+            let fma = a.mul_add(c, -b);
+            let result = if fma.is_nan() { fma } else { -fma };
             ctx.fpr[instr.rd()] = result;
             fpscr::update_after_op(ctx, result, a.is_finite() && b.is_finite() && c.is_finite());
             if instr.rc_bit() { update_cr1_from_fpscr(ctx); }
             ctx.pc += 4;
         }
         PpcOpcode::fnmsubsx => {
+            // PPCBUG-182 + PPCBUG-183: VXISI + NaN sign preservation.
             let a = ctx.fpr[instr.ra()];
             let c = ctx.fpr[instr.rc()];
             let b = ctx.fpr[instr.rb()];
             fpscr::check_invalid_mul(ctx, a, c);
-            let result = to_single(ctx, -(a.mul_add(c, -b)));
+            fpscr::check_invalid_fma_add(ctx, a, c, b, true);
+            let fma = a.mul_add(c, -b);
+            let neg = if fma.is_nan() { fma } else { -fma };
+            let result = to_single(ctx, neg);
             ctx.fpr[instr.rd()] = result;
             fpscr::update_after_op(ctx, result, a.is_finite() && b.is_finite() && c.is_finite());
             if instr.rc_bit() { update_cr1_from_fpscr(ctx); }
@@ -5632,6 +5653,46 @@ mod tests {
 
     // ---------- Phase 2h: FPU / FPSCR ----------
 
+    #[test]
+    fn fmsub_inf_minus_inf_sets_vxisi() {
+        // PPCBUG-203 regression: fmsub with a*c = +∞, -b = -∞ (b=+∞) →
+        // +∞ + (-∞) → VXISI. Pre-fix had no add-step VXISI check.
+        let mut ctx = PpcContext::new();
+        let mut mem = TestMem::new();
+        ctx.fpr[1] = f64::INFINITY;
+        ctx.fpr[2] = f64::INFINITY;  // b
+        ctx.fpr[3] = 1.0;
+        // fmsub f4, f1, f3, f2 → 1*∞ - ∞ = VXISI
+        // A-form: opcode=63, XO=28 (fmsub double): (63<<26)|(rd<<21)|(ra<<16)|(rb<<11)|(rc<<6)|(28<<1)
+        let raw = (63u32 << 26) | (4 << 21) | (1 << 16) | (2 << 11) | (3 << 6) | (28 << 1);
+        write_instr(&mut mem, 0, raw);
+        ctx.pc = 0;
+        step(&mut ctx, &mut mem);
+        assert_ne!(ctx.fpscr & fpscr::VXISI, 0, "fmsub ∞-∞ must set VXISI");
+    }
+
+    #[test]
+    fn fnmadd_nan_input_preserves_nan_sign() {
+        // PPCBUG-205 regression: ISA forbids negating a NaN result.
+        // a*c+b producing a NaN → result must be the NaN unchanged, not -NaN.
+        let mut ctx = PpcContext::new();
+        let mut mem = TestMem::new();
+        let qnan = f64::NAN;
+        ctx.fpr[1] = qnan;
+        ctx.fpr[2] = 1.0;
+        ctx.fpr[3] = 2.0;
+        // fnmadd f4, f1, f3, f2  (XO=31)
+        let raw = (63u32 << 26) | (4 << 21) | (1 << 16) | (2 << 11) | (3 << 6) | (31 << 1);
+        write_instr(&mut mem, 0, raw);
+        ctx.pc = 0;
+        step(&mut ctx, &mut mem);
+        // Result must be NaN with the same sign bit as the input NaN.
+        let r = ctx.fpr[4];
+        assert!(r.is_nan(), "result must be NaN");
+        assert_eq!(r.is_sign_negative(), qnan.is_sign_negative(),
+                   "fnmadd must preserve NaN sign (no negation on NaN)");
+    }
+
     #[test]
     fn fadd_inf_minus_inf_sets_vxisi() {
         let mut ctx = PpcContext::new();

From 49bf74fae6b426c6954c91fc3da815d4faa7bced Mon Sep 17 00:00:00 2001
From: MechaCat02 <fabian@diekaulbachs.de>
Date: Sat, 2 May 2026 12:22:47 +0200
Subject: [PATCH 3/7] fix(cpu): PPCBUG-223/224/225/229/230 FPU XX bit on
 inexact conversions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Phase 5 batch 3 (5c) — partial: targeted XX-on-inexact fixes for the
float-to-int and double-to-single conversion family. (PPCBUG-180/200,
the broader update_after_op XX/FR/FI rework, deferred to a focused
sub-batch.)

- PPCBUG-225 frspx: set XX when the f64→f32 round produces a different
  value (i.e. precision loss). Almost every frsp call is inexact —
  previously games polling FPSCR.XX never saw the set bit after a frsp.
- PPCBUG-224 fcfidx: set XX when the i64 input has > 53 significant
  bits (precision lost in conversion to f64).
- PPCBUG-229 fctidx/fctidzx: set XX when input is non-integer (fractional
  part discarded by the conversion).
- PPCBUG-230 fctiwx/fctiwzx: same shape for word-width conversions.
- PPCBUG-223 verified already correct in current code (fcmpo sets
  VXSNAN/VXVC on NaN operands; the audit-cited drift was already fixed).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 crates/xenia-cpu/src/interpreter.rs | 31 ++++++++++++++++++++++-------
 1 file changed, 24 insertions(+), 7 deletions(-)

diff --git a/crates/xenia-cpu/src/interpreter.rs b/crates/xenia-cpu/src/interpreter.rs
index 6395b25..8239266 100644
--- a/crates/xenia-cpu/src/interpreter.rs
+++ b/crates/xenia-cpu/src/interpreter.rs
@@ -2783,28 +2783,38 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
 
         // ===== FPU: Rounding/Conversion =====
         PpcOpcode::frspx => {
-            // Round to single precision honouring FPSCR[RN]
+            // Round to single precision honouring FPSCR[RN].
+            // PPCBUG-225: set XX on inexact rounding (almost every frsp call).
             let b = ctx.fpr[instr.rb()];
             if fpscr::is_snan(b) {
                 fpscr::set_exception(ctx, fpscr::VXSNAN);
             }
             let result = to_single(ctx, b);
+            if b.is_finite() && result.is_finite() && result != b {
+                fpscr::set_exception(ctx, fpscr::XX);
+            }
             ctx.fpr[instr.rd()] = result;
             fpscr::update_after_op(ctx, result, b.is_finite());
             if instr.rc_bit() { update_cr1_from_fpscr(ctx); }
             ctx.pc += 4;
         }
         PpcOpcode::fcfidx => {
-            // Convert from integer doubleword: frD = (double)(int64_t)frB_as_bits
+            // Convert from integer doubleword: frD = (double)(int64_t)frB_as_bits.
+            // PPCBUG-224: set XX when the i64 is not exactly representable as f64.
             let bits = ctx.fpr[instr.rb()].to_bits();
-            let result = (bits as i64) as f64;
+            let i = bits as i64;
+            let result = i as f64;
+            // Compare in i128: `result as i64` saturates, so i64::MAX (which rounds
+            // up to 2^63) would round-trip to itself and the inexact case be missed.
+            if (result as i128) != i128::from(i) { fpscr::set_exception(ctx, fpscr::XX); }
             ctx.fpr[instr.rd()] = result;
             fpscr::set_fprf(ctx, fpscr::classify_fprf(result));
             if instr.rc_bit() { update_cr1_from_fpscr(ctx); }
             ctx.pc += 4;
         }
         PpcOpcode::fctidx => {
-            // Convert to integer doubleword (round per FPSCR[RN])
+            // Convert to integer doubleword (round per FPSCR[RN]).
+            // PPCBUG-229: set XX on inexact (fractional input).
             let val = ctx.fpr[instr.rb()];
             let result = if val.is_nan() {
                 fpscr::set_exception(ctx, fpscr::VXCVI | if fpscr::is_snan(val) { fpscr::VXSNAN } else { 0 });
@@ -2816,6 +2826,7 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
                 fpscr::set_exception(ctx, fpscr::VXCVI);
                 0x8000_0000_0000_0000u64
             } else {
+                if val != val.trunc() { fpscr::set_exception(ctx, fpscr::XX); }
                 fpscr::round_to_i64(ctx, val) as u64
             };
             ctx.fpr[instr.rd()] = f64::from_bits(result);
@@ -2823,7 +2834,8 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
             ctx.pc += 4;
         }
         PpcOpcode::fctidzx => {
-            // Convert to integer doubleword (round toward zero)
+            // Convert to integer doubleword (round toward zero).
+            // PPCBUG-229: set XX on inexact.
             let val = ctx.fpr[instr.rb()];
             let result = if val.is_nan() {
                 fpscr::set_exception(ctx, fpscr::VXCVI | if fpscr::is_snan(val) { fpscr::VXSNAN } else { 0 });
@@ -2835,6 +2847,7 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
                 fpscr::set_exception(ctx, fpscr::VXCVI);
                 0x8000_0000_0000_0000u64
             } else {
+                if val != val.trunc() { fpscr::set_exception(ctx, fpscr::XX); }
                 (val.trunc() as i64) as u64
             };
             ctx.fpr[instr.rd()] = f64::from_bits(result);
@@ -2842,7 +2855,8 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
             ctx.pc += 4;
         }
         PpcOpcode::fctiwx => {
-            // Convert to integer word (round per FPSCR[RN])
+            // Convert to integer word (round per FPSCR[RN]).
+            // PPCBUG-230: set XX on inexact.
             let val = ctx.fpr[instr.rb()];
             let result_u32: u32 = if val.is_nan() {
                 fpscr::set_exception(ctx, fpscr::VXCVI | if fpscr::is_snan(val) { fpscr::VXSNAN } else { 0 });
@@ -2854,6 +2868,7 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
                 fpscr::set_exception(ctx, fpscr::VXCVI);
                 0x8000_0000
             } else {
+                if val != val.trunc() { fpscr::set_exception(ctx, fpscr::XX); }
                 fpscr::round_to_i32(ctx, val) as u32
             };
             ctx.fpr[instr.rd()] = f64::from_bits(result_u32 as u64);
@@ -2861,7 +2876,8 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
             ctx.pc += 4;
         }
         PpcOpcode::fctiwzx => {
-            // Convert to integer word (round toward zero)
+            // Convert to integer word (round toward zero).
+            // PPCBUG-230: set XX on inexact.
             let val = ctx.fpr[instr.rb()];
             let result_u32: u32 = if val.is_nan() {
                 fpscr::set_exception(ctx, fpscr::VXCVI | if fpscr::is_snan(val) { fpscr::VXSNAN } else { 0 });
@@ -2873,6 +2889,7 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
                 fpscr::set_exception(ctx, fpscr::VXCVI);
                 0x8000_0000
             } else {
+                if val != val.trunc() { fpscr::set_exception(ctx, fpscr::XX); }
                 val.trunc() as i32 as u32
             };
             ctx.fpr[instr.rd()] = f64::from_bits(result_u32 as u64);

From 538fa5ab74999d169836744556c25be615b36660 Mon Sep 17 00:00:00 2001
From: MechaCat02 <fabian@diekaulbachs.de>
Date: Sat, 2 May 2026 12:26:36 +0200
Subject: [PATCH 4/7] fix(cpu): PPCBUG-435/436/437 VSCR.NJ subnormal flush for
 VMX float
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Phase 5 batch 4 (5d) — partial: VSCR.NJ subnormal flush for VMX float
arithmetic. Xbox 360 always boots with NJ=1, so games expect inputs
and outputs flushed to ±0.

- PPCBUG-435 vaddfp/vaddfp128/vsubfp/vsubfp128/vmulfp128: previously
  no flush at all on these opcodes (only vmaddfp family flushed).
  Now flushes both inputs and output per Canary's unconditional model.
- PPCBUG-436 vmsum3fp128/vmsum4fp128: per-product intermediates now
  flushed individually (was only the final sum).
- PPCBUG-437 vmaddfp/vmaddfp128/vmaddcfp128/vnmsubfp/vnmsubfp128:
  outputs now flushed (inputs were already flushed).

PPCBUG-185 (FPSCR.NI flush for scalar FPU) deferred — requires adding
a NI bit constant and post-op flush wrapper across all *sx arms; will
land in a focused sub-batch.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 crates/xenia-cpu/src/interpreter.rs | 63 +++++++++++++++++++++++------
 1 file changed, 51 insertions(+), 12 deletions(-)

diff --git a/crates/xenia-cpu/src/interpreter.rs b/crates/xenia-cpu/src/interpreter.rs
index 8239266..e4d88cd 100644
--- a/crates/xenia-cpu/src/interpreter.rs
+++ b/crates/xenia-cpu/src/interpreter.rs
@@ -1940,34 +1940,56 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
 
         // ===== VMX: Float Arithmetic =====
         PpcOpcode::vaddfp => {
+            // PPCBUG-435: VSCR.NJ=1 (Xbox 360 always boots with this set) requires
+            // flush-to-zero on subnormal inputs and outputs. Canary VMX float
+            // arithmetic flushes denormals unconditionally.
             let a = ctx.vr[instr.ra()].as_f32x4();
             let b = ctx.vr[instr.rb()].as_f32x4();
             let mut r = [0f32; 4];
-            for i in 0..4 { r[i] = a[i] + b[i]; }
+            for i in 0..4 {
+                let ai = vmx::flush_denorm(a[i]);
+                let bi = vmx::flush_denorm(b[i]);
+                r[i] = vmx::flush_denorm(ai + bi);
+            }
             ctx.vr[instr.rd()] = xenia_types::Vec128::from_f32x4_array(r);
             ctx.pc += 4;
         }
         PpcOpcode::vaddfp128 => {
+            // PPCBUG-435: same as vaddfp.
             let a = ctx.vr[instr.va128()].as_f32x4();
             let b = ctx.vr[instr.vb128()].as_f32x4();
             let mut r = [0f32; 4];
-            for i in 0..4 { r[i] = a[i] + b[i]; }
+            for i in 0..4 {
+                let ai = vmx::flush_denorm(a[i]);
+                let bi = vmx::flush_denorm(b[i]);
+                r[i] = vmx::flush_denorm(ai + bi);
+            }
             ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r);
             ctx.pc += 4;
         }
         PpcOpcode::vsubfp => {
+            // PPCBUG-435.
             let a = ctx.vr[instr.ra()].as_f32x4();
             let b = ctx.vr[instr.rb()].as_f32x4();
             let mut r = [0f32; 4];
-            for i in 0..4 { r[i] = a[i] - b[i]; }
+            for i in 0..4 {
+                let ai = vmx::flush_denorm(a[i]);
+                let bi = vmx::flush_denorm(b[i]);
+                r[i] = vmx::flush_denorm(ai - bi);
+            }
             ctx.vr[instr.rd()] = xenia_types::Vec128::from_f32x4_array(r);
             ctx.pc += 4;
         }
         PpcOpcode::vsubfp128 => {
+            // PPCBUG-435.
             let a = ctx.vr[instr.va128()].as_f32x4();
             let b = ctx.vr[instr.vb128()].as_f32x4();
             let mut r = [0f32; 4];
-            for i in 0..4 { r[i] = a[i] - b[i]; }
+            for i in 0..4 {
+                let ai = vmx::flush_denorm(a[i]);
+                let bi = vmx::flush_denorm(b[i]);
+                r[i] = vmx::flush_denorm(ai - bi);
+            }
             ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r);
             ctx.pc += 4;
         }
@@ -1982,7 +2004,8 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
                 let ai = vmx::flush_denorm(a[i]);
                 let bi = vmx::flush_denorm(b[i]);
                 let ci = vmx::flush_denorm(c[i]);
-                r[i] = ai.mul_add(ci, bi);
+                // PPCBUG-437: flush subnormal output too.
+                r[i] = vmx::flush_denorm(ai.mul_add(ci, bi));
             }
             ctx.vr[instr.rd()] = xenia_types::Vec128::from_f32x4_array(r);
             ctx.pc += 4;
@@ -2000,7 +2023,8 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
                 let ai = vmx::flush_denorm(a[i]);
                 let bi = vmx::flush_denorm(b[i]);
                 let di = vmx::flush_denorm(d[i]);
-                r[i] = ai.mul_add(di, bi);
+                // PPCBUG-437.
+                r[i] = vmx::flush_denorm(ai.mul_add(di, bi));
             }
             ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r);
             ctx.pc += 4;
@@ -2015,7 +2039,7 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
                 let ai = vmx::flush_denorm(a[i]);
                 let bi = vmx::flush_denorm(b[i]);
                 let ci = vmx::flush_denorm(c[i]);
-                r[i] = bi - ai * ci;
+                r[i] = vmx::flush_denorm(bi - ai * ci);
             }
             ctx.vr[instr.rd()] = xenia_types::Vec128::from_f32x4_array(r);
             ctx.pc += 4;
@@ -2032,16 +2056,21 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
                 let ai = vmx::flush_denorm(a[i]);
                 let bi = vmx::flush_denorm(b[i]);
                 let di = vmx::flush_denorm(d[i]);
-                r[i] = di - ai * bi;
+                r[i] = vmx::flush_denorm(di - ai * bi);
             }
             ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r);
             ctx.pc += 4;
         }
         PpcOpcode::vmulfp128 => {
+            // PPCBUG-435 + PPCBUG-437.
             let a = ctx.vr[instr.va128()].as_f32x4();
             let b = ctx.vr[instr.vb128()].as_f32x4();
             let mut r = [0f32; 4];
-            for i in 0..4 { r[i] = a[i] * b[i]; }
+            for i in 0..4 {
+                let ai = vmx::flush_denorm(a[i]);
+                let bi = vmx::flush_denorm(b[i]);
+                r[i] = vmx::flush_denorm(ai * bi);
+            }
             ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r);
             ctx.pc += 4;
         }
@@ -4430,7 +4459,8 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
                 let ai = vmx::flush_denorm(a[i]);
                 let bi = vmx::flush_denorm(b[i]);
                 let di = vmx::flush_denorm(d[i]);
-                r[i] = ai.mul_add(di, bi);
+                // PPCBUG-437: flush subnormal output too.
+                r[i] = vmx::flush_denorm(ai.mul_add(di, bi));
             }
             ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r);
             ctx.pc += 4;
@@ -4439,16 +4469,25 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
         // Canary `InstrEmit_vmsum3fp128` flushes the *output* denormal
         // unconditionally (not the inputs) — see ppc_emit_altivec.cc:1067-1075.
         PpcOpcode::vmsum3fp128 => {
+            // PPCBUG-436: flush per-product intermediates (not just the final sum).
             let a = ctx.vr[instr.va128()].as_f32x4();
             let b = ctx.vr[instr.vb128()].as_f32x4();
-            let s = vmx::flush_denorm(a[0] * b[0] + a[1] * b[1] + a[2] * b[2]);
+            let p0 = vmx::flush_denorm(a[0] * b[0]);
+            let p1 = vmx::flush_denorm(a[1] * b[1]);
+            let p2 = vmx::flush_denorm(a[2] * b[2]);
+            let s = vmx::flush_denorm(p0 + p1 + p2);
             ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4(s, s, s, s);
             ctx.pc += 4;
         }
         PpcOpcode::vmsum4fp128 => {
+            // PPCBUG-436.
             let a = ctx.vr[instr.va128()].as_f32x4();
             let b = ctx.vr[instr.vb128()].as_f32x4();
-            let s = vmx::flush_denorm(a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3]);
+            let p0 = vmx::flush_denorm(a[0] * b[0]);
+            let p1 = vmx::flush_denorm(a[1] * b[1]);
+            let p2 = vmx::flush_denorm(a[2] * b[2]);
+            let p3 = vmx::flush_denorm(a[3] * b[3]);
+            let s = vmx::flush_denorm(p0 + p1 + p2 + p3);
             ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4(s, s, s, s);
             ctx.pc += 4;
         }

From 6ba8f83c30c46a4b7fb27b1556e744546a2e83d6 Mon Sep 17 00:00:00 2001
From: MechaCat02 <fabian@diekaulbachs.de>
Date: Sat, 2 May 2026 12:29:07 +0200
Subject: [PATCH 5/7] fix(cpu): PPCBUG-184 fresx pre-quantize input to f32
 (canary parity)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Phase 5 batch 5 (5e): minimal viable fix for the estimate-precision
family. Hardware Xenon `fres` produces a ~12-bit LUT estimate; xenia
and canary both produce a full-precision IEEE single reciprocal
instead. Canary, however, first quantizes the f64 input to f32 so that
at least the input precision matches. `fresx` now does the same,
matching canary.

PPCBUG-428..431 (vrefp/vrsqrtefp/vexptefp/vlogefp) need no change here:
they already operate on f32 inputs, so no f64 → f32 quantization step
is required, and their estimate-precision deviation is purely on the
output side. Newton-Raphson convergence is unaffected. They are
documented in audit-findings.md as LOW-impact; a full fix requires a
LUT.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 crates/xenia-cpu/src/interpreter.rs | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/crates/xenia-cpu/src/interpreter.rs b/crates/xenia-cpu/src/interpreter.rs
index e4d88cd..051aa4f 100644
--- a/crates/xenia-cpu/src/interpreter.rs
+++ b/crates/xenia-cpu/src/interpreter.rs
@@ -2777,12 +2777,18 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
             ctx.pc += 4;
         }
         PpcOpcode::fresx => {
-            // Single-precision reciprocal estimate: frD = 1.0 / frB
-            let b = ctx.fpr[instr.rb()];
+            // Single-precision reciprocal estimate: frD = 1.0 / frB.
+            // PPCBUG-184: pre-quantize input to f32 to match canary's
+            // `f.Recip(f.Convert(frB, FLOAT32_TYPE))` behavior. Hardware
+            // produces a ~12-bit LUT estimate; both emulators produce a
+            // fully-IEEE single reciprocal, but the f32 quantization at
+            // least makes the input precision match.
+            let b_full = ctx.fpr[instr.rb()];
+            let b = b_full as f32 as f64;
             if b == 0.0 {
                 fpscr::set_exception(ctx, fpscr::ZX);
             }
-            if fpscr::is_snan(b) {
+            if fpscr::is_snan(b_full) {
                 fpscr::set_exception(ctx, fpscr::VXSNAN);
             }
             let result = to_single(ctx, 1.0 / b);

From 6fe2cbf2518a9ff368b5fac66c9b220a1fb11400 Mon Sep 17 00:00:00 2001
From: MechaCat02 <fabian@diekaulbachs.de>
Date: Sat, 2 May 2026 12:31:10 +0200
Subject: [PATCH 6/7] fix(cpu): PPCBUG-426/427/433 single-FMA vnmsubfp + vctsxs
 NaN saturation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Phase 5 batch 6 (5f): saturation and FMA-rounding fixes.

- PPCBUG-426 vnmsubfp: was `bi - ai * ci` (two rounding steps); now
  `-ai.mul_add(ci, -bi)`, i.e. -(ai*ci - bi), which is mathematically
  equal to bi - ai*ci but rounds only once (single FMA), as the ISA
  requires.
- PPCBUG-427 vnmsubfp128: same single-FMA fix.
- PPCBUG-433 vctsxs / vcfpsxws128 NaN saturation: AltiVec ISA saturates
  NaN to INT_MIN (0x80000000); xenia returned 0. The vctuxs (unsigned)
  NaN→0 is correct per ISA.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 crates/xenia-cpu/src/interpreter.rs | 6 ++++--
 crates/xenia-cpu/src/vmx.rs         | 4 +++-
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/crates/xenia-cpu/src/interpreter.rs b/crates/xenia-cpu/src/interpreter.rs
index 051aa4f..082e029 100644
--- a/crates/xenia-cpu/src/interpreter.rs
+++ b/crates/xenia-cpu/src/interpreter.rs
@@ -2039,7 +2039,8 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
                 let ai = vmx::flush_denorm(a[i]);
                 let bi = vmx::flush_denorm(b[i]);
                 let ci = vmx::flush_denorm(c[i]);
-                r[i] = vmx::flush_denorm(bi - ai * ci);
+                // PPCBUG-426: single FMA rounding instead of two-step (b - a*c).
+                r[i] = vmx::flush_denorm(-ai.mul_add(ci, -bi));
             }
             ctx.vr[instr.rd()] = xenia_types::Vec128::from_f32x4_array(r);
             ctx.pc += 4;
@@ -2056,7 +2057,8 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
                 let ai = vmx::flush_denorm(a[i]);
                 let bi = vmx::flush_denorm(b[i]);
                 let di = vmx::flush_denorm(d[i]);
-                r[i] = vmx::flush_denorm(di - ai * bi);
+                // PPCBUG-427: single FMA rounding.
+                r[i] = vmx::flush_denorm(-ai.mul_add(bi, -di));
             }
             ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r);
             ctx.pc += 4;
diff --git a/crates/xenia-cpu/src/vmx.rs b/crates/xenia-cpu/src/vmx.rs
index 5650a34..4be636f 100644
--- a/crates/xenia-cpu/src/vmx.rs
+++ b/crates/xenia-cpu/src/vmx.rs
@@ -214,7 +214,9 @@ pub fn flush_denorm(x: f32) -> f32 {
 //
 // vctsxs / vctuxs flush denormal inputs to 0 before scaling, per Altivec.
 #[inline] pub fn cvt_f32_to_i32_sat(x: f32, scale_bits: u32) -> (i32, bool) {
-    if x.is_nan() { return (0, true); }
+    // PPCBUG-433: AltiVec ISA saturates NaN to INT_MIN (0x80000000), not 0.
+    // (vctuxs's NaN→0 is correct per AltiVec ISA — see PPCBUG-434.)
+    if x.is_nan() { return (i32::MIN, true); }
     let x = flush_denorm(x);
     let scaled = (x as f64) * ((1u64 << scale_bits) as f64);
     if scaled >= i32::MAX as f64 { return (i32::MAX, true); }

From 05f2f72c718941ff0f64315f1ba980329661b1f3 Mon Sep 17 00:00:00 2001
From: MechaCat02 <fabian@diekaulbachs.de>
Date: Sat, 2 May 2026 12:37:54 +0200
Subject: [PATCH 7/7] refactor(cpu): vrfin uses stdlib f32::round_ties_even()
 per reviewer nit

P5 review feedback (non-blocking): replace the inline round-to-even
implementation with the stable stdlib method `f32::round_ties_even()`
(Rust 1.77+). Functionally equivalent; cleaner.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 crates/xenia-cpu/src/interpreter.rs | 18 +++---------------
 1 file changed, 3 insertions(+), 15 deletions(-)

diff --git a/crates/xenia-cpu/src/interpreter.rs b/crates/xenia-cpu/src/interpreter.rs
index 082e029..31d54f5 100644
--- a/crates/xenia-cpu/src/interpreter.rs
+++ b/crates/xenia-cpu/src/interpreter.rs
@@ -2429,25 +2429,13 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
             ctx.pc += 4;
         }
         PpcOpcode::vrfin | PpcOpcode::vrfin128 => {
-            // PPCBUG-432: ISA round-to-nearest-even, NOT Rust's round-half-away-from-zero.
+            // PPCBUG-432: ISA round-to-nearest-even, NOT Rust's `round()`
+            // (which is round-half-away-from-zero).
             let vb = if matches!(instr.opcode, PpcOpcode::vrfin128) { instr.vb128() } else { instr.rb() };
             let vd = if matches!(instr.opcode, PpcOpcode::vrfin128) { instr.vd128() } else { instr.rd() };
             let b = ctx.vr[vb].as_f32x4();
             let mut r = [0f32; 4];
-            for i in 0..4 {
-                let x = b[i];
-                let t = x.trunc();
-                let frac = (x - t).abs();
-                r[i] = if frac > 0.5 {
-                    t + if x >= 0.0 { 1.0 } else { -1.0 }
-                } else if frac < 0.5 {
-                    t
-                } else {
-                    // Tie — round to even.
-                    let ti = t as i64;
-                    if ti & 1 == 0 { t } else { t + if x >= 0.0 { 1.0 } else { -1.0 } }
-                };
-            }
+            for i in 0..4 { r[i] = b[i].round_ties_even(); }
             ctx.vr[vd] = xenia_types::Vec128::from_f32x4_array(r);
             ctx.pc += 4;
         }