Merge branch 'ppc-audit-fix/p5-fpu' — Phase 5 FPU correctness

Phase 5 of the PPC instruction audit fix application: FPU correctness across the scalar FPU and VMX float arithmetic. ~22 PPCBUGs across 6 sub-sections (5a-5f). - f6a444b: 5a — round_to_i64 + vrfin round-to-even (PPCBUG-221+227, 432) - 26b9897: 5b — FMA VXISI + NaN sign preservation (PPCBUG-181/182/183/202/203/205) - 49bf74f: 5c — FPU XX-on-inexact for conversions (PPCBUG-223/224/225/229/230) - 538fa5a: 5d — VSCR.NJ subnormal flush for VMX float (PPCBUG-435/436/437) - 6ba8f83: 5e — fresx canary parity (PPCBUG-184) - 6fe2cbf: 5f — single-FMA vnmsubfp + vctsxs NaN saturation (PPCBUG-426/427/433) - 05f2f72: review-fix nit — vrfin uses stdlib round_ties_even Independent reviewer found no blocking issues; two minor follow-up items remain open for tracking. The vrfin nit was applied immediately in 05f2f72. Three substantive PPCBUGs were explicitly deferred — each requires substantial helper rework that's better landed as focused sub-batches: - PPCBUG-201: FPSCR.RN for double arithmetic (MXCSR set/restore wrappers) - PPCBUG-185: FPSCR.NI flush for scalar FPU (NI bit constant + post-op flush) - PPCBUG-180/200: XX/FR/FI in update_after_op (pre-vs-post-round comparison) These remain Status: open in audit-findings.md and will be picked up in a P5b sub-batch or P9 (test gaps) per planning. Verification at merge: cargo test --workspace --release reports 498 passed, 0 failed. Acid test deferred to end of all phases per user direction.
2026-05-02 12:38:18 +02:00
parent 5c45108249 05f2f72c71
commit d39d0bab4d
3 changed files with 224 additions and 37 deletions
--- a/crates/xenia-cpu/src/fpscr.rs
+++ b/crates/xenia-cpu/src/fpscr.rs
@@ -152,6 +152,33 @@ pub fn check_invalid_add(ctx: &mut PpcContext, a: f64, b: f64, sub: bool) -> boo
    false
 }
 /// FMA-aware add/sub VXISI check. Per PPCBUG-202+203: the previous code
 /// passed `a*c` as `lhs` to `check_invalid_add`, which suffers from two
 /// rounding errors and can spuriously raise/miss VXISI in extreme cases.
 /// This helper derives the mathematical product's sign and infinity status
 /// from the inputs directly.
 ///
 /// `sub` follows the same semantics as `check_invalid_add`:
 ///   - false (add): VXISI when product and b have opposite signs at infinity
 ///   - true  (sub): VXISI when product and b have same sign at infinity
 pub fn check_invalid_fma_add(ctx: &mut PpcContext, a: f64, c: f64, b: f64, sub: bool) -> bool {
    let mut bits = 0u32;
    if is_snan(a) || is_snan(c) || is_snan(b) { bits |= VXSNAN; }
    let product_is_inf = (a.is_infinite() || c.is_infinite())
        && a != 0.0 && c != 0.0
        && !a.is_nan() && !c.is_nan();
    if product_is_inf && b.is_infinite() {
        let p_neg = a.is_sign_negative() != c.is_sign_negative();
        let b_neg = b.is_sign_negative();
        let same_sign = p_neg == b_neg;
        if (sub && same_sign) || (!sub && !same_sign) {
            bits |= VXISI;
        }
    }
    if bits != 0 { set_exception(ctx, bits); return true; }
    false
 }
 pub fn check_invalid_mul(ctx: &mut PpcContext, a: f64, b: f64) -> bool {
    let mut bits = 0u32;
    if is_snan(a) || is_snan(b) { bits |= VXSNAN; }
@@ -220,15 +247,22 @@ pub fn round_to_single(ctx: &PpcContext, v: f64) -> f64 {
 pub fn round_to_i64(ctx: &PpcContext, v: f64) -> i64 {
    match rounding_mode(ctx) {
        RoundingMode::NearestEven => {
-            // Round-half-to-even (banker's rounding).
+            // PPCBUG-221: round-half-to-even (banker's rounding). The previous
-            let r = v.round();
+            // tie-detection used `(diff - 0.5).abs() < f64::EPSILON` which
-            // Rust's f64::round is round-half-away-from-zero. Correct ties to even:
+            // breaks for |v| > 2^52 (where v.trunc() == v exactly, giving diff
-            let diff = (v - v.trunc()).abs();
+            // == 0). Use a fractional-part-only check that's exact for
-            if (diff - 0.5).abs() < f64::EPSILON {
+            // |v| <= 2^52 and degenerates correctly above.
-                let floor = v.floor();
+            let t = v.trunc();
-                if (floor as i64) & 1 == 0 { floor as i64 } else { v.ceil() as i64 }
+            let frac = v - t;
            let fa = frac.abs();
            if fa > 0.5 {
                t as i64 + if v >= 0.0 { 1 } else { -1 }
            } else if fa < 0.5 {
                t as i64
            } else {
-                r as i64
+                // Exact 0.5 tie — round to even.
                let fi = t as i64;
                if fi & 1 == 0 { fi } else { fi + if v >= 0.0 { 1 } else { -1 } }
            }
        }
        RoundingMode::TowardZero => v.trunc() as i64,
@@ -355,11 +389,35 @@ mod tests {
    #[test]
    fn round_to_i64_nearest_even_on_tie() {
        let c = ctx();
        assert_eq!(round_to_i64(&c, 0.5_f64), 0);
        assert_eq!(round_to_i64(&c, 1.5_f64), 2);
        assert_eq!(round_to_i64(&c, 2.5_f64), 2);
        assert_eq!(round_to_i64(&c, 3.5_f64), 4);
        assert_eq!(round_to_i64(&c, -0.5_f64), 0);
        assert_eq!(round_to_i64(&c, -1.5_f64), -2);
        assert_eq!(round_to_i64(&c, -2.5_f64), -2);
    }
    #[test]
    fn round_to_i64_non_tie_cases() {
        // PPCBUG-221 regression: non-tie fractions must round to nearest.
        let c = ctx();
        assert_eq!(round_to_i64(&c, 0.4_f64), 0);
        assert_eq!(round_to_i64(&c, 0.6_f64), 1);
        assert_eq!(round_to_i64(&c, -0.4_f64), 0);
        assert_eq!(round_to_i64(&c, -0.6_f64), -1);
    }
    #[test]
    fn round_to_i32_nearest_even_on_tie() {
        // PPCBUG-227: round_to_i32 inherits round_to_i64's tie semantics.
        let c = ctx();
        assert_eq!(round_to_i32(&c, 0.5_f64), 0);
        assert_eq!(round_to_i32(&c, 1.5_f64), 2);
        assert_eq!(round_to_i32(&c, 2.5_f64), 2);
        assert_eq!(round_to_i32(&c, -1.5_f64), -2);
    }
    #[test]
    fn check_invalid_add_detects_inf_minus_inf() {
        let mut c = ctx();
--- a/crates/xenia-cpu/src/interpreter.rs
+++ b/crates/xenia-cpu/src/interpreter.rs
@@ -1940,34 +1940,56 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
        // ===== VMX: Float Arithmetic =====
        PpcOpcode::vaddfp => {
            // PPCBUG-435: VSCR.NJ=1 (Xbox 360 always boots with this set) requires
            // flush-to-zero on subnormal inputs and outputs. Canary VMX float
            // arithmetic flushes denormals unconditionally.
            let a = ctx.vr[instr.ra()].as_f32x4();
            let b = ctx.vr[instr.rb()].as_f32x4();
            let mut r = [0f32; 4];
-            for i in 0..4 { r[i] = a[i] + b[i]; }
+            for i in 0..4 {
                let ai = vmx::flush_denorm(a[i]);
                let bi = vmx::flush_denorm(b[i]);
                r[i] = vmx::flush_denorm(ai + bi);
            }
            ctx.vr[instr.rd()] = xenia_types::Vec128::from_f32x4_array(r);
            ctx.pc += 4;
        }
        PpcOpcode::vaddfp128 => {
            // PPCBUG-435: same as vaddfp.
            let a = ctx.vr[instr.va128()].as_f32x4();
            let b = ctx.vr[instr.vb128()].as_f32x4();
            let mut r = [0f32; 4];
-            for i in 0..4 { r[i] = a[i] + b[i]; }
+            for i in 0..4 {
                let ai = vmx::flush_denorm(a[i]);
                let bi = vmx::flush_denorm(b[i]);
                r[i] = vmx::flush_denorm(ai + bi);
            }
            ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r);
            ctx.pc += 4;
        }
        PpcOpcode::vsubfp => {
            // PPCBUG-435.
            let a = ctx.vr[instr.ra()].as_f32x4();
            let b = ctx.vr[instr.rb()].as_f32x4();
            let mut r = [0f32; 4];
-            for i in 0..4 { r[i] = a[i] - b[i]; }
+            for i in 0..4 {
                let ai = vmx::flush_denorm(a[i]);
                let bi = vmx::flush_denorm(b[i]);
                r[i] = vmx::flush_denorm(ai - bi);
            }
            ctx.vr[instr.rd()] = xenia_types::Vec128::from_f32x4_array(r);
            ctx.pc += 4;
        }
        PpcOpcode::vsubfp128 => {
            // PPCBUG-435.
            let a = ctx.vr[instr.va128()].as_f32x4();
            let b = ctx.vr[instr.vb128()].as_f32x4();
            let mut r = [0f32; 4];
-            for i in 0..4 { r[i] = a[i] - b[i]; }
+            for i in 0..4 {
                let ai = vmx::flush_denorm(a[i]);
                let bi = vmx::flush_denorm(b[i]);
                r[i] = vmx::flush_denorm(ai - bi);
            }
            ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r);
            ctx.pc += 4;
        }
@@ -1982,7 +2004,8 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
                let ai = vmx::flush_denorm(a[i]);
                let bi = vmx::flush_denorm(b[i]);
                let ci = vmx::flush_denorm(c[i]);
-                r[i] = ai.mul_add(ci, bi);
+                // PPCBUG-437: flush subnormal output too.
                r[i] = vmx::flush_denorm(ai.mul_add(ci, bi));
            }
            ctx.vr[instr.rd()] = xenia_types::Vec128::from_f32x4_array(r);
            ctx.pc += 4;
@@ -2000,7 +2023,8 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
                let ai = vmx::flush_denorm(a[i]);
                let bi = vmx::flush_denorm(b[i]);
                let di = vmx::flush_denorm(d[i]);
-                r[i] = ai.mul_add(di, bi);
+                // PPCBUG-437.
                r[i] = vmx::flush_denorm(ai.mul_add(di, bi));
            }
            ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r);
            ctx.pc += 4;
@@ -2015,7 +2039,8 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
                let ai = vmx::flush_denorm(a[i]);
                let bi = vmx::flush_denorm(b[i]);
                let ci = vmx::flush_denorm(c[i]);
-                r[i] = bi - ai * ci;
+                // PPCBUG-426: single FMA rounding instead of two-step (b - a*c).
                r[i] = vmx::flush_denorm(-ai.mul_add(ci, -bi));
            }
            ctx.vr[instr.rd()] = xenia_types::Vec128::from_f32x4_array(r);
            ctx.pc += 4;
@@ -2032,16 +2057,22 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
                let ai = vmx::flush_denorm(a[i]);
                let bi = vmx::flush_denorm(b[i]);
                let di = vmx::flush_denorm(d[i]);
-                r[i] = di - ai * bi;
+                // PPCBUG-427: single FMA rounding.
                r[i] = vmx::flush_denorm(-ai.mul_add(bi, -di));
            }
            ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r);
            ctx.pc += 4;
        }
        PpcOpcode::vmulfp128 => {
            // PPCBUG-435 + PPCBUG-437.
            let a = ctx.vr[instr.va128()].as_f32x4();
            let b = ctx.vr[instr.vb128()].as_f32x4();
            let mut r = [0f32; 4];
-            for i in 0..4 { r[i] = a[i] * b[i]; }
+            for i in 0..4 {
                let ai = vmx::flush_denorm(a[i]);
                let bi = vmx::flush_denorm(b[i]);
                r[i] = vmx::flush_denorm(ai * bi);
            }
            ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r);
            ctx.pc += 4;
        }
@@ -2398,11 +2429,13 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
            ctx.pc += 4;
        }
        PpcOpcode::vrfin | PpcOpcode::vrfin128 => {
            // PPCBUG-432: ISA round-to-nearest-even, NOT Rust's `round()`
            // (which is round-half-away-from-zero).
            let vb = if matches!(instr.opcode, PpcOpcode::vrfin128) { instr.vb128() } else { instr.rb() };
            let vd = if matches!(instr.opcode, PpcOpcode::vrfin128) { instr.vd128() } else { instr.rd() };
            let b = ctx.vr[vb].as_f32x4();
            let mut r = [0f32; 4];
-            for i in 0..4 { r[i] = b[i].round(); }
+            for i in 0..4 { r[i] = b[i].round_ties_even(); }
            ctx.vr[vd] = xenia_types::Vec128::from_f32x4_array(r);
            ctx.pc += 4;
        }
@@ -2559,11 +2592,12 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
        // ===== FPU: Multiply-Add =====
        PpcOpcode::fmaddx => {
            // PPCBUG-202: VXISI from input properties (not from `a*c` which has wrong sign on overflow).
            let a = ctx.fpr[instr.ra()];
            let c = ctx.fpr[instr.rc()];
            let b = ctx.fpr[instr.rb()];
            fpscr::check_invalid_mul(ctx, a, c);
-            fpscr::check_invalid_add(ctx, a * c, b, false);
+            fpscr::check_invalid_fma_add(ctx, a, c, b, false);
            let result = a.mul_add(c, b);
            ctx.fpr[instr.rd()] = result;
            fpscr::update_after_op(ctx, result, a.is_finite() && b.is_finite() && c.is_finite());
@@ -2571,10 +2605,12 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
            ctx.pc += 4;
        }
        PpcOpcode::fmaddsx => {
            // PPCBUG-181: missing VXISI on add step.
            let a = ctx.fpr[instr.ra()];
            let c = ctx.fpr[instr.rc()];
            let b = ctx.fpr[instr.rb()];
            fpscr::check_invalid_mul(ctx, a, c);
            fpscr::check_invalid_fma_add(ctx, a, c, b, false);
            let result = to_single(ctx, a.mul_add(c, b));
            ctx.fpr[instr.rd()] = result;
            fpscr::update_after_op(ctx, result, a.is_finite() && b.is_finite() && c.is_finite());
@@ -2582,10 +2618,12 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
            ctx.pc += 4;
        }
        PpcOpcode::fmsubx => {
            // PPCBUG-203: missing VXISI on sub step.
            let a = ctx.fpr[instr.ra()];
            let c = ctx.fpr[instr.rc()];
            let b = ctx.fpr[instr.rb()];
            fpscr::check_invalid_mul(ctx, a, c);
            fpscr::check_invalid_fma_add(ctx, a, c, b, true);
            let result = a.mul_add(c, -b);
            ctx.fpr[instr.rd()] = result;
            fpscr::update_after_op(ctx, result, a.is_finite() && b.is_finite() && c.is_finite());
@@ -2593,10 +2631,12 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
            ctx.pc += 4;
        }
        PpcOpcode::fmsubsx => {
            // PPCBUG-182: missing VXISI on sub step.
            let a = ctx.fpr[instr.ra()];
            let c = ctx.fpr[instr.rc()];
            let b = ctx.fpr[instr.rb()];
            fpscr::check_invalid_mul(ctx, a, c);
            fpscr::check_invalid_fma_add(ctx, a, c, b, true);
            let result = to_single(ctx, a.mul_add(c, -b));
            ctx.fpr[instr.rd()] = result;
            fpscr::update_after_op(ctx, result, a.is_finite() && b.is_finite() && c.is_finite());
@@ -2604,44 +2644,58 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
            ctx.pc += 4;
        }
        PpcOpcode::fnmaddx => {
            // PPCBUG-203: missing VXISI. PPCBUG-205: NaN sign preserved (no negation on NaN).
            let a = ctx.fpr[instr.ra()];
            let c = ctx.fpr[instr.rc()];
            let b = ctx.fpr[instr.rb()];
            fpscr::check_invalid_mul(ctx, a, c);
-            let result = -(a.mul_add(c, b));
+            fpscr::check_invalid_fma_add(ctx, a, c, b, false);
            let fma = a.mul_add(c, b);
            let result = if fma.is_nan() { fma } else { -fma };
            ctx.fpr[instr.rd()] = result;
            fpscr::update_after_op(ctx, result, a.is_finite() && b.is_finite() && c.is_finite());
            if instr.rc_bit() { update_cr1_from_fpscr(ctx); }
            ctx.pc += 4;
        }
        PpcOpcode::fnmaddsx => {
            // PPCBUG-181 + PPCBUG-183: VXISI + NaN sign preservation.
            let a = ctx.fpr[instr.ra()];
            let c = ctx.fpr[instr.rc()];
            let b = ctx.fpr[instr.rb()];
            fpscr::check_invalid_mul(ctx, a, c);
-            let result = to_single(ctx, -(a.mul_add(c, b)));
+            fpscr::check_invalid_fma_add(ctx, a, c, b, false);
            let fma = a.mul_add(c, b);
            let neg = if fma.is_nan() { fma } else { -fma };
            let result = to_single(ctx, neg);
            ctx.fpr[instr.rd()] = result;
            fpscr::update_after_op(ctx, result, a.is_finite() && b.is_finite() && c.is_finite());
            if instr.rc_bit() { update_cr1_from_fpscr(ctx); }
            ctx.pc += 4;
        }
        PpcOpcode::fnmsubx => {
            // PPCBUG-203: VXISI. PPCBUG-205: NaN sign preservation.
            let a = ctx.fpr[instr.ra()];
            let c = ctx.fpr[instr.rc()];
            let b = ctx.fpr[instr.rb()];
            fpscr::check_invalid_mul(ctx, a, c);
-            let result = -(a.mul_add(c, -b));
+            fpscr::check_invalid_fma_add(ctx, a, c, b, true);
            let fma = a.mul_add(c, -b);
            let result = if fma.is_nan() { fma } else { -fma };
            ctx.fpr[instr.rd()] = result;
            fpscr::update_after_op(ctx, result, a.is_finite() && b.is_finite() && c.is_finite());
            if instr.rc_bit() { update_cr1_from_fpscr(ctx); }
            ctx.pc += 4;
        }
        PpcOpcode::fnmsubsx => {
            // PPCBUG-182 + PPCBUG-183: VXISI + NaN sign preservation.
            let a = ctx.fpr[instr.ra()];
            let c = ctx.fpr[instr.rc()];
            let b = ctx.fpr[instr.rb()];
            fpscr::check_invalid_mul(ctx, a, c);
-            let result = to_single(ctx, -(a.mul_add(c, -b)));
+            fpscr::check_invalid_fma_add(ctx, a, c, b, true);
            let fma = a.mul_add(c, -b);
            let neg = if fma.is_nan() { fma } else { -fma };
            let result = to_single(ctx, neg);
            ctx.fpr[instr.rd()] = result;
            fpscr::update_after_op(ctx, result, a.is_finite() && b.is_finite() && c.is_finite());
            if instr.rc_bit() { update_cr1_from_fpscr(ctx); }
@@ -2713,12 +2767,18 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
            ctx.pc += 4;
        }
        PpcOpcode::fresx => {
-            // Single-precision reciprocal estimate: frD = 1.0 / frB
+            // Single-precision reciprocal estimate: frD = 1.0 / frB.
-            let b = ctx.fpr[instr.rb()];
+            // PPCBUG-184: pre-quantize input to f32 to match canary's
            // `f.Recip(f.Convert(frB, FLOAT32_TYPE))` behavior. Hardware
            // produces a ~12-bit LUT estimate; both emulators produce a
            // fully-IEEE single reciprocal, but the f32 quantization at
            // least makes the input precision match.
            let b_full = ctx.fpr[instr.rb()];
            let b = b_full as f32 as f64;
            if b == 0.0 {
                fpscr::set_exception(ctx, fpscr::ZX);
            }
-            if fpscr::is_snan(b) {
+            if fpscr::is_snan(b_full) {
                fpscr::set_exception(ctx, fpscr::VXSNAN);
            }
            let result = to_single(ctx, 1.0 / b);
@@ -2748,28 +2808,38 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
        // ===== FPU: Rounding/Conversion =====
        PpcOpcode::frspx => {
-            // Round to single precision honouring FPSCR[RN]
+            // Round to single precision honouring FPSCR[RN].
            // PPCBUG-225: set XX on inexact rounding (almost every frsp call).
            let b = ctx.fpr[instr.rb()];
            if fpscr::is_snan(b) {
                fpscr::set_exception(ctx, fpscr::VXSNAN);
            }
            let result = to_single(ctx, b);
            if b.is_finite() && result.is_finite() && result != b {
                fpscr::set_exception(ctx, fpscr::XX);
            }
            ctx.fpr[instr.rd()] = result;
            fpscr::update_after_op(ctx, result, b.is_finite());
            if instr.rc_bit() { update_cr1_from_fpscr(ctx); }
            ctx.pc += 4;
        }
        PpcOpcode::fcfidx => {
-            // Convert from integer doubleword: frD = (double)(int64_t)frB_as_bits
+            // Convert from integer doubleword: frD = (double)(int64_t)frB_as_bits.
            // PPCBUG-224: set XX when |i64| > 2^53 (precision loss in conversion).
            let bits = ctx.fpr[instr.rb()].to_bits();
-            let result = (bits as i64) as f64;
+            let i = bits as i64;
            let result = i as f64;
            if (result as i64) != i {
                fpscr::set_exception(ctx, fpscr::XX);
            }
            ctx.fpr[instr.rd()] = result;
            fpscr::set_fprf(ctx, fpscr::classify_fprf(result));
            if instr.rc_bit() { update_cr1_from_fpscr(ctx); }
            ctx.pc += 4;
        }
        PpcOpcode::fctidx => {
-            // Convert to integer doubleword (round per FPSCR[RN])
+            // Convert to integer doubleword (round per FPSCR[RN]).
            // PPCBUG-229: set XX on inexact (fractional input).
            let val = ctx.fpr[instr.rb()];
            let result = if val.is_nan() {
                fpscr::set_exception(ctx, fpscr::VXCVI | if fpscr::is_snan(val) { fpscr::VXSNAN } else { 0 });
@@ -2781,6 +2851,7 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
                fpscr::set_exception(ctx, fpscr::VXCVI);
                0x8000_0000_0000_0000u64
            } else {
                if val != val.trunc() { fpscr::set_exception(ctx, fpscr::XX); }
                fpscr::round_to_i64(ctx, val) as u64
            };
            ctx.fpr[instr.rd()] = f64::from_bits(result);
@@ -2788,7 +2859,8 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
            ctx.pc += 4;
        }
        PpcOpcode::fctidzx => {
-            // Convert to integer doubleword (round toward zero)
+            // Convert to integer doubleword (round toward zero).
            // PPCBUG-229: set XX on inexact.
            let val = ctx.fpr[instr.rb()];
            let result = if val.is_nan() {
                fpscr::set_exception(ctx, fpscr::VXCVI | if fpscr::is_snan(val) { fpscr::VXSNAN } else { 0 });
@@ -2800,6 +2872,7 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
                fpscr::set_exception(ctx, fpscr::VXCVI);
                0x8000_0000_0000_0000u64
            } else {
                if val != val.trunc() { fpscr::set_exception(ctx, fpscr::XX); }
                (val.trunc() as i64) as u64
            };
            ctx.fpr[instr.rd()] = f64::from_bits(result);
@@ -2807,7 +2880,8 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
            ctx.pc += 4;
        }
        PpcOpcode::fctiwx => {
-            // Convert to integer word (round per FPSCR[RN])
+            // Convert to integer word (round per FPSCR[RN]).
            // PPCBUG-230: set XX on inexact.
            let val = ctx.fpr[instr.rb()];
            let result_u32: u32 = if val.is_nan() {
                fpscr::set_exception(ctx, fpscr::VXCVI | if fpscr::is_snan(val) { fpscr::VXSNAN } else { 0 });
@@ -2819,6 +2893,7 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
                fpscr::set_exception(ctx, fpscr::VXCVI);
                0x8000_0000
            } else {
                if val != val.trunc() { fpscr::set_exception(ctx, fpscr::XX); }
                fpscr::round_to_i32(ctx, val) as u32
            };
            ctx.fpr[instr.rd()] = f64::from_bits(result_u32 as u64);
@@ -2826,7 +2901,8 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
            ctx.pc += 4;
        }
        PpcOpcode::fctiwzx => {
-            // Convert to integer word (round toward zero)
+            // Convert to integer word (round toward zero).
            // PPCBUG-230: set XX on inexact.
            let val = ctx.fpr[instr.rb()];
            let result_u32: u32 = if val.is_nan() {
                fpscr::set_exception(ctx, fpscr::VXCVI | if fpscr::is_snan(val) { fpscr::VXSNAN } else { 0 });
@@ -2838,6 +2914,7 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
                fpscr::set_exception(ctx, fpscr::VXCVI);
                0x8000_0000
            } else {
                if val != val.trunc() { fpscr::set_exception(ctx, fpscr::XX); }
                val.trunc() as i32 as u32
            };
            ctx.fpr[instr.rd()] = f64::from_bits(result_u32 as u64);
@@ -4378,7 +4455,8 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
                let ai = vmx::flush_denorm(a[i]);
                let bi = vmx::flush_denorm(b[i]);
                let di = vmx::flush_denorm(d[i]);
-                r[i] = ai.mul_add(di, bi);
+                // PPCBUG-437: flush subnormal output too.
                r[i] = vmx::flush_denorm(ai.mul_add(di, bi));
            }
            ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r);
            ctx.pc += 4;
@@ -4387,16 +4465,25 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
        // Canary `InstrEmit_vmsum3fp128` flushes the *output* denormal
        // unconditionally (not the inputs) — see ppc_emit_altivec.cc:1067-1075.
        PpcOpcode::vmsum3fp128 => {
            // PPCBUG-436: flush per-product intermediates (not just the final sum).
            let a = ctx.vr[instr.va128()].as_f32x4();
            let b = ctx.vr[instr.vb128()].as_f32x4();
-            let s = vmx::flush_denorm(a[0] * b[0] + a[1] * b[1] + a[2] * b[2]);
+            let p0 = vmx::flush_denorm(a[0] * b[0]);
            let p1 = vmx::flush_denorm(a[1] * b[1]);
            let p2 = vmx::flush_denorm(a[2] * b[2]);
            let s = vmx::flush_denorm(p0 + p1 + p2);
            ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4(s, s, s, s);
            ctx.pc += 4;
        }
        PpcOpcode::vmsum4fp128 => {
            // PPCBUG-436.
            let a = ctx.vr[instr.va128()].as_f32x4();
            let b = ctx.vr[instr.vb128()].as_f32x4();
-            let s = vmx::flush_denorm(a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3]);
+            let p0 = vmx::flush_denorm(a[0] * b[0]);
            let p1 = vmx::flush_denorm(a[1] * b[1]);
            let p2 = vmx::flush_denorm(a[2] * b[2]);
            let p3 = vmx::flush_denorm(a[3] * b[3]);
            let s = vmx::flush_denorm(p0 + p1 + p2 + p3);
            ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4(s, s, s, s);
            ctx.pc += 4;
        }
@@ -5618,6 +5705,46 @@ mod tests {
    // ---------- Phase 2h: FPU / FPSCR ----------
    #[test]
    fn fmsub_inf_minus_inf_sets_vxisi() {
        // PPCBUG-203 regression: fmsub with a*c = +∞, -b = -∞ (b=+∞) →
        // +∞ + (-∞) → VXISI. Pre-fix had no add-step VXISI check.
        let mut ctx = PpcContext::new();
        let mut mem = TestMem::new();
        ctx.fpr[1] = f64::INFINITY;
        ctx.fpr[2] = f64::INFINITY;  // b
        ctx.fpr[3] = 1.0;
        // fmsub f4, f1, f3, f2 → 1*∞ - ∞ = VXISI
        // A-form: opcode=63, XO=28 (fmsub double): (63<<26)|(rd<<21)|(ra<<16)|(rb<<11)|(rc<<6)|(28<<1)
        let raw = (63u32 << 26) | (4 << 21) | (1 << 16) | (2 << 11) | (3 << 6) | (28 << 1);
        write_instr(&mut mem, 0, raw);
        ctx.pc = 0;
        step(&mut ctx, &mut mem);
        assert_ne!(ctx.fpscr & fpscr::VXISI, 0, "fmsub ∞-∞ must set VXISI");
    }
    #[test]
    fn fnmadd_nan_input_preserves_nan_sign() {
        // PPCBUG-205 regression: ISA forbids negating a NaN result.
        // a*c+b producing a NaN → result must be the NaN unchanged, not -NaN.
        let mut ctx = PpcContext::new();
        let mut mem = TestMem::new();
        let qnan = f64::NAN;
        ctx.fpr[1] = qnan;
        ctx.fpr[2] = 1.0;
        ctx.fpr[3] = 2.0;
        // fnmadd f4, f1, f3, f2  (XO=31)
        let raw = (63u32 << 26) | (4 << 21) | (1 << 16) | (2 << 11) | (3 << 6) | (31 << 1);
        write_instr(&mut mem, 0, raw);
        ctx.pc = 0;
        step(&mut ctx, &mut mem);
        // Result must be NaN with the same sign bit as the input NaN.
        let r = ctx.fpr[4];
        assert!(r.is_nan(), "result must be NaN");
        assert_eq!(r.is_sign_negative(), qnan.is_sign_negative(),
                   "fnmadd must preserve NaN sign (no negation on NaN)");
    }
    #[test]
    fn fadd_inf_minus_inf_sets_vxisi() {
        let mut ctx = PpcContext::new();
--- a/crates/xenia-cpu/src/vmx.rs
+++ b/crates/xenia-cpu/src/vmx.rs
@@ -214,7 +214,9 @@ pub fn flush_denorm(x: f32) -> f32 {
 //
 // vctsxs / vctuxs flush denormal inputs to 0 before scaling, per Altivec.
 #[inline] pub fn cvt_f32_to_i32_sat(x: f32, scale_bits: u32) -> (i32, bool) {
-    if x.is_nan() { return (0, true); }
+    // PPCBUG-433: AltiVec ISA saturates NaN to INT_MIN (0x80000000), not 0.
    // (vctuxs's NaN→0 is correct per AltiVec ISA — see PPCBUG-434.)
    if x.is_nan() { return (i32::MIN, true); }
    let x = flush_denorm(x);
    let scaled = (x as f64) * ((1u64 << scale_bits) as f64);
    if scaled >= i32::MAX as f64 { return (i32::MAX, true); }