fix(cpu): PPCBUG-435/436/437 VSCR.NJ subnormal flush for VMX float

Phase 5 batch 4 (5d) — partial: VSCR.NJ subnormal flush for VMX float arithmetic. Xbox 360 always boots with NJ=1, so games expect subnormal inputs and outputs to be flushed to ±0.

- PPCBUG-435 vaddfp/vaddfp128/vsubfp/vsubfp128/vmulfp128: previously no flush at all on these opcodes (only the vmaddfp family flushed). Now flushes both inputs and output per Canary's unconditional model.
- PPCBUG-436 vmsum3fp128/vmsum4fp128: per-product intermediates are now flushed individually (previously only the final sum was flushed).
- PPCBUG-437 vmaddfp/vmaddfp128/vmaddcfp128/vnmsubfp/vnmsubfp128: outputs are now flushed (inputs were already flushed).

PPCBUG-185 (FPSCR.NI flush for scalar FPU) deferred — requires adding an NI bit constant and a post-op flush wrapper across all *sx arms; will land in a focused sub-batch.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-02 12:26:36 +02:00
parent 49bf74fae6
commit 538fa5ab74
1 changed file with 51 additions and 12 deletions
--- a/crates/xenia-cpu/src/interpreter.rs
+++ b/crates/xenia-cpu/src/interpreter.rs
@@ -1940,34 +1940,56 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
        // ===== VMX: Float Arithmetic =====
        PpcOpcode::vaddfp => {
            // PPCBUG-435: VSCR.NJ=1 (Xbox 360 always boots with this set) requires
            // flush-to-zero on subnormal inputs and outputs. Canary VMX float
            // arithmetic flushes denormals unconditionally.
            let a = ctx.vr[instr.ra()].as_f32x4();
            let b = ctx.vr[instr.rb()].as_f32x4();
            let mut r = [0f32; 4];
-            for i in 0..4 { r[i] = a[i] + b[i]; }
+            for i in 0..4 {
                let ai = vmx::flush_denorm(a[i]);
                let bi = vmx::flush_denorm(b[i]);
                r[i] = vmx::flush_denorm(ai + bi);
            }
            ctx.vr[instr.rd()] = xenia_types::Vec128::from_f32x4_array(r);
            ctx.pc += 4;
        }
        PpcOpcode::vaddfp128 => {
            // PPCBUG-435: same as vaddfp.
            let a = ctx.vr[instr.va128()].as_f32x4();
            let b = ctx.vr[instr.vb128()].as_f32x4();
            let mut r = [0f32; 4];
-            for i in 0..4 { r[i] = a[i] + b[i]; }
+            for i in 0..4 {
                let ai = vmx::flush_denorm(a[i]);
                let bi = vmx::flush_denorm(b[i]);
                r[i] = vmx::flush_denorm(ai + bi);
            }
            ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r);
            ctx.pc += 4;
        }
        PpcOpcode::vsubfp => {
            // PPCBUG-435.
            let a = ctx.vr[instr.ra()].as_f32x4();
            let b = ctx.vr[instr.rb()].as_f32x4();
            let mut r = [0f32; 4];
-            for i in 0..4 { r[i] = a[i] - b[i]; }
+            for i in 0..4 {
                let ai = vmx::flush_denorm(a[i]);
                let bi = vmx::flush_denorm(b[i]);
                r[i] = vmx::flush_denorm(ai - bi);
            }
            ctx.vr[instr.rd()] = xenia_types::Vec128::from_f32x4_array(r);
            ctx.pc += 4;
        }
        PpcOpcode::vsubfp128 => {
            // PPCBUG-435.
            let a = ctx.vr[instr.va128()].as_f32x4();
            let b = ctx.vr[instr.vb128()].as_f32x4();
            let mut r = [0f32; 4];
-            for i in 0..4 { r[i] = a[i] - b[i]; }
+            for i in 0..4 {
                let ai = vmx::flush_denorm(a[i]);
                let bi = vmx::flush_denorm(b[i]);
                r[i] = vmx::flush_denorm(ai - bi);
            }
            ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r);
            ctx.pc += 4;
        }
@@ -1982,7 +2004,8 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
                let ai = vmx::flush_denorm(a[i]);
                let bi = vmx::flush_denorm(b[i]);
                let ci = vmx::flush_denorm(c[i]);
-                r[i] = ai.mul_add(ci, bi);
+                // PPCBUG-437: flush subnormal output too.
                r[i] = vmx::flush_denorm(ai.mul_add(ci, bi));
            }
            ctx.vr[instr.rd()] = xenia_types::Vec128::from_f32x4_array(r);
            ctx.pc += 4;
@@ -2000,7 +2023,8 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
                let ai = vmx::flush_denorm(a[i]);
                let bi = vmx::flush_denorm(b[i]);
                let di = vmx::flush_denorm(d[i]);
-                r[i] = ai.mul_add(di, bi);
+                // PPCBUG-437.
                r[i] = vmx::flush_denorm(ai.mul_add(di, bi));
            }
            ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r);
            ctx.pc += 4;
@@ -2015,7 +2039,7 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
                let ai = vmx::flush_denorm(a[i]);
                let bi = vmx::flush_denorm(b[i]);
                let ci = vmx::flush_denorm(c[i]);
-                r[i] = bi - ai * ci;
+                r[i] = vmx::flush_denorm(bi - ai * ci);
            }
            ctx.vr[instr.rd()] = xenia_types::Vec128::from_f32x4_array(r);
            ctx.pc += 4;
@@ -2032,16 +2056,21 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
                let ai = vmx::flush_denorm(a[i]);
                let bi = vmx::flush_denorm(b[i]);
                let di = vmx::flush_denorm(d[i]);
-                r[i] = di - ai * bi;
+                r[i] = vmx::flush_denorm(di - ai * bi);
            }
            ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r);
            ctx.pc += 4;
        }
        PpcOpcode::vmulfp128 => {
            // PPCBUG-435 + PPCBUG-437.
            let a = ctx.vr[instr.va128()].as_f32x4();
            let b = ctx.vr[instr.vb128()].as_f32x4();
            let mut r = [0f32; 4];
-            for i in 0..4 { r[i] = a[i] * b[i]; }
+            for i in 0..4 {
                let ai = vmx::flush_denorm(a[i]);
                let bi = vmx::flush_denorm(b[i]);
                r[i] = vmx::flush_denorm(ai * bi);
            }
            ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r);
            ctx.pc += 4;
        }
@@ -4430,7 +4459,8 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
                let ai = vmx::flush_denorm(a[i]);
                let bi = vmx::flush_denorm(b[i]);
                let di = vmx::flush_denorm(d[i]);
-                r[i] = ai.mul_add(di, bi);
+                // PPCBUG-437: flush subnormal output too.
                r[i] = vmx::flush_denorm(ai.mul_add(di, bi));
            }
            ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r);
            ctx.pc += 4;
@@ -4439,16 +4469,25 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
        // Canary `InstrEmit_vmsum3fp128` flushes the *output* denormal
        // unconditionally (not the inputs) — see ppc_emit_altivec.cc:1067-1075.
        PpcOpcode::vmsum3fp128 => {
            // PPCBUG-436: flush per-product intermediates (not just the final sum).
            let a = ctx.vr[instr.va128()].as_f32x4();
            let b = ctx.vr[instr.vb128()].as_f32x4();
-            let s = vmx::flush_denorm(a[0] * b[0] + a[1] * b[1] + a[2] * b[2]);
+            let p0 = vmx::flush_denorm(a[0] * b[0]);
            let p1 = vmx::flush_denorm(a[1] * b[1]);
            let p2 = vmx::flush_denorm(a[2] * b[2]);
            let s = vmx::flush_denorm(p0 + p1 + p2);
            ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4(s, s, s, s);
            ctx.pc += 4;
        }
        PpcOpcode::vmsum4fp128 => {
            // PPCBUG-436.
            let a = ctx.vr[instr.va128()].as_f32x4();
            let b = ctx.vr[instr.vb128()].as_f32x4();
-            let s = vmx::flush_denorm(a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3]);
+            let p0 = vmx::flush_denorm(a[0] * b[0]);
            let p1 = vmx::flush_denorm(a[1] * b[1]);
            let p2 = vmx::flush_denorm(a[2] * b[2]);
            let p3 = vmx::flush_denorm(a[3] * b[3]);
            let s = vmx::flush_denorm(p0 + p1 + p2 + p3);
            ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4(s, s, s, s);
            ctx.pc += 4;
        }