From 538fa5ab74999d169836744556c25be615b36660 Mon Sep 17 00:00:00 2001
From: MechaCat02 <fabian@diekaulbachs.de>
Date: Sat, 2 May 2026 12:26:36 +0200
Subject: [PATCH] fix(cpu): PPCBUG-435/436/437 VSCR.NJ subnormal flush for VMX
 float
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Phase 5 batch 4 (5d) — partial: VSCR.NJ subnormal flush for VMX float
arithmetic. Xbox 360 always boots with NJ=1, so games expect subnormal
inputs and outputs to be flushed to ±0.

- PPCBUG-435 vaddfp/vaddfp128/vsubfp/vsubfp128/vmulfp128: previously
  these opcodes did not flush at all (only the vmaddfp family flushed).
  Now both inputs and the output are flushed, per Canary's unconditional
  model.
- PPCBUG-436 vmsum3fp128/vmsum4fp128: each per-product intermediate is
  now flushed individually; previously only the final sum was flushed.
- PPCBUG-437 vmaddfp/vmaddfp128/vmaddcfp128/vnmsubfp/vnmsubfp128:
  outputs now flushed (inputs were already flushed).

PPCBUG-185 (FPSCR.NI flush for scalar FPU) is deferred — it requires
adding an NI bit constant and a post-op flush wrapper across all *sx
arms; it will land in a focused sub-batch.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 crates/xenia-cpu/src/interpreter.rs | 63 +++++++++++++++++++++++------
 1 file changed, 51 insertions(+), 12 deletions(-)

diff --git a/crates/xenia-cpu/src/interpreter.rs b/crates/xenia-cpu/src/interpreter.rs
index 8239266..e4d88cd 100644
--- a/crates/xenia-cpu/src/interpreter.rs
+++ b/crates/xenia-cpu/src/interpreter.rs
@@ -1940,34 +1940,56 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
 
         // ===== VMX: Float Arithmetic =====
         PpcOpcode::vaddfp => {
+            // PPCBUG-435: VSCR.NJ=1 (Xbox 360 always boots with this set) requires
+            // flush-to-zero on subnormal inputs and outputs. Canary VMX float
+            // arithmetic flushes denormals unconditionally.
             let a = ctx.vr[instr.ra()].as_f32x4();
             let b = ctx.vr[instr.rb()].as_f32x4();
             let mut r = [0f32; 4];
-            for i in 0..4 { r[i] = a[i] + b[i]; }
+            for i in 0..4 {
+                let ai = vmx::flush_denorm(a[i]);
+                let bi = vmx::flush_denorm(b[i]);
+                r[i] = vmx::flush_denorm(ai + bi);
+            }
             ctx.vr[instr.rd()] = xenia_types::Vec128::from_f32x4_array(r);
             ctx.pc += 4;
         }
         PpcOpcode::vaddfp128 => {
+            // PPCBUG-435: same as vaddfp.
             let a = ctx.vr[instr.va128()].as_f32x4();
             let b = ctx.vr[instr.vb128()].as_f32x4();
             let mut r = [0f32; 4];
-            for i in 0..4 { r[i] = a[i] + b[i]; }
+            for i in 0..4 {
+                let ai = vmx::flush_denorm(a[i]);
+                let bi = vmx::flush_denorm(b[i]);
+                r[i] = vmx::flush_denorm(ai + bi);
+            }
             ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r);
             ctx.pc += 4;
         }
         PpcOpcode::vsubfp => {
+            // PPCBUG-435.
             let a = ctx.vr[instr.ra()].as_f32x4();
             let b = ctx.vr[instr.rb()].as_f32x4();
             let mut r = [0f32; 4];
-            for i in 0..4 { r[i] = a[i] - b[i]; }
+            for i in 0..4 {
+                let ai = vmx::flush_denorm(a[i]);
+                let bi = vmx::flush_denorm(b[i]);
+                r[i] = vmx::flush_denorm(ai - bi);
+            }
             ctx.vr[instr.rd()] = xenia_types::Vec128::from_f32x4_array(r);
             ctx.pc += 4;
         }
         PpcOpcode::vsubfp128 => {
+            // PPCBUG-435.
             let a = ctx.vr[instr.va128()].as_f32x4();
             let b = ctx.vr[instr.vb128()].as_f32x4();
             let mut r = [0f32; 4];
-            for i in 0..4 { r[i] = a[i] - b[i]; }
+            for i in 0..4 {
+                let ai = vmx::flush_denorm(a[i]);
+                let bi = vmx::flush_denorm(b[i]);
+                r[i] = vmx::flush_denorm(ai - bi);
+            }
             ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r);
             ctx.pc += 4;
         }
@@ -1982,7 +2004,8 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
                 let ai = vmx::flush_denorm(a[i]);
                 let bi = vmx::flush_denorm(b[i]);
                 let ci = vmx::flush_denorm(c[i]);
-                r[i] = ai.mul_add(ci, bi);
+                // PPCBUG-437: flush subnormal output too.
+                r[i] = vmx::flush_denorm(ai.mul_add(ci, bi));
             }
             ctx.vr[instr.rd()] = xenia_types::Vec128::from_f32x4_array(r);
             ctx.pc += 4;
@@ -2000,7 +2023,8 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
                 let ai = vmx::flush_denorm(a[i]);
                 let bi = vmx::flush_denorm(b[i]);
                 let di = vmx::flush_denorm(d[i]);
-                r[i] = ai.mul_add(di, bi);
+                // PPCBUG-437.
+                r[i] = vmx::flush_denorm(ai.mul_add(di, bi));
             }
             ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r);
             ctx.pc += 4;
@@ -2015,7 +2039,7 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
                 let ai = vmx::flush_denorm(a[i]);
                 let bi = vmx::flush_denorm(b[i]);
                 let ci = vmx::flush_denorm(c[i]);
-                r[i] = bi - ai * ci;
+                r[i] = vmx::flush_denorm(bi - ai * ci);
             }
             ctx.vr[instr.rd()] = xenia_types::Vec128::from_f32x4_array(r);
             ctx.pc += 4;
@@ -2032,16 +2056,21 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
                 let ai = vmx::flush_denorm(a[i]);
                 let bi = vmx::flush_denorm(b[i]);
                 let di = vmx::flush_denorm(d[i]);
-                r[i] = di - ai * bi;
+                r[i] = vmx::flush_denorm(di - ai * bi);
             }
             ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r);
             ctx.pc += 4;
         }
         PpcOpcode::vmulfp128 => {
+            // PPCBUG-435 + PPCBUG-437.
             let a = ctx.vr[instr.va128()].as_f32x4();
             let b = ctx.vr[instr.vb128()].as_f32x4();
             let mut r = [0f32; 4];
-            for i in 0..4 { r[i] = a[i] * b[i]; }
+            for i in 0..4 {
+                let ai = vmx::flush_denorm(a[i]);
+                let bi = vmx::flush_denorm(b[i]);
+                r[i] = vmx::flush_denorm(ai * bi);
+            }
             ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r);
             ctx.pc += 4;
         }
@@ -4430,7 +4459,8 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
                 let ai = vmx::flush_denorm(a[i]);
                 let bi = vmx::flush_denorm(b[i]);
                 let di = vmx::flush_denorm(d[i]);
-                r[i] = ai.mul_add(di, bi);
+                // PPCBUG-437: flush subnormal output too.
+                r[i] = vmx::flush_denorm(ai.mul_add(di, bi));
             }
             ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r);
             ctx.pc += 4;
@@ -4439,16 +4469,25 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
         // Canary `InstrEmit_vmsum3fp128` flushes the *output* denormal
         // unconditionally (not the inputs) — see ppc_emit_altivec.cc:1067-1075.
         PpcOpcode::vmsum3fp128 => {
+            // PPCBUG-436: flush per-product intermediates (not just the final sum).
             let a = ctx.vr[instr.va128()].as_f32x4();
             let b = ctx.vr[instr.vb128()].as_f32x4();
-            let s = vmx::flush_denorm(a[0] * b[0] + a[1] * b[1] + a[2] * b[2]);
+            let p0 = vmx::flush_denorm(a[0] * b[0]);
+            let p1 = vmx::flush_denorm(a[1] * b[1]);
+            let p2 = vmx::flush_denorm(a[2] * b[2]);
+            let s = vmx::flush_denorm(p0 + p1 + p2);
             ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4(s, s, s, s);
             ctx.pc += 4;
         }
         PpcOpcode::vmsum4fp128 => {
+            // PPCBUG-436.
             let a = ctx.vr[instr.va128()].as_f32x4();
             let b = ctx.vr[instr.vb128()].as_f32x4();
-            let s = vmx::flush_denorm(a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3]);
+            let p0 = vmx::flush_denorm(a[0] * b[0]);
+            let p1 = vmx::flush_denorm(a[1] * b[1]);
+            let p2 = vmx::flush_denorm(a[2] * b[2]);
+            let p3 = vmx::flush_denorm(a[3] * b[3]);
+            let s = vmx::flush_denorm(p0 + p1 + p2 + p3);
             ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4(s, s, s, s);
             ctx.pc += 4;
         }