From 538fa5ab74999d169836744556c25be615b36660 Mon Sep 17 00:00:00 2001 From: MechaCat02 Date: Sat, 2 May 2026 12:26:36 +0200 Subject: [PATCH] fix(cpu): PPCBUG-435/436/437 VSCR.NJ subnormal flush for VMX float MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 5 batch 4 (5d) — partial: VSCR.NJ subnormal flush for VMX float arithmetic. Xbox 360 always boots with NJ=1, so games expect inputs and outputs flushed to ±0. - PPCBUG-435 vaddfp/vaddfp128/vsubfp/vsubfp128/vmulfp128: previously no flush at all on these opcodes (only vmaddfp family flushed). Now flushes both inputs and output per Canary's unconditional model. - PPCBUG-436 vmsum3fp128/vmsum4fp128: per-product intermediates now flushed individually (was only the final sum). - PPCBUG-437 vmaddfp/vmaddfp128/vmaddcfp128/vnmsubfp/vnmsubfp128: outputs now flushed (inputs were already flushed). PPCBUG-185 (FPSCR.NI flush for scalar FPU) deferred — requires adding an NI bit constant and a post-op flush wrapper across all *sx arms; will land in a focused sub-batch. Co-Authored-By: Claude Sonnet 4.6 --- crates/xenia-cpu/src/interpreter.rs | 63 +++++++++++++++++++++++------ 1 file changed, 51 insertions(+), 12 deletions(-) diff --git a/crates/xenia-cpu/src/interpreter.rs b/crates/xenia-cpu/src/interpreter.rs index 8239266..e4d88cd 100644 --- a/crates/xenia-cpu/src/interpreter.rs +++ b/crates/xenia-cpu/src/interpreter.rs @@ -1940,34 +1940,56 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) - // ===== VMX: Float Arithmetic ===== PpcOpcode::vaddfp => { + // PPCBUG-435: VSCR.NJ=1 (Xbox 360 always boots with this set) requires + // flush-to-zero on subnormal inputs and outputs. Canary VMX float + // arithmetic flushes denormals unconditionally. 
let a = ctx.vr[instr.ra()].as_f32x4(); let b = ctx.vr[instr.rb()].as_f32x4(); let mut r = [0f32; 4]; - for i in 0..4 { r[i] = a[i] + b[i]; } + for i in 0..4 { + let ai = vmx::flush_denorm(a[i]); + let bi = vmx::flush_denorm(b[i]); + r[i] = vmx::flush_denorm(ai + bi); + } ctx.vr[instr.rd()] = xenia_types::Vec128::from_f32x4_array(r); ctx.pc += 4; } PpcOpcode::vaddfp128 => { + // PPCBUG-435: same as vaddfp. let a = ctx.vr[instr.va128()].as_f32x4(); let b = ctx.vr[instr.vb128()].as_f32x4(); let mut r = [0f32; 4]; - for i in 0..4 { r[i] = a[i] + b[i]; } + for i in 0..4 { + let ai = vmx::flush_denorm(a[i]); + let bi = vmx::flush_denorm(b[i]); + r[i] = vmx::flush_denorm(ai + bi); + } ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r); ctx.pc += 4; } PpcOpcode::vsubfp => { + // PPCBUG-435. let a = ctx.vr[instr.ra()].as_f32x4(); let b = ctx.vr[instr.rb()].as_f32x4(); let mut r = [0f32; 4]; - for i in 0..4 { r[i] = a[i] - b[i]; } + for i in 0..4 { + let ai = vmx::flush_denorm(a[i]); + let bi = vmx::flush_denorm(b[i]); + r[i] = vmx::flush_denorm(ai - bi); + } ctx.vr[instr.rd()] = xenia_types::Vec128::from_f32x4_array(r); ctx.pc += 4; } PpcOpcode::vsubfp128 => { + // PPCBUG-435. let a = ctx.vr[instr.va128()].as_f32x4(); let b = ctx.vr[instr.vb128()].as_f32x4(); let mut r = [0f32; 4]; - for i in 0..4 { r[i] = a[i] - b[i]; } + for i in 0..4 { + let ai = vmx::flush_denorm(a[i]); + let bi = vmx::flush_denorm(b[i]); + r[i] = vmx::flush_denorm(ai - bi); + } ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r); ctx.pc += 4; } @@ -1982,7 +2004,8 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) - let ai = vmx::flush_denorm(a[i]); let bi = vmx::flush_denorm(b[i]); let ci = vmx::flush_denorm(c[i]); - r[i] = ai.mul_add(ci, bi); + // PPCBUG-437: flush subnormal output too. 
+ r[i] = vmx::flush_denorm(ai.mul_add(ci, bi)); } ctx.vr[instr.rd()] = xenia_types::Vec128::from_f32x4_array(r); ctx.pc += 4; @@ -2000,7 +2023,8 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) - let ai = vmx::flush_denorm(a[i]); let bi = vmx::flush_denorm(b[i]); let di = vmx::flush_denorm(d[i]); - r[i] = ai.mul_add(di, bi); + // PPCBUG-437. + r[i] = vmx::flush_denorm(ai.mul_add(di, bi)); } ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r); ctx.pc += 4; @@ -2015,7 +2039,7 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) - let ai = vmx::flush_denorm(a[i]); let bi = vmx::flush_denorm(b[i]); let ci = vmx::flush_denorm(c[i]); - r[i] = bi - ai * ci; + r[i] = vmx::flush_denorm(bi - ai * ci); } ctx.vr[instr.rd()] = xenia_types::Vec128::from_f32x4_array(r); ctx.pc += 4; @@ -2032,16 +2056,21 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) - let ai = vmx::flush_denorm(a[i]); let bi = vmx::flush_denorm(b[i]); let di = vmx::flush_denorm(d[i]); - r[i] = di - ai * bi; + r[i] = vmx::flush_denorm(di - ai * bi); } ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r); ctx.pc += 4; } PpcOpcode::vmulfp128 => { + // PPCBUG-435 + PPCBUG-437. let a = ctx.vr[instr.va128()].as_f32x4(); let b = ctx.vr[instr.vb128()].as_f32x4(); let mut r = [0f32; 4]; - for i in 0..4 { r[i] = a[i] * b[i]; } + for i in 0..4 { + let ai = vmx::flush_denorm(a[i]); + let bi = vmx::flush_denorm(b[i]); + r[i] = vmx::flush_denorm(ai * bi); + } ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r); ctx.pc += 4; } @@ -4430,7 +4459,8 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) - let ai = vmx::flush_denorm(a[i]); let bi = vmx::flush_denorm(b[i]); let di = vmx::flush_denorm(d[i]); - r[i] = ai.mul_add(di, bi); + // PPCBUG-437: flush subnormal output too. 
+ r[i] = vmx::flush_denorm(ai.mul_add(di, bi)); } ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r); ctx.pc += 4; @@ -4439,16 +4469,25 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) - // Canary `InstrEmit_vmsum3fp128` flushes the *output* denormal // unconditionally (not the inputs) — see ppc_emit_altivec.cc:1067-1075. PpcOpcode::vmsum3fp128 => { + // PPCBUG-436: flush per-product intermediates (not just the final sum). let a = ctx.vr[instr.va128()].as_f32x4(); let b = ctx.vr[instr.vb128()].as_f32x4(); - let s = vmx::flush_denorm(a[0] * b[0] + a[1] * b[1] + a[2] * b[2]); + let p0 = vmx::flush_denorm(a[0] * b[0]); + let p1 = vmx::flush_denorm(a[1] * b[1]); + let p2 = vmx::flush_denorm(a[2] * b[2]); + let s = vmx::flush_denorm(p0 + p1 + p2); ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4(s, s, s, s); ctx.pc += 4; } PpcOpcode::vmsum4fp128 => { + // PPCBUG-436. let a = ctx.vr[instr.va128()].as_f32x4(); let b = ctx.vr[instr.vb128()].as_f32x4(); - let s = vmx::flush_denorm(a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3]); + let p0 = vmx::flush_denorm(a[0] * b[0]); + let p1 = vmx::flush_denorm(a[1] * b[1]); + let p2 = vmx::flush_denorm(a[2] * b[2]); + let p3 = vmx::flush_denorm(a[3] * b[3]); + let s = vmx::flush_denorm(p0 + p1 + p2 + p3); ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4(s, s, s, s); ctx.pc += 4; }