fix(cpu): PPCBUG-435/436/437 VSCR.NJ subnormal flush for VMX float

Phase 5 batch 4 (5d) — partial: VSCR.NJ subnormal flush for VMX float
arithmetic. Xbox 360 always boots with NJ=1, so games expect inputs
and outputs flushed to ±0.

- PPCBUG-435 vaddfp/vaddfp128/vsubfp/vsubfp128/vmulfp128: previously
  no flush at all on these opcodes (only the vmaddfp family flushed,
  and only its inputs). Now flushes both inputs and output per Canary's
  unconditional model.
- PPCBUG-436 vmsum3fp128/vmsum4fp128: per-product intermediates now
  flushed individually (was only the final sum).
- PPCBUG-437 vmaddfp/vmaddfp128/vmaddcfp128/vnmsubfp/vnmsubfp128:
  outputs now flushed (inputs were already flushed).

PPCBUG-185 (FPSCR.NI flush for scalar FPU) deferred — requires adding
an NI bit constant and a post-op flush wrapper across all *sx arms; will
land in a focused sub-batch.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
MechaCat02
2026-05-02 12:26:36 +02:00
parent 49bf74fae6
commit 538fa5ab74

View File

@@ -1940,34 +1940,56 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
// ===== VMX: Float Arithmetic ===== // ===== VMX: Float Arithmetic =====
PpcOpcode::vaddfp => { PpcOpcode::vaddfp => {
// PPCBUG-435: VSCR.NJ=1 (Xbox 360 always boots with this set) requires
// flush-to-zero on subnormal inputs and outputs. Canary VMX float
// arithmetic flushes denormals unconditionally.
let a = ctx.vr[instr.ra()].as_f32x4(); let a = ctx.vr[instr.ra()].as_f32x4();
let b = ctx.vr[instr.rb()].as_f32x4(); let b = ctx.vr[instr.rb()].as_f32x4();
let mut r = [0f32; 4]; let mut r = [0f32; 4];
for i in 0..4 { r[i] = a[i] + b[i]; } for i in 0..4 {
let ai = vmx::flush_denorm(a[i]);
let bi = vmx::flush_denorm(b[i]);
r[i] = vmx::flush_denorm(ai + bi);
}
ctx.vr[instr.rd()] = xenia_types::Vec128::from_f32x4_array(r); ctx.vr[instr.rd()] = xenia_types::Vec128::from_f32x4_array(r);
ctx.pc += 4; ctx.pc += 4;
} }
PpcOpcode::vaddfp128 => { PpcOpcode::vaddfp128 => {
// PPCBUG-435: same as vaddfp.
let a = ctx.vr[instr.va128()].as_f32x4(); let a = ctx.vr[instr.va128()].as_f32x4();
let b = ctx.vr[instr.vb128()].as_f32x4(); let b = ctx.vr[instr.vb128()].as_f32x4();
let mut r = [0f32; 4]; let mut r = [0f32; 4];
for i in 0..4 { r[i] = a[i] + b[i]; } for i in 0..4 {
let ai = vmx::flush_denorm(a[i]);
let bi = vmx::flush_denorm(b[i]);
r[i] = vmx::flush_denorm(ai + bi);
}
ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r); ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r);
ctx.pc += 4; ctx.pc += 4;
} }
PpcOpcode::vsubfp => { PpcOpcode::vsubfp => {
// PPCBUG-435.
let a = ctx.vr[instr.ra()].as_f32x4(); let a = ctx.vr[instr.ra()].as_f32x4();
let b = ctx.vr[instr.rb()].as_f32x4(); let b = ctx.vr[instr.rb()].as_f32x4();
let mut r = [0f32; 4]; let mut r = [0f32; 4];
for i in 0..4 { r[i] = a[i] - b[i]; } for i in 0..4 {
let ai = vmx::flush_denorm(a[i]);
let bi = vmx::flush_denorm(b[i]);
r[i] = vmx::flush_denorm(ai - bi);
}
ctx.vr[instr.rd()] = xenia_types::Vec128::from_f32x4_array(r); ctx.vr[instr.rd()] = xenia_types::Vec128::from_f32x4_array(r);
ctx.pc += 4; ctx.pc += 4;
} }
PpcOpcode::vsubfp128 => { PpcOpcode::vsubfp128 => {
// PPCBUG-435.
let a = ctx.vr[instr.va128()].as_f32x4(); let a = ctx.vr[instr.va128()].as_f32x4();
let b = ctx.vr[instr.vb128()].as_f32x4(); let b = ctx.vr[instr.vb128()].as_f32x4();
let mut r = [0f32; 4]; let mut r = [0f32; 4];
for i in 0..4 { r[i] = a[i] - b[i]; } for i in 0..4 {
let ai = vmx::flush_denorm(a[i]);
let bi = vmx::flush_denorm(b[i]);
r[i] = vmx::flush_denorm(ai - bi);
}
ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r); ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r);
ctx.pc += 4; ctx.pc += 4;
} }
@@ -1982,7 +2004,8 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
let ai = vmx::flush_denorm(a[i]); let ai = vmx::flush_denorm(a[i]);
let bi = vmx::flush_denorm(b[i]); let bi = vmx::flush_denorm(b[i]);
let ci = vmx::flush_denorm(c[i]); let ci = vmx::flush_denorm(c[i]);
r[i] = ai.mul_add(ci, bi); // PPCBUG-437: flush subnormal output too.
r[i] = vmx::flush_denorm(ai.mul_add(ci, bi));
} }
ctx.vr[instr.rd()] = xenia_types::Vec128::from_f32x4_array(r); ctx.vr[instr.rd()] = xenia_types::Vec128::from_f32x4_array(r);
ctx.pc += 4; ctx.pc += 4;
@@ -2000,7 +2023,8 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
let ai = vmx::flush_denorm(a[i]); let ai = vmx::flush_denorm(a[i]);
let bi = vmx::flush_denorm(b[i]); let bi = vmx::flush_denorm(b[i]);
let di = vmx::flush_denorm(d[i]); let di = vmx::flush_denorm(d[i]);
r[i] = ai.mul_add(di, bi); // PPCBUG-437.
r[i] = vmx::flush_denorm(ai.mul_add(di, bi));
} }
ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r); ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r);
ctx.pc += 4; ctx.pc += 4;
@@ -2015,7 +2039,7 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
let ai = vmx::flush_denorm(a[i]); let ai = vmx::flush_denorm(a[i]);
let bi = vmx::flush_denorm(b[i]); let bi = vmx::flush_denorm(b[i]);
let ci = vmx::flush_denorm(c[i]); let ci = vmx::flush_denorm(c[i]);
r[i] = bi - ai * ci; r[i] = vmx::flush_denorm(bi - ai * ci);
} }
ctx.vr[instr.rd()] = xenia_types::Vec128::from_f32x4_array(r); ctx.vr[instr.rd()] = xenia_types::Vec128::from_f32x4_array(r);
ctx.pc += 4; ctx.pc += 4;
@@ -2032,16 +2056,21 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
let ai = vmx::flush_denorm(a[i]); let ai = vmx::flush_denorm(a[i]);
let bi = vmx::flush_denorm(b[i]); let bi = vmx::flush_denorm(b[i]);
let di = vmx::flush_denorm(d[i]); let di = vmx::flush_denorm(d[i]);
r[i] = di - ai * bi; r[i] = vmx::flush_denorm(di - ai * bi);
} }
ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r); ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r);
ctx.pc += 4; ctx.pc += 4;
} }
PpcOpcode::vmulfp128 => { PpcOpcode::vmulfp128 => {
// PPCBUG-435 + PPCBUG-437.
let a = ctx.vr[instr.va128()].as_f32x4(); let a = ctx.vr[instr.va128()].as_f32x4();
let b = ctx.vr[instr.vb128()].as_f32x4(); let b = ctx.vr[instr.vb128()].as_f32x4();
let mut r = [0f32; 4]; let mut r = [0f32; 4];
for i in 0..4 { r[i] = a[i] * b[i]; } for i in 0..4 {
let ai = vmx::flush_denorm(a[i]);
let bi = vmx::flush_denorm(b[i]);
r[i] = vmx::flush_denorm(ai * bi);
}
ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r); ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r);
ctx.pc += 4; ctx.pc += 4;
} }
@@ -4430,7 +4459,8 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
let ai = vmx::flush_denorm(a[i]); let ai = vmx::flush_denorm(a[i]);
let bi = vmx::flush_denorm(b[i]); let bi = vmx::flush_denorm(b[i]);
let di = vmx::flush_denorm(d[i]); let di = vmx::flush_denorm(d[i]);
r[i] = ai.mul_add(di, bi); // PPCBUG-437: flush subnormal output too.
r[i] = vmx::flush_denorm(ai.mul_add(di, bi));
} }
ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r); ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r);
ctx.pc += 4; ctx.pc += 4;
@@ -4439,16 +4469,25 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
// Canary `InstrEmit_vmsum3fp128` flushes the *output* denormal // Canary `InstrEmit_vmsum3fp128` flushes the *output* denormal
// unconditionally (not the inputs) — see ppc_emit_altivec.cc:1067-1075. // unconditionally (not the inputs) — see ppc_emit_altivec.cc:1067-1075.
PpcOpcode::vmsum3fp128 => { PpcOpcode::vmsum3fp128 => {
// PPCBUG-436: flush per-product intermediates (not just the final sum).
let a = ctx.vr[instr.va128()].as_f32x4(); let a = ctx.vr[instr.va128()].as_f32x4();
let b = ctx.vr[instr.vb128()].as_f32x4(); let b = ctx.vr[instr.vb128()].as_f32x4();
let s = vmx::flush_denorm(a[0] * b[0] + a[1] * b[1] + a[2] * b[2]); let p0 = vmx::flush_denorm(a[0] * b[0]);
let p1 = vmx::flush_denorm(a[1] * b[1]);
let p2 = vmx::flush_denorm(a[2] * b[2]);
let s = vmx::flush_denorm(p0 + p1 + p2);
ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4(s, s, s, s); ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4(s, s, s, s);
ctx.pc += 4; ctx.pc += 4;
} }
PpcOpcode::vmsum4fp128 => { PpcOpcode::vmsum4fp128 => {
// PPCBUG-436.
let a = ctx.vr[instr.va128()].as_f32x4(); let a = ctx.vr[instr.va128()].as_f32x4();
let b = ctx.vr[instr.vb128()].as_f32x4(); let b = ctx.vr[instr.vb128()].as_f32x4();
let s = vmx::flush_denorm(a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3]); let p0 = vmx::flush_denorm(a[0] * b[0]);
let p1 = vmx::flush_denorm(a[1] * b[1]);
let p2 = vmx::flush_denorm(a[2] * b[2]);
let p3 = vmx::flush_denorm(a[3] * b[3]);
let s = vmx::flush_denorm(p0 + p1 + p2 + p3);
ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4(s, s, s, s); ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4(s, s, s, s);
ctx.pc += 4; ctx.pc += 4;
} }