fix(cpu): PPCBUG-435/436/437 VSCR.NJ subnormal flush for VMX float
Phase 5 batch 4 (5d) — partial: VSCR.NJ subnormal flush for VMX float arithmetic. The Xbox 360 always boots with NJ=1, so games expect subnormal inputs and outputs to be flushed to ±0.

- PPCBUG-435 vaddfp/vaddfp128/vsubfp/vsubfp128/vmulfp128: previously these opcodes did not flush at all (only the vmaddfp family flushed). They now flush both inputs and the output, per Canary's unconditional model.
- PPCBUG-436 vmsum3fp128/vmsum4fp128: per-product intermediates are now flushed individually (previously only the final sum was flushed).
- PPCBUG-437 vmaddfp/vmaddfp128/vmaddcfp128/vnmsubfp/vnmsubfp128: outputs are now flushed (inputs were already flushed).

PPCBUG-185 (FPSCR.NI flush for scalar FPU) is deferred — it requires adding an NI bit constant and a post-op flush wrapper across all *sx arms; it will land in a focused sub-batch.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1940,34 +1940,56 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
|
||||
|
||||
// ===== VMX: Float Arithmetic =====
|
||||
PpcOpcode::vaddfp => {
|
||||
// PPCBUG-435: VSCR.NJ=1 (Xbox 360 always boots with this set) requires
|
||||
// flush-to-zero on subnormal inputs and outputs. Canary VMX float
|
||||
// arithmetic flushes denormals unconditionally.
|
||||
let a = ctx.vr[instr.ra()].as_f32x4();
|
||||
let b = ctx.vr[instr.rb()].as_f32x4();
|
||||
let mut r = [0f32; 4];
|
||||
for i in 0..4 { r[i] = a[i] + b[i]; }
|
||||
for i in 0..4 {
|
||||
let ai = vmx::flush_denorm(a[i]);
|
||||
let bi = vmx::flush_denorm(b[i]);
|
||||
r[i] = vmx::flush_denorm(ai + bi);
|
||||
}
|
||||
ctx.vr[instr.rd()] = xenia_types::Vec128::from_f32x4_array(r);
|
||||
ctx.pc += 4;
|
||||
}
|
||||
PpcOpcode::vaddfp128 => {
|
||||
// PPCBUG-435: same as vaddfp.
|
||||
let a = ctx.vr[instr.va128()].as_f32x4();
|
||||
let b = ctx.vr[instr.vb128()].as_f32x4();
|
||||
let mut r = [0f32; 4];
|
||||
for i in 0..4 { r[i] = a[i] + b[i]; }
|
||||
for i in 0..4 {
|
||||
let ai = vmx::flush_denorm(a[i]);
|
||||
let bi = vmx::flush_denorm(b[i]);
|
||||
r[i] = vmx::flush_denorm(ai + bi);
|
||||
}
|
||||
ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r);
|
||||
ctx.pc += 4;
|
||||
}
|
||||
PpcOpcode::vsubfp => {
|
||||
// PPCBUG-435.
|
||||
let a = ctx.vr[instr.ra()].as_f32x4();
|
||||
let b = ctx.vr[instr.rb()].as_f32x4();
|
||||
let mut r = [0f32; 4];
|
||||
for i in 0..4 { r[i] = a[i] - b[i]; }
|
||||
for i in 0..4 {
|
||||
let ai = vmx::flush_denorm(a[i]);
|
||||
let bi = vmx::flush_denorm(b[i]);
|
||||
r[i] = vmx::flush_denorm(ai - bi);
|
||||
}
|
||||
ctx.vr[instr.rd()] = xenia_types::Vec128::from_f32x4_array(r);
|
||||
ctx.pc += 4;
|
||||
}
|
||||
PpcOpcode::vsubfp128 => {
|
||||
// PPCBUG-435.
|
||||
let a = ctx.vr[instr.va128()].as_f32x4();
|
||||
let b = ctx.vr[instr.vb128()].as_f32x4();
|
||||
let mut r = [0f32; 4];
|
||||
for i in 0..4 { r[i] = a[i] - b[i]; }
|
||||
for i in 0..4 {
|
||||
let ai = vmx::flush_denorm(a[i]);
|
||||
let bi = vmx::flush_denorm(b[i]);
|
||||
r[i] = vmx::flush_denorm(ai - bi);
|
||||
}
|
||||
ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r);
|
||||
ctx.pc += 4;
|
||||
}
|
||||
@@ -1982,7 +2004,8 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
|
||||
let ai = vmx::flush_denorm(a[i]);
|
||||
let bi = vmx::flush_denorm(b[i]);
|
||||
let ci = vmx::flush_denorm(c[i]);
|
||||
r[i] = ai.mul_add(ci, bi);
|
||||
// PPCBUG-437: flush subnormal output too.
|
||||
r[i] = vmx::flush_denorm(ai.mul_add(ci, bi));
|
||||
}
|
||||
ctx.vr[instr.rd()] = xenia_types::Vec128::from_f32x4_array(r);
|
||||
ctx.pc += 4;
|
||||
@@ -2000,7 +2023,8 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
|
||||
let ai = vmx::flush_denorm(a[i]);
|
||||
let bi = vmx::flush_denorm(b[i]);
|
||||
let di = vmx::flush_denorm(d[i]);
|
||||
r[i] = ai.mul_add(di, bi);
|
||||
// PPCBUG-437.
|
||||
r[i] = vmx::flush_denorm(ai.mul_add(di, bi));
|
||||
}
|
||||
ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r);
|
||||
ctx.pc += 4;
|
||||
@@ -2015,7 +2039,7 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
|
||||
let ai = vmx::flush_denorm(a[i]);
|
||||
let bi = vmx::flush_denorm(b[i]);
|
||||
let ci = vmx::flush_denorm(c[i]);
|
||||
r[i] = bi - ai * ci;
|
||||
r[i] = vmx::flush_denorm(bi - ai * ci);
|
||||
}
|
||||
ctx.vr[instr.rd()] = xenia_types::Vec128::from_f32x4_array(r);
|
||||
ctx.pc += 4;
|
||||
@@ -2032,16 +2056,21 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
|
||||
let ai = vmx::flush_denorm(a[i]);
|
||||
let bi = vmx::flush_denorm(b[i]);
|
||||
let di = vmx::flush_denorm(d[i]);
|
||||
r[i] = di - ai * bi;
|
||||
r[i] = vmx::flush_denorm(di - ai * bi);
|
||||
}
|
||||
ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r);
|
||||
ctx.pc += 4;
|
||||
}
|
||||
PpcOpcode::vmulfp128 => {
|
||||
// PPCBUG-435 + PPCBUG-437.
|
||||
let a = ctx.vr[instr.va128()].as_f32x4();
|
||||
let b = ctx.vr[instr.vb128()].as_f32x4();
|
||||
let mut r = [0f32; 4];
|
||||
for i in 0..4 { r[i] = a[i] * b[i]; }
|
||||
for i in 0..4 {
|
||||
let ai = vmx::flush_denorm(a[i]);
|
||||
let bi = vmx::flush_denorm(b[i]);
|
||||
r[i] = vmx::flush_denorm(ai * bi);
|
||||
}
|
||||
ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r);
|
||||
ctx.pc += 4;
|
||||
}
|
||||
@@ -4430,7 +4459,8 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
|
||||
let ai = vmx::flush_denorm(a[i]);
|
||||
let bi = vmx::flush_denorm(b[i]);
|
||||
let di = vmx::flush_denorm(d[i]);
|
||||
r[i] = ai.mul_add(di, bi);
|
||||
// PPCBUG-437: flush subnormal output too.
|
||||
r[i] = vmx::flush_denorm(ai.mul_add(di, bi));
|
||||
}
|
||||
ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r);
|
||||
ctx.pc += 4;
|
||||
@@ -4439,16 +4469,25 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
|
||||
// Canary `InstrEmit_vmsum3fp128` flushes the *output* denormal
|
||||
// unconditionally (not the inputs) — see ppc_emit_altivec.cc:1067-1075.
|
||||
PpcOpcode::vmsum3fp128 => {
|
||||
// PPCBUG-436: flush per-product intermediates (not just the final sum).
|
||||
let a = ctx.vr[instr.va128()].as_f32x4();
|
||||
let b = ctx.vr[instr.vb128()].as_f32x4();
|
||||
let s = vmx::flush_denorm(a[0] * b[0] + a[1] * b[1] + a[2] * b[2]);
|
||||
let p0 = vmx::flush_denorm(a[0] * b[0]);
|
||||
let p1 = vmx::flush_denorm(a[1] * b[1]);
|
||||
let p2 = vmx::flush_denorm(a[2] * b[2]);
|
||||
let s = vmx::flush_denorm(p0 + p1 + p2);
|
||||
ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4(s, s, s, s);
|
||||
ctx.pc += 4;
|
||||
}
|
||||
PpcOpcode::vmsum4fp128 => {
|
||||
// PPCBUG-436.
|
||||
let a = ctx.vr[instr.va128()].as_f32x4();
|
||||
let b = ctx.vr[instr.vb128()].as_f32x4();
|
||||
let s = vmx::flush_denorm(a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3]);
|
||||
let p0 = vmx::flush_denorm(a[0] * b[0]);
|
||||
let p1 = vmx::flush_denorm(a[1] * b[1]);
|
||||
let p2 = vmx::flush_denorm(a[2] * b[2]);
|
||||
let p3 = vmx::flush_denorm(a[3] * b[3]);
|
||||
let s = vmx::flush_denorm(p0 + p1 + p2 + p3);
|
||||
ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4(s, s, s, s);
|
||||
ctx.pc += 4;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user