Files
xenia-rs/crates/xenia-cpu/src/vmx.rs
MechaCat02 6fe2cbf251 fix(cpu): PPCBUG-426/427/433 single-FMA vnmsubfp + vctsxs NaN saturation
Phase 5 batch 6 (5f): saturation and FMA-rounding fixes.

- PPCBUG-426 vnmsubfp: was `bi - ai * ci` (two rounding steps); now
  `-ai.mul_add(ci, -bi)` which is mathematically equivalent (= bi - ai*ci)
  but uses a single FMA round per ISA.
- PPCBUG-427 vnmsubfp128: same single-FMA fix.
- PPCBUG-433 vctsxs / vcfpsxws128 NaN saturation: AltiVec ISA saturates
  NaN to INT_MIN (0x80000000); xenia returned 0. The vctuxs (unsigned)
  NaN→0 is correct per ISA.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-02 12:31:10 +02:00

921 lines
34 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
//! VMX / AltiVec helper routines shared by the interpreter's 150+ vector
//! opcode handlers.
//!
//! Big-endian lane indexing throughout: `Vec128::bytes[0]` is the most
//! significant byte, which corresponds to PowerPC lane 0. Operations that
//! care about "even" vs "odd" lanes follow the PPC convention (lane 0 = most
//! significant = "even" for multiply-even/odd purposes).
use xenia_memory::MemoryAccess;
use xenia_types::Vec128;
// ─── Lane accessors ────────────────────────────────────────────────────────
/// Views the 16 bytes of `v` as signed 8-bit lanes, keeping lane order.
#[inline]
pub fn as_i8x16(v: Vec128) -> [i8; 16] {
    let raw = v.as_bytes();
    // Bit-for-bit reinterpretation: 0x80..=0xFF become negative values.
    std::array::from_fn(|lane| raw[lane] as i8)
}
/// Views the eight 16-bit lanes of `v` as signed halfwords, keeping lane order.
#[inline]
pub fn as_i16x8(v: Vec128) -> [i16; 8] {
    let halves = v.as_u16x8();
    std::array::from_fn(|lane| halves[lane] as i16)
}
/// Views the four 32-bit lanes of `v` as signed words, keeping lane order.
#[inline]
pub fn as_i32x4(v: Vec128) -> [i32; 4] {
    let words = v.as_u32x4();
    std::array::from_fn(|lane| words[lane] as i32)
}
/// Builds a vector from 16 signed byte lanes (bit-for-bit, lane order kept).
#[inline]
pub fn from_i8x16(r: [i8; 16]) -> Vec128 {
    Vec128::from_bytes(r.map(|lane| lane as u8))
}
/// Builds a vector from eight signed halfword lanes (bit-for-bit, lane order kept).
#[inline]
pub fn from_i16x8(r: [i16; 8]) -> Vec128 {
    Vec128::from_u16x8_array(r.map(|lane| lane as u16))
}
/// Builds a vector from four signed word lanes (bit-for-bit, lane order kept).
#[inline]
pub fn from_i32x4(r: [i32; 4]) -> Vec128 {
    Vec128::from_u32x4_array(r.map(|lane| lane as u32))
}
// ─── Saturation helpers ────────────────────────────────────────────────────
// Each returns (clamped_value, saturated_flag). Handlers OR the flags together
// and call `ctx.set_vscr_sat(true)` once per instruction.
/// Unsigned 8-bit saturating add. Returns `(result, saturated)`.
#[inline]
pub fn sat_add_u8(a: u8, b: u8) -> (u8, bool) {
    match a.checked_add(b) {
        Some(sum) => (sum, false),
        None => (u8::MAX, true),
    }
}
/// Unsigned 8-bit saturating subtract (floors at 0). Returns `(result, saturated)`.
#[inline]
pub fn sat_sub_u8(a: u8, b: u8) -> (u8, bool) {
    match a.checked_sub(b) {
        Some(diff) => (diff, false),
        None => (0, true),
    }
}
/// Signed 8-bit saturating add. Returns `(result, saturated)`.
#[inline]
pub fn sat_add_i8(a: i8, b: i8) -> (i8, bool) {
    // `saturating_add` clamps to the nearest bound; `checked_add` tells us
    // whether clamping actually happened.
    (a.saturating_add(b), a.checked_add(b).is_none())
}
/// Signed 8-bit saturating subtract. Returns `(result, saturated)`.
#[inline]
pub fn sat_sub_i8(a: i8, b: i8) -> (i8, bool) {
    match a.checked_sub(b) {
        Some(diff) => (diff, false),
        // A subtraction can only overflow towards the sign of `a`.
        None if a < 0 => (i8::MIN, true),
        None => (i8::MAX, true),
    }
}
/// Unsigned 16-bit saturating add. Returns `(result, saturated)`.
#[inline]
pub fn sat_add_u16(a: u16, b: u16) -> (u16, bool) {
    let (wrapped, carried) = a.overflowing_add(b);
    let result = if carried { u16::MAX } else { wrapped };
    (result, carried)
}
/// Unsigned 16-bit saturating subtract (floors at 0). Returns `(result, saturated)`.
#[inline]
pub fn sat_sub_u16(a: u16, b: u16) -> (u16, bool) {
    a.checked_sub(b).map_or((0, true), |diff| (diff, false))
}
/// Signed 16-bit saturating add. Returns `(result, saturated)`.
#[inline]
pub fn sat_add_i16(a: i16, b: i16) -> (i16, bool) {
    match a.checked_add(b) {
        Some(sum) => (sum, false),
        // Overflow needs both operands to share a sign, so `a` picks the bound.
        None if a < 0 => (i16::MIN, true),
        None => (i16::MAX, true),
    }
}
/// Signed 16-bit saturating subtract. Returns `(result, saturated)`.
#[inline]
pub fn sat_sub_i16(a: i16, b: i16) -> (i16, bool) {
    (a.saturating_sub(b), a.checked_sub(b).is_none())
}
/// Unsigned 32-bit saturating add. Returns `(result, saturated)`.
#[inline]
pub fn sat_add_u32(a: u32, b: u32) -> (u32, bool) {
    (a.saturating_add(b), a.checked_add(b).is_none())
}
/// Unsigned 32-bit saturating subtract (floors at 0). Returns `(result, saturated)`.
#[inline]
pub fn sat_sub_u32(a: u32, b: u32) -> (u32, bool) {
    let (wrapped, borrowed) = a.overflowing_sub(b);
    let result = if borrowed { 0 } else { wrapped };
    (result, borrowed)
}
/// Signed 32-bit saturating add. Returns `(result, saturated)`.
#[inline]
pub fn sat_add_i32(a: i32, b: i32) -> (i32, bool) {
    let (wrapped, overflowed) = a.overflowing_add(b);
    if !overflowed {
        (wrapped, false)
    } else if b < 0 {
        // Adding a negative value can only overflow downwards.
        (i32::MIN, true)
    } else {
        (i32::MAX, true)
    }
}
/// Signed 32-bit saturating subtract. Returns `(result, saturated)`.
#[inline]
pub fn sat_sub_i32(a: i32, b: i32) -> (i32, bool) {
    (a.saturating_sub(b), a.checked_sub(b).is_none())
}
// Pack-with-saturation helpers — clamp a wider integer to the narrower type.
/// Narrows an `i16` to `i8`, clamping to the `i8` range. Returns `(result, saturated)`.
#[inline]
pub fn sat_i16_to_i8(v: i16) -> (i8, bool) {
    let clamped = v.clamp(i16::from(i8::MIN), i16::from(i8::MAX));
    (clamped as i8, clamped != v)
}
/// Narrows an `i16` to `u8`, clamping to `0..=255`. Returns `(result, saturated)`.
#[inline]
pub fn sat_i16_to_u8(v: i16) -> (u8, bool) {
    match v {
        0..=0xFF => (v as u8, false),
        negative if negative < 0 => (0, true),
        _ => (u8::MAX, true),
    }
}
/// Narrows a `u16` to `u8`, capping at 255. Returns `(result, saturated)`.
#[inline]
pub fn sat_u16_to_u8(v: u16) -> (u8, bool) {
    match v {
        0..=0xFF => (v as u8, false),
        _ => (u8::MAX, true),
    }
}
/// Narrows an `i32` to `i16`, clamping to the `i16` range. Returns `(result, saturated)`.
#[inline]
pub fn sat_i32_to_i16(v: i32) -> (i16, bool) {
    let clamped = v.clamp(i32::from(i16::MIN), i32::from(i16::MAX));
    (clamped as i16, clamped != v)
}
/// Narrows an `i32` to `u16`, clamping to `0..=65535`. Returns `(result, saturated)`.
#[inline]
pub fn sat_i32_to_u16(v: i32) -> (u16, bool) {
    match v {
        i32::MIN..=-1 => (0, true),
        0..=0xFFFF => (v as u16, false),
        _ => (u16::MAX, true),
    }
}
/// Narrows a `u32` to `u16`, capping at 65535. Returns `(result, saturated)`.
#[inline]
pub fn sat_u32_to_u16(v: u32) -> (u16, bool) {
    let capped = v.min(u32::from(u16::MAX));
    (capped as u16, capped != v)
}
/// Narrows an `i64` to `i32`, clamping to the `i32` range. Returns `(result, saturated)`.
#[inline]
pub fn sat_i64_to_i32(v: i64) -> (i32, bool) {
    let clamped = v.clamp(i64::from(i32::MIN), i64::from(i32::MAX));
    (clamped as i32, clamped != v)
}
/// Narrows an `i64` to `u32`, clamping to `0..=u32::MAX`. Returns `(result, saturated)`.
#[inline]
pub fn sat_i64_to_u32(v: i64) -> (u32, bool) {
    let clamped = v.clamp(0, i64::from(u32::MAX));
    (clamped as u32, clamped != v)
}
// ─── Averages ──────────────────────────────────────────────────────────────
// PPC avg is rounded up: (a + b + 1) / 2.
/// Rounded-up average of two unsigned bytes: `(a + b + 1) / 2`, computed
/// in a wider type so the sum cannot overflow.
#[inline]
pub fn avg_u8(a: u8, b: u8) -> u8 {
    let total = u16::from(a) + u16::from(b) + 1;
    (total / 2) as u8
}
/// Rounded-up average of two unsigned halfwords: `(a + b + 1) / 2`.
#[inline]
pub fn avg_u16(a: u16, b: u16) -> u16 {
    let total = u32::from(a) + u32::from(b) + 1;
    (total / 2) as u16
}
/// Rounded-up average of two unsigned words: `(a + b + 1) / 2`.
#[inline]
pub fn avg_u32(a: u32, b: u32) -> u32 {
    let total = u64::from(a) + u64::from(b) + 1;
    (total / 2) as u32
}
/// Rounded-up average of two signed bytes: `floor((a + b + 1) / 2)`.
#[inline]
pub fn avg_i8(a: i8, b: i8) -> i8 {
    let total = i32::from(a) + i32::from(b) + 1;
    // Euclidean division by 2 floors, matching an arithmetic right shift.
    total.div_euclid(2) as i8
}
/// Rounded-up average of two signed halfwords: `floor((a + b + 1) / 2)`.
#[inline]
pub fn avg_i16(a: i16, b: i16) -> i16 {
    let total = i32::from(a) + i32::from(b) + 1;
    total.div_euclid(2) as i16
}
/// Rounded-up average of two signed words: `floor((a + b + 1) / 2)`.
#[inline]
pub fn avg_i32(a: i32, b: i32) -> i32 {
    let total = i64::from(a) + i64::from(b) + 1;
    total.div_euclid(2) as i32
}
// ─── NaN-aware f32 min/max for vmaxfp / vminfp ────────────────────────────
//
// Altivec PEM: "If either element of vA or vB is a NaN, the corresponding
// element of vD is set to the quiet NaN form of that NaN". Rust's `>` / `<`
// comparison with NaN always returns false, so `if a > b { a } else { b }`
// would silently pick `b` whenever `a` is NaN — losing NaN propagation.
/// Lane maximum for `vmaxfp`.
///
/// * If either operand is a NaN the result is that NaN in quiet form
///   (`a` is checked first).
/// * The maximum of `+0.0` and `-0.0` is `+0.0` regardless of operand order.
///   A plain `a > b` test treats the two zeros as equal and would return
///   whichever happens to be `b`.
#[inline]
pub fn max_nan(a: f32, b: f32) -> f32 {
    if a.is_nan() {
        return quiet_nan(a);
    }
    if b.is_nan() {
        return quiet_nan(b);
    }
    if a == b {
        // Equal non-NaN values have identical bits unless they are ±0.
        // AND-ing the patterns yields +0 unless both operands are -0.
        return f32::from_bits(a.to_bits() & b.to_bits());
    }
    if a > b { a } else { b }
}
/// Lane minimum for `vminfp`.
///
/// * If either operand is a NaN the result is that NaN in quiet form
///   (`a` is checked first).
/// * The minimum of `+0.0` and `-0.0` is `-0.0` regardless of operand order.
///   A plain `a < b` test treats the two zeros as equal and would return
///   whichever happens to be `b`.
#[inline]
pub fn min_nan(a: f32, b: f32) -> f32 {
    if a.is_nan() {
        return quiet_nan(a);
    }
    if b.is_nan() {
        return quiet_nan(b);
    }
    if a == b {
        // Equal non-NaN values have identical bits unless they are ±0.
        // OR-ing the patterns yields -0 as soon as either operand is -0.
        return f32::from_bits(a.to_bits() | b.to_bits());
    }
    if a < b { a } else { b }
}
/// Returns the quiet form of a NaN by setting the most significant mantissa
/// bit. Quiet NaNs and all non-NaN values pass through unchanged.
#[inline]
pub fn quiet_nan(x: f32) -> f32 {
    const QUIET_BIT: u32 = 0x0040_0000;
    if x.is_nan() {
        f32::from_bits(x.to_bits() | QUIET_BIT)
    } else {
        x
    }
}
/// Replaces a subnormal `f32` with a zero of the same sign; every other value
/// is returned as-is. Used by the vmaddfp family, vctsxs / vctuxs, and any
/// instruction whose AltiVec definition specifies input-side denormal
/// flushing regardless of VSCR[NJ].
#[inline]
pub fn flush_denorm(x: f32) -> f32 {
    if !x.is_subnormal() {
        return x;
    }
    0.0_f32.copysign(x)
}
// ─── Float ⇄ fixed-point conversions (scaled by 2^scale_bits) ─────────────
//
// vctsxs / vctuxs flush denormal inputs to 0 before scaling, per Altivec.
/// Converts `x * 2^scale_bits` to a signed 32-bit integer (round toward zero)
/// with saturation, as used by `vctsxs`.
///
/// Returns `(value, saturated)`. Denormal inputs are flushed to zero before
/// scaling. The flag is set only when the scaled value lies outside
/// `[-2^31, 2^31 - 1]`; an input of exactly `-2^31` is representable and is
/// returned without flagging saturation.
#[inline]
pub fn cvt_f32_to_i32_sat(x: f32, scale_bits: u32) -> (i32, bool) {
    // PPCBUG-433: AltiVec ISA saturates NaN to INT_MIN (0x80000000), not 0.
    // (vctuxs's NaN→0 is correct per AltiVec ISA — see PPCBUG-434.)
    if x.is_nan() {
        return (i32::MIN, true);
    }
    // f64 holds the scaled product exactly for scale_bits in 0..=31.
    let scaled = f64::from(flush_denorm(x)) * ((1u64 << scale_bits) as f64);
    if scaled > f64::from(i32::MAX) {
        return (i32::MAX, true);
    }
    // Strictly below INT_MIN: -2^31 itself fits and must not set SAT.
    if scaled < f64::from(i32::MIN) {
        return (i32::MIN, true);
    }
    (scaled.trunc() as i32, false)
}
/// Converts `x * 2^scale_bits` to an unsigned 32-bit integer (round toward
/// zero) with saturation, as used by `vctuxs`.
///
/// Returns `(value, saturated)`. NaN and any negative scaled value give
/// `(0, true)`; values above `u32::MAX` give `(u32::MAX, true)`. Denormal
/// inputs are flushed to zero before scaling.
#[inline]
pub fn cvt_f32_to_u32_sat(x: f32, scale_bits: u32) -> (u32, bool) {
    if x.is_nan() {
        return (0, true);
    }
    let factor = (1u64 << scale_bits) as f64;
    match f64::from(flush_denorm(x)) * factor {
        product if product < 0.0 => (0, true),
        product if product > f64::from(u32::MAX) => (u32::MAX, true),
        product => (product.trunc() as u32, false),
    }
}
/// Converts a signed fixed-point value with `scale_bits` fractional bits to
/// `f32`, i.e. `v / 2^scale_bits`. The division is done in `f64`, so the only
/// rounding is the final narrowing to `f32`.
#[inline]
pub fn cvt_i32_to_f32(v: i32, scale_bits: u32) -> f32 {
    let divisor = (1u64 << scale_bits) as f64;
    (f64::from(v) / divisor) as f32
}
/// Converts an unsigned fixed-point value with `scale_bits` fractional bits
/// to `f32`, i.e. `v / 2^scale_bits`. The division is done in `f64`, so the
/// only rounding is the final narrowing to `f32`.
#[inline]
pub fn cvt_u32_to_f32(v: u32, scale_bits: u32) -> f32 {
    let divisor = (1u64 << scale_bits) as f64;
    (f64::from(v) / divisor) as f32
}
// ─── Unaligned vector load/store ──────────────────────────────────────────
//
// lvlx/lvrx and stvlx/stvrx combine to perform any unaligned 16-byte access:
// lvlx(EA) | lvrx(EA + 16) loads 16 bytes starting at unaligned EA.
// stvlx(EA); stvrx(EA + 16) stores 16 bytes starting at unaligned EA.
//
// Semantics per the AltiVec manual (and xenia-canary ppc_emit_memory.cc):
// lvlx: shift = EA & 0xF, n = 16 - shift. Loads mem[EA..EA+n] into
// lanes VR[0..n], zeros VR[n..16].
// lvrx: shift = EA & 0xF. If shift == 0, VR = 0. Otherwise loads
// mem[EA-shift..EA] into lanes VR[16-shift..16], zeros VR[0..16-shift].
// stvlx / stvrx are the symmetric stores.
//
// `Vec128::bytes[0]` is the most significant byte (PPC lane 0 in BE view).
/// `lvlx`: reads the bytes from `ea` up to the end of its 16-byte line into
/// the leading lanes of the result; the remaining lanes are zero.
pub fn load_vector_left(mem: &dyn MemoryAccess, ea: u32) -> Vec128 {
    let count = 16 - (ea & 0xF) as usize;
    let mut lanes = [0u8; 16];
    for (offset, lane) in lanes.iter_mut().take(count).enumerate() {
        *lane = mem.read_u8(ea.wrapping_add(offset as u32));
    }
    Vec128::from_bytes(lanes)
}
/// `lvrx`: reads the bytes from the start of `ea`'s 16-byte line up to (not
/// including) `ea` into the trailing lanes of the result; the leading lanes
/// are zero. An aligned `ea` reads nothing and yields the zero vector.
pub fn load_vector_right(mem: &dyn MemoryAccess, ea: u32) -> Vec128 {
    let count = (ea & 0xF) as usize;
    if count == 0 {
        return Vec128::ZERO;
    }
    let line_start = ea & !0xFu32;
    let mut lanes = [0u8; 16];
    for (offset, lane) in lanes[16 - count..].iter_mut().enumerate() {
        *lane = mem.read_u8(line_start.wrapping_add(offset as u32));
    }
    Vec128::from_bytes(lanes)
}
/// `stvlx`: writes the leading lanes of `v` to memory, starting at `ea` and
/// stopping at the end of `ea`'s 16-byte line.
pub fn store_vector_left(mem: &dyn MemoryAccess, ea: u32, v: Vec128) {
    let count = 16 - (ea & 0xF) as usize;
    let lanes = v.as_bytes();
    for (offset, &byte) in lanes.iter().take(count).enumerate() {
        mem.write_u8(ea.wrapping_add(offset as u32), byte);
    }
}
/// `stvrx`: writes the trailing lanes of `v` to memory, covering the bytes
/// from the start of `ea`'s 16-byte line up to (not including) `ea`. An
/// aligned `ea` writes nothing.
pub fn store_vector_right(mem: &dyn MemoryAccess, ea: u32, v: Vec128) {
    let count = (ea & 0xF) as usize;
    if count == 0 {
        return;
    }
    let line_start = ea & !0xFu32;
    let lanes = v.as_bytes();
    for (offset, &byte) in lanes[16 - count..].iter().enumerate() {
        mem.write_u8(line_start.wrapping_add(offset as u32), byte);
    }
}
// ─── 1-5-5-5 pixel pack (vpkpx / vupkhpx / vupklpx) ───────────────────────
// PPC vpkpx takes a 32-bit pixel lane and packs it into a 16-bit 1-5-5-5 pixel.
// vupkhpx / vupklpx reverse the operation.
//
// Input 32-bit word, in PowerPC bit numbering (bit 0 = MOST significant bit):
//   bits 0-6:   ignored
//   bit 7:      alpha-select (→ bit 15 of output); value mask 0x0100_0000
//   bits 8-12:  R (top 5 bits of the second byte)
//   bits 16-20: G (top 5 bits of the third byte)
//   bits 24-28: B (top 5 bits of the fourth byte)
// Output 16-bit word, in conventional numbering (bit 0 = least significant):
//   bit 15:     A (from input bit 7)
//   bits 10-14: R
//   bits 5-9:   G
//   bits 0-4:   B

/// Packs one 32-bit pixel into a 16-bit 1-5-5-5 pixel (`vpkpx`).
///
/// `input` is taken as the numeric value of the word, most significant byte
/// first. The field positions in the AltiVec description are MSB-0 numbered,
/// so they are translated to right-shift amounts here: alpha is value bit 24
/// and the colour channels are the top five bits of the three low bytes.
#[inline]
pub fn pack_pixel_555(input: u32) -> u16 {
    let a = (input >> 24) & 0x1;
    let r = (input >> 19) & 0x1F;
    let g = (input >> 11) & 0x1F;
    let b = (input >> 3) & 0x1F;
    ((a << 15) | (r << 10) | (g << 5) | b) as u16
}
/// Unpacks a 16-bit 1-5-5-5 pixel into a 32-bit A,R,G,B word
/// (`vupkhpx` / `vupklpx`).
///
/// The alpha bit is sign-extended to a full byte (0x00 or 0xFF). Each 5-bit
/// colour channel is zero-extended to a byte, so channel bytes stay in
/// `0..=31`; they are not rescaled to `0..=255`.
#[inline]
pub fn unpack_pixel_555(input: u16) -> u32 {
    let px = u32::from(input);
    let a8: u32 = if px & 0x8000 != 0 { 0xFF } else { 0x00 };
    let r = (px >> 10) & 0x1F;
    let g = (px >> 5) & 0x1F;
    let b = px & 0x1F;
    (a8 << 24) | (r << 16) | (g << 8) | b
}
// ─── VMX128 D3D pack/unpack dispatch ──────────────────────────────────────
// `vpkd3d128` / `vupkd3d128` encode a small enum in the instruction word
// (VX128_4 immediate field). The exact enum lives in canary's
// ppc_emit_altivec.cc under PACK_TYPE_*; titles usually touch D3DCOLOR
// (type 0) and a handful of texture-coordinate variants.
//
// Rather than risk getting a rarely-used sub-case wrong, we implement the
// common types and fall back to a warning + pass-through for unknown types.
// Returning the VB register value unchanged is always preferable to emitting
// StepResult::Unimplemented because it keeps the interpreter running.
/// Pack-type encoding of `vpkd3d128` / `vupkd3d128`.
///
/// The immediate field lives at PPC bits 16-22 (VX128_3/4 IMM, 7 bits).
/// Canary decodes `type = IMM >> 2` (top 5 bits) and `pack = IMM & 0x3`
/// (low 2 bits, used only by `vpkd3d128` to select output-slot layout).
/// Valid `type` values are 0..=6 per `ppc_emit_altivec.cc:2095-2118`:
///
/// | id | canary name        | format                                              |
/// |----|--------------------|-----------------------------------------------------|
/// | 0  | VPACK_D3DCOLOR     | 4 f32 [0,1] ↔ ARGB8                                 |
/// | 1  | VPACK_NORMSHORT2   | 2 f32 [-1,1] ↔ 2× signed-normalized i16             |
/// | 2  | VPACK_NORMPACKED32 | 4 f32 [-1,1] ↔ UINT_2101010 (w:2,z:10,y:10,x:10)    |
/// | 3  | VPACK_FLOAT16_2    | 2 f32 ↔ 2× fp16                                     |
/// | 4  | VPACK_NORMSHORT4   | 4 f32 [-1,1] ↔ 4× signed-normalized i16             |
/// | 5  | VPACK_FLOAT16_4    | 4 f32 ↔ 4× fp16                                     |
/// | 6  | VPACK_NORMPACKED64 | 4 f32 [-1,1] ↔ ULONG_4202020 (w:4,z:20,y:20,x:20)   |
///
/// Any other value is carried through as [`D3dPackType::Other`].
///
/// History: before M3 this enum listed made-up "Normal16"/"Normal8"/"UByteN4"
/// variants that didn't match canary, and the immediate extraction was also
/// wrong (LSB-numbered `>>6 & 0x7` instead of MSB-numbered `>>11 & 0x1F`
/// against a 7-bit IMM field). M3 fixed both.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum D3dPackType {
    /// Type 0.
    D3dColor,
    /// Type 1.
    NormShort2,
    /// Type 2.
    NormPacked32,
    /// Type 3.
    Float16_2,
    /// Type 4.
    NormShort4,
    /// Type 5.
    Float16_4,
    /// Type 6.
    NormPacked64,
    /// Any type value outside 0..=6, kept verbatim.
    Other(u32),
}

impl D3dPackType {
    /// Decode the `type` bits extracted from the VX128_3/4 IMM field via
    /// canary's `IMM >> 2` convention (i.e. the caller has already divided
    /// out the 2-bit `pack` subfield).
    pub fn from_immediate(type_bits: u32) -> Self {
        match type_bits {
            0 => Self::D3dColor,
            1 => Self::NormShort2,
            2 => Self::NormPacked32,
            3 => Self::Float16_2,
            4 => Self::NormShort4,
            5 => Self::Float16_4,
            6 => Self::NormPacked64,
            other => Self::Other(other),
        }
    }
}
/// Packs an f32x4 `[R, G, B, A]` vector (each channel clamped to
/// `[0.0, 1.0]`) into one D3DCOLOR word placed in lane 3 of the result;
/// lanes 0..=2 are zero.
pub fn pack_d3dcolor(v: Vec128) -> Vec128 {
    /// Clamp, scale to 0..=255 and round to the nearest integer.
    fn channel(x: f32) -> u32 {
        let scaled = x.clamp(0.0, 1.0) * 255.0;
        ((scaled + 0.5) as u32) & 0xFF
    }

    let lanes = v.as_f32x4();
    let (r, g, b, a) = (
        channel(lanes[0]),
        channel(lanes[1]),
        channel(lanes[2]),
        channel(lanes[3]),
    );
    // D3DCOLOR is A,R,G,B from the most significant byte down.
    Vec128::from_u32x4(0, 0, 0, (a << 24) | (r << 16) | (g << 8) | b)
}
/// Expands the D3DCOLOR word in lane 3 of `v` into an f32x4 `[R, G, B, A]`
/// with each channel scaled to `byte / 255.0`.
pub fn unpack_d3dcolor(v: Vec128) -> Vec128 {
    let word = v.u32x4(3);
    // Extract the byte that starts `shift` bits up and normalise it.
    let channel = |shift: u32| ((word >> shift) & 0xFF) as f32 / 255.0;
    Vec128::from_f32x4(channel(16), channel(8), channel(0), channel(24))
}
// ───────────────────────────────────────────────────────────────────────
// First-Pixels M3 — pack/unpack for the remaining canary pack types.
//
// Conventions shared across all helpers:
// * The packed input to `unpack_*` lives in the *source* lane position
// canary's HIR assumes: canonically the 32-bit word is in lane 3 and
// the 64-bit value straddles lanes 2-3. We match that so the existing
// D3DCOLOR helpers' lane-3 convention is preserved across the whole
// pack-type family.
// * Output-from-`pack_*` sits in the same lane(s). The caller usually
// follows with a permute to move it elsewhere (the VX128_4 `pack`
// subfield controls that in `vpkd3d128`).
// * Range semantics match canary: signed normalized types scale by
// `max` = (1 << (N - 1)) - 1 and clamp before rounding.
// ───────────────────────────────────────────────────────────────────────
/// Quantizes a float to a signed-normalized 16-bit value: clamp to
/// `[-1, 1]`, scale by 32767 and round half away from zero.
#[inline]
fn norm_to_i16(x: f32) -> i16 {
    let scaled = x.clamp(-1.0, 1.0) * 32767.0;
    // Bias towards the sign of the value, then let the cast truncate.
    let bias = if scaled >= 0.0 { 0.5 } else { -0.5 };
    ((scaled + bias) as i32).clamp(-32768, 32767) as i16
}
/// Expands a signed-normalized 16-bit value to a float: `s / 32767`.
#[inline]
fn i16_to_norm(s: i16) -> f32 {
    f32::from(s) / 32767.0
}
/// **NORMSHORT2** — 2 f32s in [-1, 1] → two 16-bit signed-normalized
/// shorts packed as `(x << 16) | y` in lane 3 (the high 16 bits of the word
/// hold X; the low 16 hold Y). Output lanes 0..=2 are zero-filled.
pub fn pack_normshort2(v: Vec128) -> Vec128 {
    let lanes = v.as_f32x4();
    // Go through u16 so a negative short does not sign-extend into X's half.
    let half = |lane: f32| u32::from(norm_to_i16(lane) as u16);
    let word = (half(lanes[0]) << 16) | half(lanes[1]);
    Vec128::from_u32x4(0, 0, 0, word)
}
/// Inverse of the NORMSHORT2 pack: reads X from the high half and Y from the
/// low half of lane 3 and returns `[x, y, 0.0, 1.0]`.
pub fn unpack_normshort2(v: Vec128) -> Vec128 {
    let packed = v.u32x4(3);
    let hi_half = (packed >> 16) as i16;
    let lo_half = packed as i16;
    Vec128::from_f32x4(i16_to_norm(hi_half), i16_to_norm(lo_half), 0.0, 1.0)
}
/// **NORMSHORT4** — 4 f32s in [-1, 1] → four 16-bit signed-normalized
/// shorts packed across lanes 2-3 (big-endian dword order: X in the
/// high half of lane 2, Y in the low half of lane 2, Z in the high half of
/// lane 3, W in the low half of lane 3). Lanes 0..=1 are zero.
pub fn pack_normshort4(v: Vec128) -> Vec128 {
    let lanes = v.as_f32x4();
    // Go through u16 so a negative short does not sign-extend upwards.
    let half = |lane: f32| u32::from(norm_to_i16(lane) as u16);
    let xy = (half(lanes[0]) << 16) | half(lanes[1]);
    let zw = (half(lanes[2]) << 16) | half(lanes[3]);
    Vec128::from_u32x4(0, 0, xy, zw)
}
/// Inverse of the NORMSHORT4 pack: lane 2 holds X (high half) and Y (low
/// half), lane 3 holds Z (high half) and W (low half).
pub fn unpack_normshort4(v: Vec128) -> Vec128 {
    let xy = v.u32x4(2);
    let zw = v.u32x4(3);
    Vec128::from_f32x4(
        i16_to_norm((xy >> 16) as i16),
        i16_to_norm(xy as i16),
        i16_to_norm((zw >> 16) as i16),
        i16_to_norm(zw as i16),
    )
}
/// **NORMPACKED32** — UINT_2101010 layout, 4 f32s in [-1, 1] packed into
/// 32 bits in lane 3. Per canary's comment `2_10_10_10 w_z_y_x`: the
/// high 2 bits hold W (signed 2-bit, -2..=1), then Z/Y/X each use 10
/// signed-normalized bits. Lanes 0..=2 of the result are zero.
pub fn pack_normpacked32(v: Vec128) -> Vec128 {
    /// Clamp to [-1, 1], multiply by `scale`, round half away from zero and
    /// keep the low `width` bits of the two's-complement result.
    #[inline]
    fn quantize(value: f32, scale: f32, width: u32) -> u32 {
        let scaled = value.clamp(-1.0, 1.0) * scale;
        let bias = if scaled >= 0.0 { 0.5 } else { -0.5 };
        let lowest = -(1i32 << (width - 1));
        let highest = (1i32 << (width - 1)) - 1;
        let rounded = ((scaled + bias) as i32).clamp(lowest, highest);
        (rounded as u32) & ((1u32 << width) - 1)
    }

    let lanes = v.as_f32x4();
    let x = quantize(lanes[0], 511.0, 10);
    let y = quantize(lanes[1], 511.0, 10);
    let z = quantize(lanes[2], 511.0, 10);
    let w = quantize(lanes[3], 1.0, 2);
    Vec128::from_u32x4(0, 0, 0, (w << 30) | (z << 20) | (y << 10) | x)
}
/// Inverse of the NORMPACKED32 pack: splits lane 3 into 10-bit X/Y/Z fields
/// (each divided by 511) and a 2-bit W field (clamped to [-1, 1]).
pub fn unpack_normpacked32(v: Vec128) -> Vec128 {
    /// Sign-extend the low `width` bits of `field`.
    #[inline]
    fn sign_extend(field: u32, width: u32) -> i32 {
        let spare = 32 - width;
        ((field << spare) as i32) >> spare
    }

    let word = v.u32x4(3);
    let x = sign_extend(word, 10) as f32 / 511.0;
    let y = sign_extend(word >> 10, 10) as f32 / 511.0;
    let z = sign_extend(word >> 20, 10) as f32 / 511.0;
    // The 2-bit field can hold -2, which is pinned to -1.
    let w = (sign_extend(word >> 30, 2) as f32).clamp(-1.0, 1.0);
    Vec128::from_f32x4(x, y, z, w)
}
/// **NORMPACKED64** — ULONG_4202020, 4 f32s in [-1, 1] packed into 64
/// bits across lanes 2-3. Per canary's comment `4_20_20_20 w_z_y_x`:
/// the high 4 bits of the dword hold W (signed 4-bit); the remaining 60
/// bits hold 3× 20-bit signed-normalized Z/Y/X. Rare outside very few
/// titles (canary notes 54540829).
pub fn pack_normpacked64(v: Vec128) -> Vec128 {
    /// Clamp to [-1, 1], multiply by `scale`, round half away from zero and
    /// keep the low `width` bits of the two's-complement result.
    #[inline]
    fn quantize(value: f32, scale: f32, width: u32) -> u64 {
        let scaled = value.clamp(-1.0, 1.0) * scale;
        let bias = if scaled >= 0.0 { 0.5 } else { -0.5 };
        let lowest = -(1i64 << (width - 1));
        let highest = (1i64 << (width - 1)) - 1;
        let rounded = ((scaled + bias) as i64).clamp(lowest, highest);
        (rounded as u64) & ((1u64 << width) - 1)
    }

    let lanes = v.as_f32x4();
    let x = quantize(lanes[0], 524287.0, 20); // 2^19 - 1
    let y = quantize(lanes[1], 524287.0, 20);
    let z = quantize(lanes[2], 524287.0, 20);
    let w = quantize(lanes[3], 7.0, 4);
    let dword: u64 = (w << 60) | (z << 40) | (y << 20) | x;
    // Lane 2 takes the high word, lane 3 the low word.
    Vec128::from_u32x4(0, 0, (dword >> 32) as u32, dword as u32)
}
/// Inverse of the NORMPACKED64 pack: joins lanes 2 (high) and 3 (low) into a
/// 64-bit value and splits it into 20-bit X/Y/Z fields (each divided by
/// 524287) and a 4-bit W field (divided by 7).
pub fn unpack_normpacked64(v: Vec128) -> Vec128 {
    /// Sign-extend the low `width` bits of `field`.
    #[inline]
    fn sign_extend(field: u64, width: u32) -> i64 {
        let spare = 64 - width;
        ((field << spare) as i64) >> spare
    }

    let dword = (u64::from(v.u32x4(2)) << 32) | u64::from(v.u32x4(3));
    let x = sign_extend(dword, 20) as f32 / 524287.0;
    let y = sign_extend(dword >> 20, 20) as f32 / 524287.0;
    let z = sign_extend(dword >> 40, 20) as f32 / 524287.0;
    let w = sign_extend(dword >> 60, 4) as f32 / 7.0;
    Vec128::from_f32x4(x, y, z, w)
}
/// IEEE 754 half-precision float pack/unpack — used by both FLOAT16_2
/// and FLOAT16_4. No FMA quirks involved; we go via `f32::to_bits` and
/// manual bit-twiddling (the stable-Rust `f16` type isn't available
/// yet).
///
/// Converts `f` to half-precision bits. Excess mantissa bits are dropped
/// (truncation), values too large for a half become infinity, values too
/// small for a half subnormal become signed zero, and every NaN becomes a
/// quiet NaN with mantissa 0x200.
#[inline]
fn f32_to_f16_bits(f: f32) -> u16 {
    const HALF_INF: u16 = 0x1F << 10;

    let raw = f.to_bits();
    let sign_bit = (((raw >> 31) & 0x1) as u16) << 15;
    let biased_exp = ((raw >> 23) & 0xFF) as i32;
    let fraction = raw & 0x7FFFFF;

    // Infinity keeps a zero mantissa; NaN collapses to one quiet pattern.
    if biased_exp == 0xFF {
        let payload: u16 = if fraction == 0 { 0 } else { 0x200 };
        return sign_bit | HALF_INF | payload;
    }

    match biased_exp - 127 {
        // Too large for a half: overflow to infinity.
        e if e >= 16 => sign_bit | HALF_INF,
        // Below the smallest half subnormal: signed zero.
        e if e < -24 => sign_bit,
        // Half subnormal: shift the 24-bit significand (implicit 1 restored)
        // down by 14..=23 places.
        e if e <= -15 => {
            let significand = 0x800000 | fraction;
            let drop = (-1 - e) as u32;
            sign_bit | (significand >> drop) as u16
        }
        // Normal half: rebias the exponent and keep the top 10 mantissa bits.
        e => {
            let half_exp = ((e + 15) as u16) & 0x1F;
            sign_bit | (half_exp << 10) | (fraction >> 13) as u16
        }
    }
}
/// Expands half-precision bits to an `f32`. Infinities, signed zeros,
/// subnormals and normal values convert exactly; every NaN becomes a quiet
/// NaN with mantissa 0x400000 (sign preserved).
#[inline]
fn f16_bits_to_f32(h: u16) -> f32 {
    let sign_bit = u32::from(h >> 15) << 31;
    let exponent = u32::from((h >> 10) & 0x1F);
    let fraction = u32::from(h & 0x3FF);

    let magnitude = match (exponent, fraction) {
        // Infinity.
        (0x1F, 0) => 0xFF << 23,
        // NaN: payload is not preserved.
        (0x1F, _) => (0xFF << 23) | 0x400000,
        // Signed zero.
        (0, 0) => 0,
        // Subnormal: shift the leading 1 up to bit 10 (the implicit-one
        // position) and lower the exponent by the same number of steps.
        (0, frac) => {
            let steps = frac.leading_zeros() - 21;
            let f32_exp = 113 - steps; // (-14 - steps) + 127
            (f32_exp << 23) | (((frac << steps) & 0x3FF) << 13)
        }
        // Normal: rebias from 15 to 127.
        (exp, frac) => ((exp + 112) << 23) | (frac << 13),
    };
    f32::from_bits(sign_bit | magnitude)
}
/// **FLOAT16_2** — two 32-bit floats → two half-floats packed into one
/// 32-bit word (X in high 16 bits of lane 3, Y in low 16). Lanes 0..=2 of
/// the result are zero.
pub fn pack_float16_2(v: Vec128) -> Vec128 {
    let lanes = v.as_f32x4();
    let half = |lane: f32| u32::from(f32_to_f16_bits(lane));
    let word = (half(lanes[0]) << 16) | half(lanes[1]);
    Vec128::from_u32x4(0, 0, 0, word)
}
/// Inverse of the FLOAT16_2 pack: reads X from the high half and Y from the
/// low half of lane 3 and returns `[x, y, 0.0, 1.0]`.
pub fn unpack_float16_2(v: Vec128) -> Vec128 {
    let packed = v.u32x4(3);
    let hi_half = (packed >> 16) as u16;
    let lo_half = packed as u16;
    Vec128::from_f32x4(f16_bits_to_f32(hi_half), f16_bits_to_f32(lo_half), 0.0, 1.0)
}
/// **FLOAT16_4** — four 32-bit floats → four half-floats packed across
/// 64 bits (lanes 2-3): X and Y share lane 2 (X in the high half), Z and W
/// share lane 3 (Z in the high half). Lanes 0..=1 are zero.
pub fn pack_float16_4(v: Vec128) -> Vec128 {
    let lanes = v.as_f32x4();
    let half = |lane: f32| u32::from(f32_to_f16_bits(lane));
    let xy = (half(lanes[0]) << 16) | half(lanes[1]);
    let zw = (half(lanes[2]) << 16) | half(lanes[3]);
    Vec128::from_u32x4(0, 0, xy, zw)
}
/// Inverse of the FLOAT16_4 pack: lane 2 holds X (high half) and Y (low
/// half), lane 3 holds Z (high half) and W (low half).
pub fn unpack_float16_4(v: Vec128) -> Vec128 {
    let xy = v.u32x4(2);
    let zw = v.u32x4(3);
    Vec128::from_f32x4(
        f16_bits_to_f32((xy >> 16) as u16),
        f16_bits_to_f32(xy as u16),
        f16_bits_to_f32((zw >> 16) as u16),
        f16_bits_to_f32(zw as u16),
    )
}
// ─── CR6 helpers used by integer compares ─────────────────────────────────
// vcmp*. (record-form) updates CR6 in a compressed form:
// CR6 = {all-true, 0, all-false, 0}
// where each bit reflects the per-lane mask across the whole register.
/// Summarises a compare mask for CR6. Returns `(all_true, all_false)`:
/// `all_true` when every byte of `mask` is 0xFF, `all_false` when every byte
/// is 0x00.
#[inline]
pub fn cr6_flags_from_mask(mask: Vec128) -> (bool, bool) {
    let bytes = mask.as_bytes();
    let all_true = bytes.iter().all(|&byte| byte == 0xFF);
    let all_false = bytes.iter().all(|&byte| byte == 0);
    (all_true, all_false)
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::cell::Cell;

    /// Flat, zero-initialised test memory. Bytes are `Cell`s so the `&self`
    /// write methods of `MemoryAccess` can mutate them. Out-of-range
    /// addresses panic via slice indexing.
    struct TestMem {
        data: Box<[Cell<u8>]>,
    }

    impl TestMem {
        fn new(size: usize) -> Self {
            Self { data: (0..size).map(|_| Cell::new(0)).collect() }
        }
    }

    // Multi-byte accesses are big-endian.
    impl MemoryAccess for TestMem {
        fn read_u8(&self, a: u32) -> u8 {
            self.data[a as usize].get()
        }
        fn read_u16(&self, a: u32) -> u16 {
            u16::from_be_bytes([self.data[a as usize].get(), self.data[a as usize + 1].get()])
        }
        fn read_u32(&self, a: u32) -> u32 {
            let a = a as usize;
            u32::from_be_bytes([
                self.data[a].get(), self.data[a + 1].get(),
                self.data[a + 2].get(), self.data[a + 3].get(),
            ])
        }
        fn read_u64(&self, a: u32) -> u64 {
            let a = a as usize;
            u64::from_be_bytes([
                self.data[a].get(), self.data[a + 1].get(),
                self.data[a + 2].get(), self.data[a + 3].get(),
                self.data[a + 4].get(), self.data[a + 5].get(),
                self.data[a + 6].get(), self.data[a + 7].get(),
            ])
        }
        fn write_u8(&self, a: u32, v: u8) {
            self.data[a as usize].set(v);
        }
        fn write_u16(&self, a: u32, v: u16) {
            let b = v.to_be_bytes();
            self.data[a as usize].set(b[0]);
            self.data[a as usize + 1].set(b[1]);
        }
        fn write_u32(&self, a: u32, v: u32) {
            let b = v.to_be_bytes();
            let a = a as usize;
            for (i, byte) in b.iter().enumerate() {
                self.data[a + i].set(*byte);
            }
        }
        fn write_u64(&self, a: u32, v: u64) {
            let b = v.to_be_bytes();
            let a = a as usize;
            for (i, byte) in b.iter().enumerate() {
                self.data[a + i].set(*byte);
            }
        }
        fn translate(&self, _a: u32) -> Option<*const u8> {
            None
        }
        fn translate_mut(&self, _a: u32) -> Option<*mut u8> {
            None
        }
    }

    #[test]
    fn lvlx_lvrx_round_trip() {
        let m = TestMem::new(0x40);
        for i in 0..0x30 {
            m.data[i].set((i as u8).wrapping_add(0x10));
        }
        // Unaligned load from 0x13 should combine lvlx(0x13) | lvrx(0x23).
        let lo = load_vector_left(&m, 0x13);
        let hi = load_vector_right(&m, 0x23);
        let mut combined = [0u8; 16];
        let lob = lo.as_bytes();
        let hib = hi.as_bytes();
        for i in 0..16 {
            combined[i] = lob[i] | hib[i];
        }
        for i in 0..16 {
            assert_eq!(combined[i], m.data[0x13 + i].get(), "lane {}", i);
        }
    }

    #[test]
    fn lvlx_aligned_is_full_load() {
        let m = TestMem::new(0x20);
        for i in 0..0x20 {
            m.data[i].set(i as u8);
        }
        let v = load_vector_left(&m, 0x10);
        let b = v.as_bytes();
        for i in 0..16 {
            assert_eq!(b[i], 0x10 + i as u8);
        }
    }

    #[test]
    fn lvrx_aligned_is_zero() {
        let m = TestMem::new(0x20);
        let v = load_vector_right(&m, 0x10);
        assert_eq!(v.as_bytes(), [0u8; 16]);
    }

    #[test]
    fn sat_add_signed_overflow() {
        assert_eq!(sat_add_i8(120, 10), (127, true));
        assert_eq!(sat_add_i8(-120, -10), (-128, true));
        assert_eq!(sat_add_i8(1, 2), (3, false));
    }

    #[test]
    fn sat_sub_unsigned_underflow() {
        assert_eq!(sat_sub_u8(5, 10), (0, true));
        assert_eq!(sat_sub_u8(10, 5), (5, false));
    }

    #[test]
    fn pack_unpack_pixel_555() {
        // vpkpx takes the alpha flag from PowerPC bit 7 of the word (MSB-0
        // numbering), i.e. the least significant bit of the top byte:
        // mask 0x0100_0000. Setting the word's top bit (0x8000_0000) instead
        // would not exercise the alpha path.
        let encoded = pack_pixel_555(0x01_F8_F8_F8);
        assert_eq!(encoded & 0x8000, 0x8000);
        let w = unpack_pixel_555(0x8000 | (0x1F << 10) | (0x1F << 5) | 0x1F);
        assert_eq!(w & 0xFF000000, 0xFF000000);
    }

    // ─── First-Pixels M3 pack/unpack roundtrip tests ───

    /// Quantization error tolerance for N-bit signed normalized values.
    /// `1.0 / ((1 << (bits - 1)) - 1)` is the step size.
    fn tol_normalized(bits: u32) -> f32 {
        1.0 / ((1u32 << (bits - 1)) - 1) as f32
    }

    #[test]
    fn normshort2_roundtrip() {
        let v = Vec128::from_f32x4(0.5, -0.75, 0.0, 0.0);
        let packed = pack_normshort2(v);
        let back = unpack_normshort2(packed).as_f32x4();
        let tol = tol_normalized(16);
        assert!((back[0] - 0.5).abs() < tol, "x got {}", back[0]);
        assert!((back[1] - -0.75).abs() < tol, "y got {}", back[1]);
        assert_eq!(back[2], 0.0);
        assert_eq!(back[3], 1.0);
    }

    #[test]
    fn normshort4_roundtrip_extremes() {
        let v = Vec128::from_f32x4(1.0, -1.0, 0.0, 0.25);
        let packed = pack_normshort4(v);
        let back = unpack_normshort4(packed).as_f32x4();
        let tol = tol_normalized(16);
        assert!((back[0] - 1.0).abs() < tol);
        assert!((back[1] - -1.0).abs() < tol);
        assert!((back[2] - 0.0).abs() < tol);
        assert!((back[3] - 0.25).abs() < tol);
    }

    #[test]
    fn normpacked32_roundtrip() {
        let v = Vec128::from_f32x4(0.5, -0.5, 0.9, -1.0);
        let packed = pack_normpacked32(v);
        let back = unpack_normpacked32(packed).as_f32x4();
        let tol10 = tol_normalized(10);
        let tol2 = tol_normalized(2);
        assert!((back[0] - 0.5).abs() < tol10, "x got {}", back[0]);
        assert!((back[1] - -0.5).abs() < tol10, "y got {}", back[1]);
        assert!((back[2] - 0.9).abs() < tol10, "z got {}", back[2]);
        // The 2-bit signed field decodes to -1, 0 or 1 (a raw -2 is clamped
        // to -1), so the check only needs a coarse tolerance.
        assert!((back[3] - -1.0).abs() < 2.0 * tol2, "w got {}", back[3]);
    }

    #[test]
    fn normpacked64_roundtrip() {
        let v = Vec128::from_f32x4(0.5, -0.25, 0.75, 0.5);
        let packed = pack_normpacked64(v);
        let back = unpack_normpacked64(packed).as_f32x4();
        let tol20 = tol_normalized(20);
        let tol4 = tol_normalized(4);
        assert!((back[0] - 0.5).abs() < tol20, "x got {}", back[0]);
        assert!((back[1] - -0.25).abs() < tol20, "y got {}", back[1]);
        assert!((back[2] - 0.75).abs() < tol20, "z got {}", back[2]);
        assert!((back[3] - 0.5).abs() < tol4, "w got {}", back[3]);
    }

    #[test]
    fn float16_2_roundtrip_normals() {
        // Half has ~3 decimal digits of precision. Pick values that
        // survive conversion cleanly: powers of 2 + simple fractions.
        let v = Vec128::from_f32x4(1.0, -2.5, 0.0, 0.0);
        let packed = pack_float16_2(v);
        let back = unpack_float16_2(packed).as_f32x4();
        assert_eq!(back[0], 1.0);
        assert_eq!(back[1], -2.5);
        assert_eq!(back[2], 0.0);
        assert_eq!(back[3], 1.0);
    }

    #[test]
    fn float16_4_roundtrip_normals() {
        let v = Vec128::from_f32x4(0.5, -3.0, 16.0, -0.125);
        let packed = pack_float16_4(v);
        let back = unpack_float16_4(packed).as_f32x4();
        assert_eq!(back[0], 0.5);
        assert_eq!(back[1], -3.0);
        assert_eq!(back[2], 16.0);
        assert_eq!(back[3], -0.125);
    }

    #[test]
    fn float16_handles_zero_and_infinity() {
        // Zero should survive.
        assert_eq!(f16_bits_to_f32(f32_to_f16_bits(0.0)), 0.0);
        assert_eq!(f16_bits_to_f32(f32_to_f16_bits(-0.0)).to_bits(), (-0.0f32).to_bits());
        // +inf.
        let inf_back = f16_bits_to_f32(f32_to_f16_bits(f32::INFINITY));
        assert!(inf_back.is_infinite() && inf_back > 0.0);
        // Overflow → +inf.
        let overflow_back = f16_bits_to_f32(f32_to_f16_bits(65536.0));
        assert!(overflow_back.is_infinite());
    }

    #[test]
    fn pack_type_enum_maps_canary_values() {
        use D3dPackType::*;
        assert!(matches!(D3dPackType::from_immediate(0), D3dColor));
        assert!(matches!(D3dPackType::from_immediate(1), NormShort2));
        assert!(matches!(D3dPackType::from_immediate(2), NormPacked32));
        assert!(matches!(D3dPackType::from_immediate(3), Float16_2));
        assert!(matches!(D3dPackType::from_immediate(4), NormShort4));
        assert!(matches!(D3dPackType::from_immediate(5), Float16_4));
        assert!(matches!(D3dPackType::from_immediate(6), NormPacked64));
        assert!(matches!(D3dPackType::from_immediate(7), Other(7)));
    }
}