xenia-cpu: VMX128, FPSCR, decoder split, scheduler, decode/block caches
Split the monolithic interpreter into cohesive modules: dedicated decoder (decoder.rs) producing 8-byte DecodedInstr; opcode tables (opcode.rs); explicit traps (trap.rs); FPSCR helpers (fpscr.rs); overflow/carry helpers (overflow.rs); a 4 KiB-page-versioned decode cache and basic-block cache (block_cache.rs); and a full VMX/VMX128 implementation (vmx.rs) covering AltiVec + Xenon's 128-bit extensions. Add the parallel-execution substrate behind --parallel: a 7-party phaser (phaser.rs) for round-based barrier sync, ReservationTable (reservation.rs) for guest LL/SC, and the per-HW-thread scheduler core (scheduler.rs) that owns ThreadRefs, runqueues, and pending IRQs. Disassembler is now the single source of truth: disasm.rs gains the full base + extended + VMX128 mnemonic set, with golden JSON fixtures and a disasm_goldens test suite. Add a criterion-style interpreter bench. context.rs grows the per-thread state the new modules need (reservation slot, FPSCR, vector regs). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
918
crates/xenia-cpu/src/vmx.rs
Normal file
918
crates/xenia-cpu/src/vmx.rs
Normal file
@@ -0,0 +1,918 @@
|
||||
//! VMX / AltiVec helper routines shared by the interpreter's 150+ vector
|
||||
//! opcode handlers.
|
||||
//!
|
||||
//! Big-endian lane indexing throughout: `Vec128::bytes[0]` is the most
|
||||
//! significant byte, which corresponds to PowerPC lane 0. Operations that
|
||||
//! care about "even" vs "odd" lanes follow the PPC convention (lane 0 = most
|
||||
//! significant = "even" for multiply-even/odd purposes).
|
||||
|
||||
use xenia_memory::MemoryAccess;
|
||||
use xenia_types::Vec128;
|
||||
|
||||
// ─── Lane accessors ────────────────────────────────────────────────────────
|
||||
|
||||
#[inline] pub fn as_i8x16(v: Vec128) -> [i8; 16] {
|
||||
let b = v.as_bytes();
|
||||
let mut r = [0i8; 16];
|
||||
for i in 0..16 { r[i] = b[i] as i8; }
|
||||
r
|
||||
}
|
||||
|
||||
#[inline] pub fn as_i16x8(v: Vec128) -> [i16; 8] {
|
||||
let u = v.as_u16x8();
|
||||
[u[0] as i16, u[1] as i16, u[2] as i16, u[3] as i16,
|
||||
u[4] as i16, u[5] as i16, u[6] as i16, u[7] as i16]
|
||||
}
|
||||
|
||||
#[inline] pub fn as_i32x4(v: Vec128) -> [i32; 4] {
|
||||
let u = v.as_u32x4();
|
||||
[u[0] as i32, u[1] as i32, u[2] as i32, u[3] as i32]
|
||||
}
|
||||
|
||||
#[inline] pub fn from_i8x16(r: [i8; 16]) -> Vec128 {
|
||||
let mut b = [0u8; 16];
|
||||
for i in 0..16 { b[i] = r[i] as u8; }
|
||||
Vec128::from_bytes(b)
|
||||
}
|
||||
|
||||
#[inline] pub fn from_i16x8(r: [i16; 8]) -> Vec128 {
|
||||
Vec128::from_u16x8_array([
|
||||
r[0] as u16, r[1] as u16, r[2] as u16, r[3] as u16,
|
||||
r[4] as u16, r[5] as u16, r[6] as u16, r[7] as u16,
|
||||
])
|
||||
}
|
||||
|
||||
#[inline] pub fn from_i32x4(r: [i32; 4]) -> Vec128 {
|
||||
Vec128::from_u32x4_array([r[0] as u32, r[1] as u32, r[2] as u32, r[3] as u32])
|
||||
}
|
||||
|
||||
// ─── Saturation helpers ────────────────────────────────────────────────────
|
||||
// Each returns (clamped_value, saturated_flag). Handlers OR the flags together
|
||||
// and call `ctx.set_vscr_sat(true)` once per instruction.
|
||||
|
||||
#[inline] pub fn sat_add_u8(a: u8, b: u8) -> (u8, bool) {
|
||||
let s = a as u16 + b as u16;
|
||||
if s > u8::MAX as u16 { (u8::MAX, true) } else { (s as u8, false) }
|
||||
}
|
||||
#[inline] pub fn sat_sub_u8(a: u8, b: u8) -> (u8, bool) {
|
||||
if a >= b { (a - b, false) } else { (0, true) }
|
||||
}
|
||||
#[inline] pub fn sat_add_i8(a: i8, b: i8) -> (i8, bool) {
|
||||
let s = a as i16 + b as i16;
|
||||
if s > i8::MAX as i16 { (i8::MAX, true) }
|
||||
else if s < i8::MIN as i16 { (i8::MIN, true) }
|
||||
else { (s as i8, false) }
|
||||
}
|
||||
#[inline] pub fn sat_sub_i8(a: i8, b: i8) -> (i8, bool) {
|
||||
let s = a as i16 - b as i16;
|
||||
if s > i8::MAX as i16 { (i8::MAX, true) }
|
||||
else if s < i8::MIN as i16 { (i8::MIN, true) }
|
||||
else { (s as i8, false) }
|
||||
}
|
||||
|
||||
#[inline] pub fn sat_add_u16(a: u16, b: u16) -> (u16, bool) {
|
||||
let s = a as u32 + b as u32;
|
||||
if s > u16::MAX as u32 { (u16::MAX, true) } else { (s as u16, false) }
|
||||
}
|
||||
#[inline] pub fn sat_sub_u16(a: u16, b: u16) -> (u16, bool) {
|
||||
if a >= b { (a - b, false) } else { (0, true) }
|
||||
}
|
||||
#[inline] pub fn sat_add_i16(a: i16, b: i16) -> (i16, bool) {
|
||||
let s = a as i32 + b as i32;
|
||||
if s > i16::MAX as i32 { (i16::MAX, true) }
|
||||
else if s < i16::MIN as i32 { (i16::MIN, true) }
|
||||
else { (s as i16, false) }
|
||||
}
|
||||
#[inline] pub fn sat_sub_i16(a: i16, b: i16) -> (i16, bool) {
|
||||
let s = a as i32 - b as i32;
|
||||
if s > i16::MAX as i32 { (i16::MAX, true) }
|
||||
else if s < i16::MIN as i32 { (i16::MIN, true) }
|
||||
else { (s as i16, false) }
|
||||
}
|
||||
|
||||
#[inline] pub fn sat_add_u32(a: u32, b: u32) -> (u32, bool) {
|
||||
let s = a as u64 + b as u64;
|
||||
if s > u32::MAX as u64 { (u32::MAX, true) } else { (s as u32, false) }
|
||||
}
|
||||
#[inline] pub fn sat_sub_u32(a: u32, b: u32) -> (u32, bool) {
|
||||
if a >= b { (a - b, false) } else { (0, true) }
|
||||
}
|
||||
#[inline] pub fn sat_add_i32(a: i32, b: i32) -> (i32, bool) {
|
||||
let s = a as i64 + b as i64;
|
||||
if s > i32::MAX as i64 { (i32::MAX, true) }
|
||||
else if s < i32::MIN as i64 { (i32::MIN, true) }
|
||||
else { (s as i32, false) }
|
||||
}
|
||||
#[inline] pub fn sat_sub_i32(a: i32, b: i32) -> (i32, bool) {
|
||||
let s = a as i64 - b as i64;
|
||||
if s > i32::MAX as i64 { (i32::MAX, true) }
|
||||
else if s < i32::MIN as i64 { (i32::MIN, true) }
|
||||
else { (s as i32, false) }
|
||||
}
|
||||
|
||||
// Pack-with-saturation helpers — clamp a wider integer to the narrower type.
|
||||
#[inline] pub fn sat_i16_to_i8(v: i16) -> (i8, bool) {
|
||||
if v > i8::MAX as i16 { (i8::MAX, true) }
|
||||
else if v < i8::MIN as i16 { (i8::MIN, true) }
|
||||
else { (v as i8, false) }
|
||||
}
|
||||
#[inline] pub fn sat_i16_to_u8(v: i16) -> (u8, bool) {
|
||||
if v < 0 { (0, true) }
|
||||
else if v > u8::MAX as i16 { (u8::MAX, true) }
|
||||
else { (v as u8, false) }
|
||||
}
|
||||
#[inline] pub fn sat_u16_to_u8(v: u16) -> (u8, bool) {
|
||||
if v > u8::MAX as u16 { (u8::MAX, true) } else { (v as u8, false) }
|
||||
}
|
||||
#[inline] pub fn sat_i32_to_i16(v: i32) -> (i16, bool) {
|
||||
if v > i16::MAX as i32 { (i16::MAX, true) }
|
||||
else if v < i16::MIN as i32 { (i16::MIN, true) }
|
||||
else { (v as i16, false) }
|
||||
}
|
||||
#[inline] pub fn sat_i32_to_u16(v: i32) -> (u16, bool) {
|
||||
if v < 0 { (0, true) }
|
||||
else if v > u16::MAX as i32 { (u16::MAX, true) }
|
||||
else { (v as u16, false) }
|
||||
}
|
||||
#[inline] pub fn sat_u32_to_u16(v: u32) -> (u16, bool) {
|
||||
if v > u16::MAX as u32 { (u16::MAX, true) } else { (v as u16, false) }
|
||||
}
|
||||
#[inline] pub fn sat_i64_to_i32(v: i64) -> (i32, bool) {
|
||||
if v > i32::MAX as i64 { (i32::MAX, true) }
|
||||
else if v < i32::MIN as i64 { (i32::MIN, true) }
|
||||
else { (v as i32, false) }
|
||||
}
|
||||
#[inline] pub fn sat_i64_to_u32(v: i64) -> (u32, bool) {
|
||||
if v < 0 { (0, true) }
|
||||
else if v > u32::MAX as i64 { (u32::MAX, true) }
|
||||
else { (v as u32, false) }
|
||||
}
|
||||
|
||||
// ─── Averages ──────────────────────────────────────────────────────────────
|
||||
// PPC avg is rounded up: (a + b + 1) / 2.
|
||||
#[inline] pub fn avg_u8(a: u8, b: u8) -> u8 {
|
||||
((a as u16 + b as u16 + 1) >> 1) as u8
|
||||
}
|
||||
#[inline] pub fn avg_u16(a: u16, b: u16) -> u16 {
|
||||
((a as u32 + b as u32 + 1) >> 1) as u16
|
||||
}
|
||||
#[inline] pub fn avg_u32(a: u32, b: u32) -> u32 {
|
||||
((a as u64 + b as u64 + 1) >> 1) as u32
|
||||
}
|
||||
#[inline] pub fn avg_i8(a: i8, b: i8) -> i8 {
|
||||
((a as i32 + b as i32 + 1) >> 1) as i8
|
||||
}
|
||||
#[inline] pub fn avg_i16(a: i16, b: i16) -> i16 {
|
||||
((a as i32 + b as i32 + 1) >> 1) as i16
|
||||
}
|
||||
#[inline] pub fn avg_i32(a: i32, b: i32) -> i32 {
|
||||
((a as i64 + b as i64 + 1) >> 1) as i32
|
||||
}
|
||||
|
||||
// ─── NaN-aware f32 min/max for vmaxfp / vminfp ────────────────────────────
|
||||
//
|
||||
// Altivec PEM: "If either element of vA or vB is a NaN, the corresponding
|
||||
// element of vD is set to the quiet NaN form of that NaN". Rust's `>` / `<`
|
||||
// comparison with NaN always returns false, so `if a > b { a } else { b }`
|
||||
// would silently pick `b` whenever `a` is NaN — losing NaN propagation.
|
||||
|
||||
#[inline]
|
||||
pub fn max_nan(a: f32, b: f32) -> f32 {
|
||||
if a.is_nan() { quiet_nan(a) }
|
||||
else if b.is_nan() { quiet_nan(b) }
|
||||
else if a > b { a } else { b }
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn min_nan(a: f32, b: f32) -> f32 {
|
||||
if a.is_nan() { quiet_nan(a) }
|
||||
else if b.is_nan() { quiet_nan(b) }
|
||||
else if a < b { a } else { b }
|
||||
}
|
||||
|
||||
/// Convert an SNaN to QNaN by setting the high mantissa bit. A QNaN is
|
||||
/// returned unchanged.
|
||||
#[inline]
|
||||
pub fn quiet_nan(x: f32) -> f32 {
|
||||
if !x.is_nan() { return x; }
|
||||
f32::from_bits(x.to_bits() | 0x0040_0000)
|
||||
}
|
||||
|
||||
/// Flush a subnormal f32 to ±0 (preserving the sign). Used by vmaddfp family,
|
||||
/// vctsxs / vctuxs, and any instruction whose AltiVec definition specifies
|
||||
/// input-side denormal flushing regardless of VSCR[NJ].
|
||||
#[inline]
|
||||
pub fn flush_denorm(x: f32) -> f32 {
|
||||
if x.is_subnormal() {
|
||||
if x.is_sign_negative() { -0.0 } else { 0.0 }
|
||||
} else {
|
||||
x
|
||||
}
|
||||
}
|
||||
|
||||
// ─── Float ⇄ fixed-point conversions (scaled by 2^scale_bits) ─────────────
|
||||
//
|
||||
// vctsxs / vctuxs flush denormal inputs to 0 before scaling, per Altivec.
|
||||
#[inline] pub fn cvt_f32_to_i32_sat(x: f32, scale_bits: u32) -> (i32, bool) {
|
||||
if x.is_nan() { return (0, true); }
|
||||
let x = flush_denorm(x);
|
||||
let scaled = (x as f64) * ((1u64 << scale_bits) as f64);
|
||||
if scaled >= i32::MAX as f64 { return (i32::MAX, true); }
|
||||
if scaled <= i32::MIN as f64 { return (i32::MIN, true); }
|
||||
(scaled.trunc() as i32, false)
|
||||
}
|
||||
#[inline] pub fn cvt_f32_to_u32_sat(x: f32, scale_bits: u32) -> (u32, bool) {
|
||||
if x.is_nan() { return (0, true); }
|
||||
let x = flush_denorm(x);
|
||||
let scaled = (x as f64) * ((1u64 << scale_bits) as f64);
|
||||
if scaled < 0.0 { return (0, true); }
|
||||
if scaled > u32::MAX as f64 { return (u32::MAX, true); }
|
||||
(scaled.trunc() as u32, false)
|
||||
}
|
||||
#[inline] pub fn cvt_i32_to_f32(v: i32, scale_bits: u32) -> f32 {
|
||||
(v as f64 / (1u64 << scale_bits) as f64) as f32
|
||||
}
|
||||
#[inline] pub fn cvt_u32_to_f32(v: u32, scale_bits: u32) -> f32 {
|
||||
(v as f64 / (1u64 << scale_bits) as f64) as f32
|
||||
}
|
||||
|
||||
// ─── Unaligned vector load/store ──────────────────────────────────────────
|
||||
//
|
||||
// lvlx/lvrx and stvlx/stvrx combine to perform any unaligned 16-byte access:
|
||||
// lvlx(EA) | lvrx(EA + 16) loads 16 bytes starting at unaligned EA.
|
||||
// stvlx(EA); stvrx(EA + 16) stores 16 bytes starting at unaligned EA.
|
||||
//
|
||||
// Semantics per the AltiVec manual (and xenia-canary ppc_emit_memory.cc):
|
||||
// lvlx: shift = EA & 0xF, n = 16 - shift. Loads mem[EA..EA+n] into
|
||||
// lanes VR[0..n], zeros VR[n..16].
|
||||
// lvrx: shift = EA & 0xF. If shift == 0, VR = 0. Otherwise loads
|
||||
// mem[EA-shift..EA] into lanes VR[16-shift..16], zeros VR[0..16-shift].
|
||||
// stvlx / stvrx are the symmetric stores.
|
||||
//
|
||||
// `Vec128::bytes[0]` is the most significant byte (PPC lane 0 in BE view).
|
||||
|
||||
pub fn load_vector_left(mem: &dyn MemoryAccess, ea: u32) -> Vec128 {
|
||||
let shift = (ea & 0xF) as usize;
|
||||
let n = 16 - shift;
|
||||
let mut bytes = [0u8; 16];
|
||||
for i in 0..n {
|
||||
bytes[i] = mem.read_u8(ea.wrapping_add(i as u32));
|
||||
}
|
||||
Vec128::from_bytes(bytes)
|
||||
}
|
||||
|
||||
pub fn load_vector_right(mem: &dyn MemoryAccess, ea: u32) -> Vec128 {
|
||||
let shift = (ea & 0xF) as usize;
|
||||
if shift == 0 { return Vec128::ZERO; }
|
||||
let base = ea & !0xFu32;
|
||||
let mut bytes = [0u8; 16];
|
||||
for i in 0..shift {
|
||||
bytes[16 - shift + i] = mem.read_u8(base.wrapping_add(i as u32));
|
||||
}
|
||||
Vec128::from_bytes(bytes)
|
||||
}
|
||||
|
||||
pub fn store_vector_left(mem: &dyn MemoryAccess, ea: u32, v: Vec128) {
|
||||
let shift = (ea & 0xF) as usize;
|
||||
let n = 16 - shift;
|
||||
let b = v.as_bytes();
|
||||
for i in 0..n {
|
||||
mem.write_u8(ea.wrapping_add(i as u32), b[i]);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn store_vector_right(mem: &dyn MemoryAccess, ea: u32, v: Vec128) {
|
||||
let shift = (ea & 0xF) as usize;
|
||||
if shift == 0 { return; }
|
||||
let base = ea & !0xFu32;
|
||||
let b = v.as_bytes();
|
||||
for i in 0..shift {
|
||||
mem.write_u8(base.wrapping_add(i as u32), b[16 - shift + i]);
|
||||
}
|
||||
}
|
||||
|
||||
// ─── 5-6-5 pixel pack (vpkpx / vupkhpx / vupklpx) ─────────────────────────
|
||||
// PPC vpkpx takes a 32-bit RGB lane and packs it into a 16-bit 1-5-5-5 pixel.
|
||||
// vupkhpx / vupklpx reverse the operation.
|
||||
//
|
||||
// Format: input 32-bit word holds
|
||||
// bits 0-6: unused (0)
|
||||
// bit 7: alpha-select (→ bit 15 of output)
|
||||
// bits 8-15: R (top 5 bits kept)
|
||||
// bits 16-23: G (top 5 bits kept)
|
||||
// bits 24-31: B (top 5 bits kept)
|
||||
// Output 16-bit word:
|
||||
// bit 15: A (from input bit 7)
|
||||
// bits 10-14: R
|
||||
// bits 5-9: G
|
||||
// bits 0-4: B
|
||||
|
||||
#[inline] pub fn pack_pixel_555(input: u32) -> u16 {
|
||||
let a = (input >> 7) & 0x1;
|
||||
let r = (input >> 8) & 0xFF;
|
||||
let g = (input >> 16) & 0xFF;
|
||||
let b = (input >> 24) & 0xFF;
|
||||
((a << 15) | ((r & 0xF8) << 7) | ((g & 0xF8) << 2) | ((b & 0xF8) >> 3)) as u16
|
||||
}
|
||||
|
||||
#[inline] pub fn unpack_pixel_555(input: u16) -> u32 {
|
||||
let input = input as u32;
|
||||
let a = (input >> 15) & 0x1;
|
||||
let r = (input >> 10) & 0x1F;
|
||||
let g = (input >> 5) & 0x1F;
|
||||
let b = input & 0x1F;
|
||||
// Sign-extend A and replicate 5-bit RGB into the top of each byte.
|
||||
let a8 = if a != 0 { 0xFFu32 } else { 0 };
|
||||
let r8 = (r << 3) | (r >> 2);
|
||||
let g8 = (g << 3) | (g >> 2);
|
||||
let b8 = (b << 3) | (b >> 2);
|
||||
(a8 << 24) | (r8 << 16) | (g8 << 8) | b8
|
||||
}
|
||||
|
||||
// ─── VMX128 D3D pack/unpack dispatch ──────────────────────────────────────
|
||||
// `vpkd3d128` / `vupkd3d128` encode a small enum in the instruction word
|
||||
// (VX128_4 immediate field). The exact enum lives in canary's
|
||||
// ppc_emit_altivec.cc under PACK_TYPE_*; titles usually touch D3DCOLOR
|
||||
// (type 0) and a handful of texture-coordinate variants.
|
||||
//
|
||||
// Rather than risk getting a rarely-used sub-case wrong, we implement the
|
||||
// common types and fall back to a warning + pass-through for unknown types.
|
||||
// Returning the VB register value unchanged is always preferable to emitting
|
||||
// StepResult::Unimplemented because it keeps the interpreter running.
|
||||
|
||||
/// Pack-type encoding of `vpkd3d128` / `vupkd3d128`.
|
||||
///
|
||||
/// The immediate field lives at PPC bits 16-22 (VX128_3/4 IMM, 7 bits).
|
||||
/// Canary decodes `type = IMM >> 2` (top 5 bits) and `pack = IMM & 0x3`
|
||||
/// (low 2 bits, used only by `vpkd3d128` to select output-slot layout).
|
||||
/// Valid `type` values are 0..=6 per `ppc_emit_altivec.cc:2095-2118`:
|
||||
///
|
||||
/// | id | canary name | format |
|
||||
/// |----|-------------------|---------------------------------------|
|
||||
/// | 0 | VPACK_D3DCOLOR | 4 f32 [0,1] ↔ ARGB8 |
|
||||
/// | 1 | VPACK_NORMSHORT2 | 2 f32 [-1,1] ↔ 2× signed-normalized i16 |
|
||||
/// | 2 | VPACK_NORMPACKED32| 4 f32 [-1,1] ↔ UINT_2101010 (w:2,z:10,y:10,x:10) |
|
||||
/// | 3 | VPACK_FLOAT16_2 | 2 f32 ↔ 2× fp16 |
|
||||
/// | 4 | VPACK_NORMSHORT4 | 4 f32 [-1,1] ↔ 4× signed-normalized i16 |
|
||||
/// | 5 | VPACK_FLOAT16_4 | 4 f32 ↔ 4× fp16 |
|
||||
/// | 6 | VPACK_NORMPACKED64| 4 f32 [-1,1] ↔ ULONG_4202020 (w:4,z:20,y:20,x:20) |
|
||||
///
|
||||
/// Prior (M3-pre) this enum listed made-up "Normal16"/"Normal8"/"UByteN4"
|
||||
/// variants that didn't match canary; the immediate extraction was also
|
||||
/// wrong (LSB-numbered `>>6 & 0x7` instead of MSB-numbered `>>11 & 0x1F`
|
||||
/// against a 7-bit IMM field). M3 fixes both.
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub enum D3dPackType {
|
||||
D3dColor,
|
||||
NormShort2,
|
||||
NormPacked32,
|
||||
Float16_2,
|
||||
NormShort4,
|
||||
Float16_4,
|
||||
NormPacked64,
|
||||
Other(u32),
|
||||
}
|
||||
|
||||
impl D3dPackType {
|
||||
/// Decode the `type` bits extracted from the VX128_3/4 IMM field via
|
||||
/// canary's `IMM >> 2` convention (i.e. the caller has already divided
|
||||
/// out the 2-bit `pack` subfield).
|
||||
pub fn from_immediate(type_bits: u32) -> Self {
|
||||
match type_bits {
|
||||
0 => Self::D3dColor,
|
||||
1 => Self::NormShort2,
|
||||
2 => Self::NormPacked32,
|
||||
3 => Self::Float16_2,
|
||||
4 => Self::NormShort4,
|
||||
5 => Self::Float16_4,
|
||||
6 => Self::NormPacked64,
|
||||
other => Self::Other(other),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Pack an f32x4 vector of [R, G, B, A] in [0.0, 1.0] into a single D3DCOLOR
|
||||
/// value in lane 3 of the output.
|
||||
pub fn pack_d3dcolor(v: Vec128) -> Vec128 {
|
||||
let f = v.as_f32x4();
|
||||
let to_byte = |x: f32| -> u32 {
|
||||
let c = x.clamp(0.0, 1.0) * 255.0;
|
||||
(c + 0.5) as u32 & 0xFF
|
||||
};
|
||||
// D3DCOLOR is A,R,G,B in that byte order inside a u32.
|
||||
let word = (to_byte(f[3]) << 24) | (to_byte(f[0]) << 16) | (to_byte(f[1]) << 8) | to_byte(f[2]);
|
||||
Vec128::from_u32x4(0, 0, 0, word)
|
||||
}
|
||||
|
||||
/// Unpack a D3DCOLOR value (in lane 3 of the input) into an f32x4 [R, G, B, A].
|
||||
pub fn unpack_d3dcolor(v: Vec128) -> Vec128 {
|
||||
let word = v.u32x4(3);
|
||||
let a = ((word >> 24) & 0xFF) as f32 / 255.0;
|
||||
let r = ((word >> 16) & 0xFF) as f32 / 255.0;
|
||||
let g = ((word >> 8) & 0xFF) as f32 / 255.0;
|
||||
let b = (word & 0xFF) as f32 / 255.0;
|
||||
Vec128::from_f32x4(r, g, b, a)
|
||||
}
|
||||
|
||||
// ───────────────────────────────────────────────────────────────────────
|
||||
// First-Pixels M3 — pack/unpack for the remaining canary pack types.
|
||||
//
|
||||
// Conventions shared across all helpers:
|
||||
// * Input-to-`unpack_*` (packed data) lives in the *source* lane position
|
||||
// canary's HIR assumes: canonically the 32-bit word is in lane 3 and
|
||||
// the 64-bit value straddles lanes 2-3. We match that so the existing
|
||||
// D3DCOLOR helpers' 3-lane convention is preserved across the whole
|
||||
// pack-type family.
|
||||
// * Output-from-`pack_*` sits in the same lane(s). The caller usually
|
||||
// follows with a permute to move it elsewhere (the VX128_4 `pack`
|
||||
// subfield controls that in `vpkd3d128`).
|
||||
// * Range semantics match canary: normalized types use `max` = (1<<N-1)-1
|
||||
// for signed, clamp before rounding.
|
||||
// ───────────────────────────────────────────────────────────────────────
|
||||
|
||||
#[inline]
|
||||
fn norm_to_i16(x: f32) -> i16 {
|
||||
let c = x.clamp(-1.0, 1.0) * 32767.0;
|
||||
// Round half away from zero, matching canary's `vcfsx` semantics.
|
||||
let r = if c >= 0.0 { (c + 0.5) as i32 } else { (c - 0.5) as i32 };
|
||||
r.clamp(-32768, 32767) as i16
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn i16_to_norm(s: i16) -> f32 {
|
||||
(s as f32) / 32767.0
|
||||
}
|
||||
|
||||
/// **NORMSHORT2** — 2 f32s in [-1, 1] → two 16-bit signed-normalized
|
||||
/// shorts packed as `(x << 16) | y` in lane 3 (high 32 bits of the word
|
||||
/// hold X; low 16 hold Y). Output lanes 0..=2 are zero-filled.
|
||||
pub fn pack_normshort2(v: Vec128) -> Vec128 {
|
||||
let f = v.as_f32x4();
|
||||
let x = norm_to_i16(f[0]) as u16 as u32;
|
||||
let y = norm_to_i16(f[1]) as u16 as u32;
|
||||
Vec128::from_u32x4(0, 0, 0, (x << 16) | y)
|
||||
}
|
||||
|
||||
pub fn unpack_normshort2(v: Vec128) -> Vec128 {
|
||||
let word = v.u32x4(3);
|
||||
let x = i16_to_norm((word >> 16) as i16);
|
||||
let y = i16_to_norm(word as i16);
|
||||
Vec128::from_f32x4(x, y, 0.0, 1.0)
|
||||
}
|
||||
|
||||
/// **NORMSHORT4** — 4 f32s in [-1, 1] → four 16-bit signed-normalized
|
||||
/// shorts packed across lanes 2-3 (big-endian dword order: X in the
|
||||
/// high word of lane 2, Y low of lane 2, Z high of lane 3, W low of lane
|
||||
/// 3).
|
||||
pub fn pack_normshort4(v: Vec128) -> Vec128 {
|
||||
let f = v.as_f32x4();
|
||||
let x = norm_to_i16(f[0]) as u16 as u32;
|
||||
let y = norm_to_i16(f[1]) as u16 as u32;
|
||||
let z = norm_to_i16(f[2]) as u16 as u32;
|
||||
let w = norm_to_i16(f[3]) as u16 as u32;
|
||||
Vec128::from_u32x4(0, 0, (x << 16) | y, (z << 16) | w)
|
||||
}
|
||||
|
||||
pub fn unpack_normshort4(v: Vec128) -> Vec128 {
|
||||
let hi = v.u32x4(2);
|
||||
let lo = v.u32x4(3);
|
||||
let x = i16_to_norm((hi >> 16) as i16);
|
||||
let y = i16_to_norm(hi as i16);
|
||||
let z = i16_to_norm((lo >> 16) as i16);
|
||||
let w = i16_to_norm(lo as i16);
|
||||
Vec128::from_f32x4(x, y, z, w)
|
||||
}
|
||||
|
||||
/// **NORMPACKED32** — UINT_2101010 layout, 4 f32s in [-1, 1] packed into
|
||||
/// 32 bits in lane 3. Per canary's comment `2_10_10_10 w_z_y_x`: the
|
||||
/// high 2 bits hold W (signed 2-bit, -2..=1), then Z/Y/X each use 10
|
||||
/// signed-normalized bits.
|
||||
pub fn pack_normpacked32(v: Vec128) -> Vec128 {
|
||||
let f = v.as_f32x4();
|
||||
#[inline]
|
||||
fn n10(x: f32) -> u32 {
|
||||
let c = x.clamp(-1.0, 1.0) * 511.0;
|
||||
let r = if c >= 0.0 { (c + 0.5) as i32 } else { (c - 0.5) as i32 };
|
||||
(r.clamp(-512, 511) as i32 as u32) & 0x3FF
|
||||
}
|
||||
#[inline]
|
||||
fn n2(x: f32) -> u32 {
|
||||
let c = x.clamp(-1.0, 1.0) * 1.0;
|
||||
let r = if c >= 0.0 { (c + 0.5) as i32 } else { (c - 0.5) as i32 };
|
||||
(r.clamp(-2, 1) as i32 as u32) & 0x3
|
||||
}
|
||||
let x = n10(f[0]);
|
||||
let y = n10(f[1]);
|
||||
let z = n10(f[2]);
|
||||
let w = n2(f[3]);
|
||||
let word = (w << 30) | (z << 20) | (y << 10) | x;
|
||||
Vec128::from_u32x4(0, 0, 0, word)
|
||||
}
|
||||
|
||||
pub fn unpack_normpacked32(v: Vec128) -> Vec128 {
|
||||
let word = v.u32x4(3);
|
||||
#[inline]
|
||||
fn u10_to_norm(bits: u32) -> f32 {
|
||||
// Sign-extend the 10-bit field then normalize.
|
||||
let s = ((bits & 0x3FF) as i32) << 22 >> 22;
|
||||
(s as f32) / 511.0
|
||||
}
|
||||
#[inline]
|
||||
fn u2_to_norm(bits: u32) -> f32 {
|
||||
let s = ((bits & 0x3) as i32) << 30 >> 30;
|
||||
(s as f32).clamp(-1.0, 1.0)
|
||||
}
|
||||
let x = u10_to_norm(word);
|
||||
let y = u10_to_norm(word >> 10);
|
||||
let z = u10_to_norm(word >> 20);
|
||||
let w = u2_to_norm(word >> 30);
|
||||
Vec128::from_f32x4(x, y, z, w)
|
||||
}
|
||||
|
||||
/// **NORMPACKED64** — ULONG_4202020, 4 f32s in [-1, 1] packed into 64
|
||||
/// bits across lanes 2-3. Per canary's comment `4_20_20_20 w_z_y_x`:
|
||||
/// the high 4 bits of the dword hold W (signed 4-bit); the remaining 60
|
||||
/// bits hold 3× 20-bit signed-normalized Z/Y/X. Rare outside very few
|
||||
/// titles (canary notes 54540829).
|
||||
pub fn pack_normpacked64(v: Vec128) -> Vec128 {
|
||||
let f = v.as_f32x4();
|
||||
#[inline]
|
||||
fn n20(x: f32) -> u64 {
|
||||
let c = x.clamp(-1.0, 1.0) * 524287.0; // 2^19 - 1
|
||||
let r = if c >= 0.0 { (c + 0.5) as i64 } else { (c - 0.5) as i64 };
|
||||
(r.clamp(-524288, 524287) as i64 as u64) & 0xF_FFFF
|
||||
}
|
||||
#[inline]
|
||||
fn n4(x: f32) -> u64 {
|
||||
let c = x.clamp(-1.0, 1.0) * 7.0;
|
||||
let r = if c >= 0.0 { (c + 0.5) as i64 } else { (c - 0.5) as i64 };
|
||||
(r.clamp(-8, 7) as i64 as u64) & 0xF
|
||||
}
|
||||
let x = n20(f[0]);
|
||||
let y = n20(f[1]);
|
||||
let z = n20(f[2]);
|
||||
let w = n4(f[3]);
|
||||
let dw: u64 = (w << 60) | (z << 40) | (y << 20) | x;
|
||||
Vec128::from_u32x4(0, 0, (dw >> 32) as u32, dw as u32)
|
||||
}
|
||||
|
||||
pub fn unpack_normpacked64(v: Vec128) -> Vec128 {
|
||||
let hi = v.u32x4(2) as u64;
|
||||
let lo = v.u32x4(3) as u64;
|
||||
let dw = (hi << 32) | lo;
|
||||
#[inline]
|
||||
fn u20_to_norm(bits: u64) -> f32 {
|
||||
let s = ((bits & 0xF_FFFF) as i64) << 44 >> 44;
|
||||
(s as f32) / 524287.0
|
||||
}
|
||||
#[inline]
|
||||
fn u4_to_norm(bits: u64) -> f32 {
|
||||
let s = ((bits & 0xF) as i64) << 60 >> 60;
|
||||
(s as f32) / 7.0
|
||||
}
|
||||
let x = u20_to_norm(dw);
|
||||
let y = u20_to_norm(dw >> 20);
|
||||
let z = u20_to_norm(dw >> 40);
|
||||
let w = u4_to_norm(dw >> 60);
|
||||
Vec128::from_f32x4(x, y, z, w)
|
||||
}
|
||||
|
||||
/// IEEE 754 half-precision float pack/unpack — used by both FLOAT16_2
|
||||
/// and FLOAT16_4. No FMA quirks involved; we go via `f32::to_bits` and
|
||||
/// manual bit-twiddling (the stable-Rust `f16` type isn't available
|
||||
/// yet).
|
||||
#[inline]
|
||||
fn f32_to_f16_bits(f: f32) -> u16 {
|
||||
let bits = f.to_bits();
|
||||
let sign = ((bits >> 31) & 0x1) as u16;
|
||||
let exp = ((bits >> 23) & 0xFF) as i32;
|
||||
let mant = bits & 0x7FFFFF;
|
||||
// Handle the easy cases first.
|
||||
if exp == 0xFF {
|
||||
// NaN or infinity.
|
||||
let half_exp = 0x1F;
|
||||
let half_mant = if mant != 0 { 0x200 } else { 0 }; // quiet NaN / zero mantissa for Inf
|
||||
return (sign << 15) | (half_exp << 10) | half_mant;
|
||||
}
|
||||
let unbiased_exp = exp - 127;
|
||||
if unbiased_exp >= 16 {
|
||||
// Overflow → infinity.
|
||||
return (sign << 15) | (0x1F << 10);
|
||||
}
|
||||
if unbiased_exp <= -15 {
|
||||
// Denormal or zero. Compute the shift and subnormal mantissa;
|
||||
// anything too small flushes to signed zero.
|
||||
if unbiased_exp < -24 {
|
||||
return sign << 15;
|
||||
}
|
||||
let shift = -14 - unbiased_exp as i32; // amount to shift the implicit-1'd mantissa
|
||||
let full_mant = 0x800000 | mant; // 24 bits with implicit leading 1
|
||||
let half_mant = (full_mant >> (shift + 13)) as u16;
|
||||
return (sign << 15) | half_mant;
|
||||
}
|
||||
let half_exp = ((unbiased_exp + 15) as u16) & 0x1F;
|
||||
let half_mant = (mant >> 13) as u16;
|
||||
(sign << 15) | (half_exp << 10) | half_mant
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn f16_bits_to_f32(h: u16) -> f32 {
|
||||
let sign = ((h >> 15) & 0x1) as u32;
|
||||
let exp = ((h >> 10) & 0x1F) as i32;
|
||||
let mant = (h & 0x3FF) as u32;
|
||||
let bits = if exp == 0x1F {
|
||||
// NaN or infinity.
|
||||
let f32_exp = 0xFFu32;
|
||||
let f32_mant = if mant != 0 { 0x400000 } else { 0 };
|
||||
(sign << 31) | (f32_exp << 23) | f32_mant
|
||||
} else if exp == 0 && mant == 0 {
|
||||
// Signed zero.
|
||||
sign << 31
|
||||
} else if exp == 0 {
|
||||
// Subnormal — renormalize.
|
||||
let mut e = -14i32;
|
||||
let mut m = mant;
|
||||
while (m & 0x400) == 0 {
|
||||
m <<= 1;
|
||||
e -= 1;
|
||||
}
|
||||
let f32_exp = ((e + 127) as u32) & 0xFF;
|
||||
let f32_mant = (m & 0x3FF) << 13;
|
||||
(sign << 31) | (f32_exp << 23) | f32_mant
|
||||
} else {
|
||||
let f32_exp = ((exp - 15 + 127) as u32) & 0xFF;
|
||||
let f32_mant = mant << 13;
|
||||
(sign << 31) | (f32_exp << 23) | f32_mant
|
||||
};
|
||||
f32::from_bits(bits)
|
||||
}
|
||||
|
||||
/// **FLOAT16_2** — two 32-bit floats → two half-floats packed into one
|
||||
/// 32-bit word (X in high 16 bits of lane 3, Y in low 16).
|
||||
pub fn pack_float16_2(v: Vec128) -> Vec128 {
|
||||
let f = v.as_f32x4();
|
||||
let x = f32_to_f16_bits(f[0]) as u32;
|
||||
let y = f32_to_f16_bits(f[1]) as u32;
|
||||
Vec128::from_u32x4(0, 0, 0, (x << 16) | y)
|
||||
}
|
||||
|
||||
pub fn unpack_float16_2(v: Vec128) -> Vec128 {
|
||||
let word = v.u32x4(3);
|
||||
let x = f16_bits_to_f32((word >> 16) as u16);
|
||||
let y = f16_bits_to_f32(word as u16);
|
||||
Vec128::from_f32x4(x, y, 0.0, 1.0)
|
||||
}
|
||||
|
||||
/// **FLOAT16_4** — four 32-bit floats → four half-floats packed across
|
||||
/// 64 bits (lanes 2-3).
|
||||
pub fn pack_float16_4(v: Vec128) -> Vec128 {
|
||||
let f = v.as_f32x4();
|
||||
let x = f32_to_f16_bits(f[0]) as u32;
|
||||
let y = f32_to_f16_bits(f[1]) as u32;
|
||||
let z = f32_to_f16_bits(f[2]) as u32;
|
||||
let w = f32_to_f16_bits(f[3]) as u32;
|
||||
Vec128::from_u32x4(0, 0, (x << 16) | y, (z << 16) | w)
|
||||
}
|
||||
|
||||
pub fn unpack_float16_4(v: Vec128) -> Vec128 {
|
||||
let hi = v.u32x4(2);
|
||||
let lo = v.u32x4(3);
|
||||
let x = f16_bits_to_f32((hi >> 16) as u16);
|
||||
let y = f16_bits_to_f32(hi as u16);
|
||||
let z = f16_bits_to_f32((lo >> 16) as u16);
|
||||
let w = f16_bits_to_f32(lo as u16);
|
||||
Vec128::from_f32x4(x, y, z, w)
|
||||
}
|
||||
|
||||
// ─── CR6 helpers used by integer compares ─────────────────────────────────
|
||||
// vcmp*. (record-form) updates CR6 in a compressed form:
|
||||
// CR6 = {all-true, 0, all-false, 0}
|
||||
// where each bit reflects the per-lane mask across the whole register.
|
||||
|
||||
#[inline] pub fn cr6_flags_from_mask(mask: Vec128) -> (bool, bool) {
|
||||
let b = mask.as_bytes();
|
||||
let mut any_set = false;
|
||||
let mut any_clear = false;
|
||||
for &byte in b.iter() {
|
||||
if byte != 0 { any_set = true; }
|
||||
if byte != 0xFF { any_clear = true; }
|
||||
}
|
||||
let all_true = !any_clear;
|
||||
let all_false = !any_set;
|
||||
(all_true, all_false)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
use std::cell::Cell;
|
||||
|
||||
struct TestMem { data: Box<[Cell<u8>]> }
|
||||
impl TestMem {
|
||||
fn new(size: usize) -> Self {
|
||||
Self { data: (0..size).map(|_| Cell::new(0)).collect() }
|
||||
}
|
||||
}
|
||||
impl MemoryAccess for TestMem {
|
||||
fn read_u8(&self, a: u32) -> u8 { self.data[a as usize].get() }
|
||||
fn read_u16(&self, a: u32) -> u16 {
|
||||
u16::from_be_bytes([self.data[a as usize].get(), self.data[a as usize + 1].get()])
|
||||
}
|
||||
fn read_u32(&self, a: u32) -> u32 {
|
||||
let a = a as usize;
|
||||
u32::from_be_bytes([
|
||||
self.data[a].get(), self.data[a+1].get(),
|
||||
self.data[a+2].get(), self.data[a+3].get(),
|
||||
])
|
||||
}
|
||||
fn read_u64(&self, a: u32) -> u64 {
|
||||
let a = a as usize;
|
||||
u64::from_be_bytes([
|
||||
self.data[a].get(), self.data[a+1].get(),
|
||||
self.data[a+2].get(), self.data[a+3].get(),
|
||||
self.data[a+4].get(), self.data[a+5].get(),
|
||||
self.data[a+6].get(), self.data[a+7].get(),
|
||||
])
|
||||
}
|
||||
fn write_u8(&self, a: u32, v: u8) { self.data[a as usize].set(v); }
|
||||
fn write_u16(&self, a: u32, v: u16) {
|
||||
let b = v.to_be_bytes();
|
||||
self.data[a as usize].set(b[0]);
|
||||
self.data[a as usize + 1].set(b[1]);
|
||||
}
|
||||
fn write_u32(&self, a: u32, v: u32) {
|
||||
let b = v.to_be_bytes(); let a = a as usize;
|
||||
for (i, byte) in b.iter().enumerate() { self.data[a+i].set(*byte); }
|
||||
}
|
||||
fn write_u64(&self, a: u32, v: u64) {
|
||||
let b = v.to_be_bytes(); let a = a as usize;
|
||||
for (i, byte) in b.iter().enumerate() { self.data[a+i].set(*byte); }
|
||||
}
|
||||
fn translate(&self, _a: u32) -> Option<*const u8> { None }
|
||||
fn translate_mut(&self, _a: u32) -> Option<*mut u8> { None }
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn lvlx_lvrx_round_trip() {
|
||||
let m = TestMem::new(0x40);
|
||||
for i in 0..0x30 { m.data[i].set((i as u8).wrapping_add(0x10)); }
|
||||
// Unaligned load from 0x13 should combine lvlx(0x13) | lvrx(0x23).
|
||||
let lo = load_vector_left(&m, 0x13);
|
||||
let hi = load_vector_right(&m, 0x23);
|
||||
let mut combined = [0u8; 16];
|
||||
let lob = lo.as_bytes();
|
||||
let hib = hi.as_bytes();
|
||||
for i in 0..16 { combined[i] = lob[i] | hib[i]; }
|
||||
for i in 0..16 {
|
||||
assert_eq!(combined[i], m.data[0x13 + i].get(), "lane {}", i);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn lvlx_aligned_is_full_load() {
|
||||
let m = TestMem::new(0x20);
|
||||
for i in 0..0x20 { m.data[i].set(i as u8); }
|
||||
let v = load_vector_left(&m, 0x10);
|
||||
let b = v.as_bytes();
|
||||
for i in 0..16 { assert_eq!(b[i], 0x10 + i as u8); }
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn lvrx_aligned_is_zero() {
|
||||
let m = TestMem::new(0x20);
|
||||
let v = load_vector_right(&m, 0x10);
|
||||
assert_eq!(v.as_bytes(), [0u8; 16]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn sat_add_signed_overflow() {
|
||||
assert_eq!(sat_add_i8(120, 10), (127, true));
|
||||
assert_eq!(sat_add_i8(-120, -10), (-128, true));
|
||||
assert_eq!(sat_add_i8(1, 2), (3, false));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn sat_sub_unsigned_underflow() {
|
||||
assert_eq!(sat_sub_u8(5, 10), (0, true));
|
||||
assert_eq!(sat_sub_u8(10, 5), (5, false));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn pack_unpack_pixel_555() {
|
||||
let encoded = pack_pixel_555(0x80_F8_F8_F8);
|
||||
assert_eq!(encoded & 0x8000, 0x8000);
|
||||
let w = unpack_pixel_555(0x8000 | (0x1F << 10) | (0x1F << 5) | 0x1F);
|
||||
assert_eq!(w & 0xFF000000, 0xFF000000);
|
||||
}
|
||||
|
||||
// ─── First-Pixels M3 pack/unpack roundtrip tests ───
|
||||
|
||||
/// Quantization error tolerance for N-bit signed normalized values.
|
||||
/// `1.0 / ((1 << (bits - 1)) - 1)` is the step size.
|
||||
fn tol_normalized(bits: u32) -> f32 {
|
||||
1.0 / ((1u32 << (bits - 1)) - 1) as f32
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn normshort2_roundtrip() {
|
||||
let v = Vec128::from_f32x4(0.5, -0.75, 0.0, 0.0);
|
||||
let packed = pack_normshort2(v);
|
||||
let back = unpack_normshort2(packed).as_f32x4();
|
||||
let tol = tol_normalized(16);
|
||||
assert!((back[0] - 0.5).abs() < tol, "x got {}", back[0]);
|
||||
assert!((back[1] - -0.75).abs() < tol, "y got {}", back[1]);
|
||||
assert_eq!(back[2], 0.0);
|
||||
assert_eq!(back[3], 1.0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn normshort4_roundtrip_extremes() {
|
||||
let v = Vec128::from_f32x4(1.0, -1.0, 0.0, 0.25);
|
||||
let packed = pack_normshort4(v);
|
||||
let back = unpack_normshort4(packed).as_f32x4();
|
||||
let tol = tol_normalized(16);
|
||||
assert!((back[0] - 1.0).abs() < tol);
|
||||
assert!((back[1] - -1.0).abs() < tol);
|
||||
assert!((back[2] - 0.0).abs() < tol);
|
||||
assert!((back[3] - 0.25).abs() < tol);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn normpacked32_roundtrip() {
|
||||
let v = Vec128::from_f32x4(0.5, -0.5, 0.9, -1.0);
|
||||
let packed = pack_normpacked32(v);
|
||||
let back = unpack_normpacked32(packed).as_f32x4();
|
||||
let tol10 = tol_normalized(10);
|
||||
let tol2 = tol_normalized(2);
|
||||
assert!((back[0] - 0.5).abs() < tol10, "x got {}", back[0]);
|
||||
assert!((back[1] - -0.5).abs() < tol10, "y got {}", back[1]);
|
||||
assert!((back[2] - 0.9).abs() < tol10, "z got {}", back[2]);
|
||||
// 2-bit signed quantizes to {-1, -0.5-ish, 0, 0.5-ish}; tolerance
|
||||
// is the full step.
|
||||
assert!((back[3] - -1.0).abs() < 2.0 * tol2, "w got {}", back[3]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn normpacked64_roundtrip() {
|
||||
let v = Vec128::from_f32x4(0.5, -0.25, 0.75, 0.5);
|
||||
let packed = pack_normpacked64(v);
|
||||
let back = unpack_normpacked64(packed).as_f32x4();
|
||||
let tol20 = tol_normalized(20);
|
||||
let tol4 = tol_normalized(4);
|
||||
assert!((back[0] - 0.5).abs() < tol20, "x got {}", back[0]);
|
||||
assert!((back[1] - -0.25).abs() < tol20, "y got {}", back[1]);
|
||||
assert!((back[2] - 0.75).abs() < tol20, "z got {}", back[2]);
|
||||
assert!((back[3] - 0.5).abs() < tol4, "w got {}", back[3]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn float16_2_roundtrip_normals() {
|
||||
// Half has ~3 decimal digits of precision. Pick values that
|
||||
// survive conversion cleanly: powers of 2 + simple fractions.
|
||||
let v = Vec128::from_f32x4(1.0, -2.5, 0.0, 0.0);
|
||||
let packed = pack_float16_2(v);
|
||||
let back = unpack_float16_2(packed).as_f32x4();
|
||||
assert_eq!(back[0], 1.0);
|
||||
assert_eq!(back[1], -2.5);
|
||||
assert_eq!(back[2], 0.0);
|
||||
assert_eq!(back[3], 1.0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn float16_4_roundtrip_normals() {
|
||||
let v = Vec128::from_f32x4(0.5, -3.0, 16.0, -0.125);
|
||||
let packed = pack_float16_4(v);
|
||||
let back = unpack_float16_4(packed).as_f32x4();
|
||||
assert_eq!(back[0], 0.5);
|
||||
assert_eq!(back[1], -3.0);
|
||||
assert_eq!(back[2], 16.0);
|
||||
assert_eq!(back[3], -0.125);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn float16_handles_zero_and_infinity() {
|
||||
// Zero should survive.
|
||||
assert_eq!(f16_bits_to_f32(f32_to_f16_bits(0.0)), 0.0);
|
||||
assert_eq!(f16_bits_to_f32(f32_to_f16_bits(-0.0)).to_bits(), (-0.0f32).to_bits());
|
||||
// +inf.
|
||||
let inf_back = f16_bits_to_f32(f32_to_f16_bits(f32::INFINITY));
|
||||
assert!(inf_back.is_infinite() && inf_back > 0.0);
|
||||
// Overflow → +inf.
|
||||
let overflow_back = f16_bits_to_f32(f32_to_f16_bits(65536.0));
|
||||
assert!(overflow_back.is_infinite());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn pack_type_enum_maps_canary_values() {
|
||||
use D3dPackType::*;
|
||||
assert!(matches!(D3dPackType::from_immediate(0), D3dColor));
|
||||
assert!(matches!(D3dPackType::from_immediate(1), NormShort2));
|
||||
assert!(matches!(D3dPackType::from_immediate(2), NormPacked32));
|
||||
assert!(matches!(D3dPackType::from_immediate(3), Float16_2));
|
||||
assert!(matches!(D3dPackType::from_immediate(4), NormShort4));
|
||||
assert!(matches!(D3dPackType::from_immediate(5), Float16_4));
|
||||
assert!(matches!(D3dPackType::from_immediate(6), NormPacked64));
|
||||
assert!(matches!(D3dPackType::from_immediate(7), Other(7)));
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user