xenia-cpu: VMX128, FPSCR, decoder split, scheduler, decode/block caches
Split the monolithic interpreter into cohesive modules: dedicated decoder (decoder.rs) producing 8-byte DecodedInstr; opcode tables (opcode.rs); explicit traps (trap.rs); FPSCR helpers (fpscr.rs); overflow/carry helpers (overflow.rs); a 4 KiB-page-versioned decode cache and basic-block cache (block_cache.rs); and a full VMX/VMX128 implementation (vmx.rs) covering AltiVec + Xenon's 128-bit extensions. Add the parallel-execution substrate behind --parallel: a 7-party phaser (phaser.rs) for round-based barrier sync, ReservationTable (reservation.rs) for guest LL/SC, and the per-HW-thread scheduler core (scheduler.rs) that owns ThreadRefs, runqueues, and pending IRQs. Disassembler is now the single source of truth: disasm.rs gains the full base + extended + VMX128 mnemonic set, with golden JSON fixtures and a disasm_goldens test suite. Add a criterion-style interpreter bench. context.rs grows the per-thread state the new modules need (reservation slot, FPSCR, vector regs). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
423
crates/xenia-cpu/src/block_cache.rs
Normal file
423
crates/xenia-cpu/src/block_cache.rs
Normal file
@@ -0,0 +1,423 @@
|
||||
//! Tier-4 perf — basic-block cache for the PPC interpreter.
|
||||
//!
|
||||
//! `DecodeCache` (in [`crate::decoder`]) caches one decoded instruction
|
||||
//! per slot, indexed by PC. The hot loop still pays the per-instruction
|
||||
//! cost of fetching the raw word, hashing the PC into a slot, and
|
||||
//! comparing tags. For straight-line code — common in the asset/inflate
|
||||
//! loops where Sylpheed boot is currently CPU-bound — the savings of
|
||||
//! batching N decoded instructions per slot lookup are linear in block
|
||||
//! length.
|
||||
//!
|
||||
//! ## Shape
|
||||
//!
|
||||
//! A `DecodedBlock` is a contiguous run of decoded instructions starting
|
||||
//! at `start_pc`, ending at the first *block terminator* (any branch,
|
||||
//! `sc`, trap, or `Invalid`) or at one of two safety limits:
|
||||
//!
|
||||
//! - [`MAX_BLOCK_INSTRS`] caps memory growth and re-build cost.
|
||||
//! - 4 KiB page boundary stop. A block is fully contained inside a
|
||||
//! single 4 KiB guest page; that means `mem.page_version(start_pc)`
|
||||
//! is sufficient to detect any code-page rewrite that should
|
||||
//! invalidate the block. Without this rule the cache would have to
|
||||
//! walk every spanned page on every hit, which would erase the win.
|
||||
//!
|
||||
//! ## Invalidation
|
||||
//!
|
||||
//! Each block stamps the page version at build time. On lookup, if
|
||||
//! `mem.page_version(start_pc)` differs from `block.page_version`, the
|
||||
//! slot is rebuilt. Same mechanism `DecodeCache` uses, just at
|
||||
//! block granularity.
|
||||
//!
|
||||
//! ## Debugger semantics
|
||||
//!
|
||||
//! Block dispatch is **opt-in** by the caller. The hot loop in
|
||||
//! `xenia-app/src/main.rs` selects the per-instruction path whenever
|
||||
//! `Debugger::wants_hooks()` is true or any `--trace-*` flag is set.
|
||||
//! That's how single-step, breakpoints, in-memory trace, instruction
|
||||
//! trace, and branch trace continue to observe every PC: the block
|
||||
//! cache simply never runs in those modes.
|
||||
|
||||
use crate::decoder::{decode, DecodedInstr};
|
||||
use xenia_memory::MemoryAccess;
|
||||
|
||||
/// Number of slots in the direct-mapped block cache: 2^16 = 65,536,
/// the same slot count the decode cache in `crate::decoder` uses.
/// A block's slot is picked from the low 16 bits of `start_pc >> 2`.
/// The original author reports a negligible collision rate on
/// Sylpheed-class workloads.
const BLOCK_CACHE_SIZE: usize = 0x1_0000;
/// Mask that reduces `start_pc >> 2` to a slot index in
/// `0..BLOCK_CACHE_SIZE`.
const BLOCK_CACHE_MASK: u32 = BLOCK_CACHE_SIZE as u32 - 1;

/// Upper bound on the number of instructions stored in one block.
/// It bounds per-block memory and the cost of rebuilding a block after
/// its code page changes. The original author measured roughly 6
/// instructions between branches during Sylpheed boot, so 32 leaves
/// ample headroom.
pub const MAX_BLOCK_INSTRS: usize = 32;

/// Size of a guest page in bytes. Declared locally so `xenia-cpu` does
/// not reach into the memory crate's heap internals; it must agree with
/// the value used there. Both describe the 4 KiB PowerPC page granule.
const GUEST_PAGE_SIZE: u32 = 0x1000;
/// Mask that keeps the page-base bits of an address and clears the
/// offset within the page.
const GUEST_PAGE_MASK: u32 = u32::MAX ^ (GUEST_PAGE_SIZE - 1);
|
||||
|
||||
/// One cached basic block. Owned by [`BlockCache`]; a `&DecodedBlock`
|
||||
/// is handed to the interpreter via [`BlockCache::lookup_or_build`] and
|
||||
/// stays valid until the next `lookup_or_build` on the same slot.
|
||||
#[derive(Debug)]
|
||||
pub struct DecodedBlock {
|
||||
/// Guest PC at which this block starts. Used as the slot tag.
|
||||
pub start_pc: u32,
|
||||
/// Guest PC immediately after the last instruction in `instrs`.
|
||||
/// Equal to `instrs.last().addr + 4` whether or not the block
|
||||
/// ended on a terminator. Useful for tracing / disassembly.
|
||||
pub end_pc: u32,
|
||||
/// `mem.page_version(start_pc)` at build time. Mismatch on lookup
|
||||
/// invalidates the block. Single value because every block is
|
||||
/// page-bounded by construction.
|
||||
pub page_version: u64,
|
||||
/// Decoded instructions in execution order. Always non-empty after
|
||||
/// a successful build (`MAX_BLOCK_INSTRS >= 1` and the build walk
|
||||
/// pushes the first decoded word unconditionally).
|
||||
pub instrs: Vec<DecodedInstr>,
|
||||
}
|
||||
|
||||
/// How a slot probe inside `lookup_or_build` turned out. Private to
/// this module.
enum CacheStatus {
    /// The slot holds a block for the requested `pc` and its stamped
    /// page version equals `mem.page_version(pc)`; it is served as-is.
    Hit,
    /// The slot holds a block for the requested `pc`, but the page
    /// version has changed since it was built. The block is rebuilt
    /// and `invalidations` is incremented.
    Stale,
    /// The slot is vacant or holds a block with a different
    /// `start_pc`. A new block is built and `misses` is incremented.
    Miss,
}
|
||||
|
||||
/// Direct-mapped block cache. One instance shared across all HW slots
|
||||
/// (block contents are PC-only and read-only after fill). Not
|
||||
/// thread-safe — owner is the single scheduler thread, same as
|
||||
/// `DecodeCache`.
|
||||
pub struct BlockCache {
|
||||
slots: Box<[Option<Box<DecodedBlock>>]>,
|
||||
hits: u64,
|
||||
misses: u64,
|
||||
invalidations: u64,
|
||||
}
|
||||
|
||||
impl Default for BlockCache {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
impl BlockCache {
|
||||
pub fn new() -> Self {
|
||||
// `Option<Box<T>>` is a niche-optimized 8-byte slot; 64 K of
|
||||
// them cost ~512 KiB of cold storage. Live blocks beyond that
|
||||
// sit on the heap.
|
||||
let mut v: Vec<Option<Box<DecodedBlock>>> = Vec::with_capacity(BLOCK_CACHE_SIZE);
|
||||
v.resize_with(BLOCK_CACHE_SIZE, || None);
|
||||
Self {
|
||||
slots: v.into_boxed_slice(),
|
||||
hits: 0,
|
||||
misses: 0,
|
||||
invalidations: 0,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn hits(&self) -> u64 {
|
||||
self.hits
|
||||
}
|
||||
pub fn misses(&self) -> u64 {
|
||||
self.misses
|
||||
}
|
||||
pub fn invalidations(&self) -> u64 {
|
||||
self.invalidations
|
||||
}
|
||||
|
||||
/// Return the cached block starting at `pc`, building it if absent
|
||||
/// or stale. The returned reference is borrowed from the cache and
|
||||
/// stays valid until the next `lookup_or_build` call.
|
||||
pub fn lookup_or_build(&mut self, pc: u32, mem: &dyn MemoryAccess) -> &DecodedBlock {
|
||||
let idx = ((pc >> 2) & BLOCK_CACHE_MASK) as usize;
|
||||
let cur_pv = mem.page_version(pc);
|
||||
|
||||
// Phase 1: classify the slot. Borrow ends before fill so the
|
||||
// mutable update below doesn't conflict.
|
||||
let status = match &self.slots[idx] {
|
||||
Some(b) if b.start_pc == pc && b.page_version == cur_pv => CacheStatus::Hit,
|
||||
Some(b) if b.start_pc == pc => CacheStatus::Stale,
|
||||
_ => CacheStatus::Miss,
|
||||
};
|
||||
|
||||
// Phase 2: fill on miss/stale, account.
|
||||
match status {
|
||||
CacheStatus::Hit => {
|
||||
self.hits += 1;
|
||||
}
|
||||
CacheStatus::Stale => {
|
||||
self.invalidations += 1;
|
||||
self.misses += 1;
|
||||
let block = build_block(pc, mem, cur_pv);
|
||||
self.slots[idx] = Some(Box::new(block));
|
||||
}
|
||||
CacheStatus::Miss => {
|
||||
self.misses += 1;
|
||||
let block = build_block(pc, mem, cur_pv);
|
||||
self.slots[idx] = Some(Box::new(block));
|
||||
}
|
||||
}
|
||||
|
||||
// Slot is guaranteed populated at this point — Hit returned a
|
||||
// pre-existing block, Miss/Stale just wrote a new one.
|
||||
self.slots[idx]
|
||||
.as_deref()
|
||||
.expect("block freshly built or hit")
|
||||
}
|
||||
}
|
||||
|
||||
/// Walk forward from `pc`, decoding instructions and collecting them
|
||||
/// into a `DecodedBlock`. The walk stops on the first of:
|
||||
/// - a [`PpcOpcode::terminates_block`] true (the terminator IS
|
||||
/// included as the last instruction),
|
||||
/// - reaching [`MAX_BLOCK_INSTRS`],
|
||||
/// - the next PC would cross a 4 KiB guest page boundary.
|
||||
fn build_block(start_pc: u32, mem: &dyn MemoryAccess, page_version: u64) -> DecodedBlock {
|
||||
let mut instrs: Vec<DecodedInstr> = Vec::with_capacity(8);
|
||||
let page_base = start_pc & GUEST_PAGE_MASK;
|
||||
let mut cur = start_pc;
|
||||
|
||||
loop {
|
||||
let raw = mem.read_u32(cur);
|
||||
let decoded = decode(raw, cur);
|
||||
let terminates = decoded.opcode.terminates_block();
|
||||
instrs.push(decoded);
|
||||
|
||||
if terminates {
|
||||
break;
|
||||
}
|
||||
if instrs.len() >= MAX_BLOCK_INSTRS {
|
||||
break;
|
||||
}
|
||||
let next = cur.wrapping_add(4);
|
||||
if (next & GUEST_PAGE_MASK) != page_base {
|
||||
break;
|
||||
}
|
||||
cur = next;
|
||||
}
|
||||
|
||||
let last = instrs.last().expect("build pushes at least one instruction");
|
||||
let end_pc = last.addr.wrapping_add(4);
|
||||
|
||||
DecodedBlock {
|
||||
start_pc,
|
||||
end_pc,
|
||||
page_version,
|
||||
instrs,
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::opcode::PpcOpcode;
|
||||
|
||||
use std::cell::Cell;
|
||||
|
||||
/// Test-only guest memory: a flat byte array (64 KiB when created via
/// `new`) whose multi-byte accesses are big-endian. It mirrors
/// `interpreter::tests::TestMem` but is defined here so these tests do
/// not depend on interpreter internals.
struct BlockTestMem {
    /// Backing bytes; `Cell` allows writes through `&self`.
    data: Box<[Cell<u8>]>,
    /// Page version reported for every page except `bumped_page`.
    version_a: u64,
    /// Page version reported for `bumped_page`.
    version_b: u64,
    // Base address of the single page that reports `version_b`. Setting
    // it simulates a page-version bump that did not come from any
    // `write_*` call, which is what the invalidation test needs.
    bumped_page: Cell<Option<u32>>,
}
|
||||
|
||||
impl BlockTestMem {
|
||||
fn new() -> Self {
|
||||
Self {
|
||||
data: (0..0x10000u32).map(|_| Cell::new(0)).collect(),
|
||||
version_a: 1,
|
||||
version_b: 2,
|
||||
bumped_page: Cell::new(None),
|
||||
}
|
||||
}
|
||||
fn put(&self, addr: u32, raw: u32) {
|
||||
let a = addr as usize;
|
||||
for (i, byte) in raw.to_be_bytes().iter().enumerate() {
|
||||
self.data[a + i].set(*byte);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl MemoryAccess for BlockTestMem {
|
||||
fn read_u8(&self, a: u32) -> u8 { self.data[a as usize].get() }
|
||||
fn read_u16(&self, a: u32) -> u16 {
|
||||
let i = a as usize;
|
||||
u16::from_be_bytes([self.data[i].get(), self.data[i + 1].get()])
|
||||
}
|
||||
fn read_u32(&self, a: u32) -> u32 {
|
||||
let i = a as usize;
|
||||
u32::from_be_bytes([
|
||||
self.data[i].get(), self.data[i + 1].get(),
|
||||
self.data[i + 2].get(), self.data[i + 3].get(),
|
||||
])
|
||||
}
|
||||
fn read_u64(&self, a: u32) -> u64 {
|
||||
let i = a as usize;
|
||||
u64::from_be_bytes([
|
||||
self.data[i].get(), self.data[i + 1].get(),
|
||||
self.data[i + 2].get(), self.data[i + 3].get(),
|
||||
self.data[i + 4].get(), self.data[i + 5].get(),
|
||||
self.data[i + 6].get(), self.data[i + 7].get(),
|
||||
])
|
||||
}
|
||||
fn write_u8(&self, a: u32, v: u8) { self.data[a as usize].set(v); }
|
||||
fn write_u16(&self, a: u32, v: u16) {
|
||||
let i = a as usize;
|
||||
let b = v.to_be_bytes();
|
||||
self.data[i].set(b[0]);
|
||||
self.data[i + 1].set(b[1]);
|
||||
}
|
||||
fn write_u32(&self, a: u32, v: u32) {
|
||||
let i = a as usize;
|
||||
for (k, byte) in v.to_be_bytes().iter().enumerate() {
|
||||
self.data[i + k].set(*byte);
|
||||
}
|
||||
}
|
||||
fn write_u64(&self, a: u32, v: u64) {
|
||||
let i = a as usize;
|
||||
for (k, byte) in v.to_be_bytes().iter().enumerate() {
|
||||
self.data[i + k].set(*byte);
|
||||
}
|
||||
}
|
||||
fn translate(&self, _: u32) -> Option<*const u8> { None }
|
||||
fn translate_mut(&self, _: u32) -> Option<*mut u8> { None }
|
||||
fn page_version(&self, addr: u32) -> u64 {
|
||||
if Some(addr & GUEST_PAGE_MASK) == self.bumped_page.get() {
|
||||
self.version_b
|
||||
} else {
|
||||
self.version_a
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// PPC encodings — minimal subset for these tests.
|
||||
/// Encode `addi rd, ra, simm`: primary opcode 14 in the top six bits,
/// `rd` shifted to bit 21, `ra` to bit 16, and the immediate's 16-bit
/// two's-complement pattern in the low half-word.
fn enc_addi(rd: u32, ra: u32, simm: i16) -> u32 {
    let primary: u32 = 14;
    let immediate = u32::from(simm as u16);
    (primary << 26) | (rd << 21) | (ra << 16) | immediate
}
|
||||
/// Encode `b 0`, a branch to itself: primary opcode 18 with LI = 0,
/// AA = 0 and LK = 0, so only the opcode bits are set.
fn enc_b_self() -> u32 {
    let primary: u32 = 18;
    primary << 26
}
|
||||
/// An all-zero instruction word (primary opcode 0), which the decoder
/// maps to `Invalid`.
fn enc_unimplemented() -> u32 {
    0x0000_0000
}
|
||||
|
||||
#[test]
fn block_built_to_terminator() {
    // Three `addi` instructions followed by a branch at 0x10C. The
    // block must include the branch and stop right after it.
    let mem = BlockTestMem::new();
    for i in 0..3u32 {
        mem.put(0x100 + i * 4, enc_addi(3, 3, 1));
    }
    mem.put(0x10C, enc_b_self());

    let mut cache = BlockCache::new();
    let block = cache.lookup_or_build(0x100, &mem);

    assert_eq!(block.start_pc, 0x100);
    assert_eq!(block.instrs.len(), 4);
    assert_eq!(block.instrs.last().unwrap().opcode, PpcOpcode::bx);
    assert_eq!(block.end_pc, 0x110);
}
|
||||
|
||||
#[test]
fn block_stops_at_page_boundary() {
    // 0x1FFC is the last word of its 4 KiB page and 0x2000 starts the
    // next one, so a block built at 0x1FFC holds exactly one
    // instruction even though more non-terminators follow.
    let mem = BlockTestMem::new();
    let addi = enc_addi(3, 3, 1);
    mem.put(0x1FFC, addi);
    mem.put(0x2000, addi);

    let mut cache = BlockCache::new();
    let block = cache.lookup_or_build(0x1FFC, &mem);

    assert_eq!(block.instrs.len(), 1);
    assert_eq!(block.end_pc, 0x2000);
}
|
||||
|
||||
#[test]
fn block_stops_at_max_len() {
    // With 64 back-to-back non-terminators in one page, the length cap
    // is the only stop condition that can fire.
    let mem = BlockTestMem::new();
    let addi = enc_addi(3, 3, 1);
    (0..64u32).for_each(|i| mem.put(0x100 + i * 4, addi));

    let mut cache = BlockCache::new();
    let block = cache.lookup_or_build(0x100, &mem);

    assert_eq!(block.instrs.len(), MAX_BLOCK_INSTRS);
    assert_eq!(block.end_pc, 0x100 + (MAX_BLOCK_INSTRS as u32) * 4);
}
|
||||
|
||||
#[test]
fn block_stops_at_invalid_opcode() {
    // An `Invalid` decode ends the block (and is kept as its last
    // entry) so the per-instruction Unimplemented path still handles
    // it. The `addi` after it must not be included.
    let mem = BlockTestMem::new();
    let program = [enc_addi(3, 3, 1), enc_unimplemented(), enc_addi(3, 3, 1)];
    for (i, word) in program.iter().enumerate() {
        mem.put(0x100 + (i as u32) * 4, *word);
    }

    let mut cache = BlockCache::new();
    let block = cache.lookup_or_build(0x100, &mem);

    assert_eq!(block.instrs.len(), 2);
    assert_eq!(block.instrs.last().unwrap().opcode, PpcOpcode::Invalid);
}
|
||||
|
||||
#[test]
fn block_invalidates_on_page_version_bump() {
    let mem = BlockTestMem::new();
    mem.put(0x100, enc_addi(3, 3, 1));
    mem.put(0x104, enc_b_self());
    let mut cache = BlockCache::new();

    // First lookup: nothing cached yet, so it is a miss.
    cache.lookup_or_build(0x100, &mem);
    assert_eq!(cache.misses(), 1);
    assert_eq!(cache.hits(), 0);

    // Second lookup with nothing changed: served from the cache.
    cache.lookup_or_build(0x100, &mem);
    assert_eq!(cache.hits(), 1);
    assert_eq!(cache.invalidations(), 0);

    // Report a new version for the page holding 0x100. The cached
    // block is now stale and must be rebuilt, which counts as an
    // invalidation and a miss.
    let page = 0x100 & GUEST_PAGE_MASK;
    mem.bumped_page.set(Some(page));
    cache.lookup_or_build(0x100, &mem);
    assert_eq!(cache.invalidations(), 1);
    assert_eq!(cache.misses(), 2);
}
|
||||
|
||||
#[test]
fn block_hit_returns_same_contents() {
    // A hit must hand back the block decoded at build time rather than
    // re-reading memory. To prove it, the instruction bytes are changed
    // without changing the page version (something only a test would
    // do) and the cached immediate is checked to be the old one.
    let mem = BlockTestMem::new();
    mem.put(0x100, enc_addi(3, 3, 7));
    mem.put(0x104, enc_b_self());
    let mut cache = BlockCache::new();

    let first_simm = cache.lookup_or_build(0x100, &mem).instrs[0].simm16();

    // Overwrite the bytes directly; the page version stays the same.
    let replacement = enc_addi(3, 3, 99).to_be_bytes();
    for (offset, byte) in replacement.iter().enumerate() {
        mem.data[0x100 + offset].set(*byte);
    }

    let cached_simm = cache.lookup_or_build(0x100, &mem).instrs[0].simm16();

    assert_eq!(first_simm, 7);
    assert_eq!(cached_simm, 7, "cache must serve original decoded form");
}
|
||||
}
|
||||
@@ -29,16 +29,37 @@ pub mod spr {
|
||||
pub const XER: u32 = 1;
|
||||
pub const LR: u32 = 8;
|
||||
pub const CTR: u32 = 9;
|
||||
pub const TBL: u32 = 268;
|
||||
pub const TBU: u32 = 269;
|
||||
pub const DSISR: u32 = 18;
|
||||
pub const DAR: u32 = 19;
|
||||
/// Decrementer (hypervisor-visible, 32-bit down-counter).
|
||||
pub const DEC: u32 = 22;
|
||||
pub const TBL: u32 = 268; // Read (user)
|
||||
pub const TBU: u32 = 269; // Read (user)
|
||||
/// Time-base write (supervisor). Separate SPR number from TBL (268) for
|
||||
/// access-control reasons.
|
||||
pub const TBL_WRITE: u32 = 284;
|
||||
pub const TBU_WRITE: u32 = 285;
|
||||
pub const SPRG0: u32 = 272;
|
||||
pub const SPRG1: u32 = 273;
|
||||
pub const SPRG2: u32 = 274;
|
||||
pub const SPRG3: u32 = 275;
|
||||
pub const VRSAVE: u32 = 256;
|
||||
pub const PVR: u32 = 287;
|
||||
pub const HID0: u32 = 1008;
|
||||
pub const HID1: u32 = 1009;
|
||||
pub const PIR: u32 = 1023;
|
||||
}
|
||||
|
||||
/// Sentinel return address used to stop the interpreter. A `bclr` that
/// returns to this value makes the interpreter loop halt cleanly,
/// following the "entry returned" convention.
pub const LR_HALT_SENTINEL: u64 = 0x0000_0000_BCBC_BCBC;
|
||||
|
||||
/// VSCR NJ (Non-Java mode) bit. Stored in word 3 at PowerPC (MSB-first) bit 15, i.e. LSB-numbered bit 16 (mask 0x0001_0000).
|
||||
/// Set at startup; when set, denormals are flushed to zero (non-IEEE-754 behaviour). When clear, denormals are handled per IEEE-754.
|
||||
pub const VSCR_NJ_MASK: u32 = 0x0001_0000;
|
||||
/// VSCR SAT (saturation sticky) bit. Stored in word 3 at PowerPC (MSB-first) bit 31, i.e. the least-significant bit (mask 0x0000_0001).
|
||||
pub const VSCR_SAT_MASK: u32 = 0x0000_0001;
|
||||
|
||||
/// PowerPC processor context. Holds all register state for one guest thread.
|
||||
/// Mirrors PPCContext from ppc_context.h, minus JIT-specific fields.
|
||||
#[repr(C, align(64))]
|
||||
@@ -64,15 +85,39 @@ pub struct PpcContext {
|
||||
pub xer_ca: u8,
|
||||
pub xer_ov: u8,
|
||||
pub xer_so: u8,
|
||||
// Altivec VSCR saturation bit
|
||||
pub vscr_sat: u8,
|
||||
// Altivec VSCR. Only bits 16 (NJ) and 31 (SAT) of word 3 are meaningful.
|
||||
pub vscr: Vec128,
|
||||
// VRSAVE (SPR 256). Bitmask of which VRs need saving across context switches.
|
||||
pub vrsave: u32,
|
||||
|
||||
// Program counter
|
||||
pub pc: u32,
|
||||
// Reservation address/value for lwarx/stwcx
|
||||
pub reserved_addr: u32,
|
||||
// Reservation for lwarx/ldarx/stwcx/stdcx. Xenon's reservation granule is
|
||||
// one L2 cache line (128 bytes) — `reserved_line` is stored as the base
|
||||
// address of that line (`ea & !0x7F`). `has_reservation` gates the
|
||||
// validity; stwcx./stdcx. check that both match before committing.
|
||||
// `reserved_val` is retained for possible future use by a coherency
|
||||
// observer; the store-conditional logic itself does not compare it.
|
||||
pub reserved_line: u32,
|
||||
pub reserved_val: u64,
|
||||
pub has_reservation: bool,
|
||||
/// M3.7 — generation stamp returned by [`crate::ReservationTable::reserve`]
|
||||
/// at the most recent `lwarx`/`ldarx`. Paired with `reserved_line`;
|
||||
/// `stwcx.`/`stdcx.` pass this back to `try_commit`. Meaningful only
|
||||
/// when `reservation_table` is `Some` and the table is enabled.
|
||||
pub reserved_generation: u32,
|
||||
/// M3.7 — optional handle to the inter-thread reservation table.
|
||||
/// When `Some(table)` *and* `table.is_enabled()`, the interpreter's
|
||||
/// `lwarx`/`stwcx.`/`ldarx`/`stdcx.` arms route through the table;
|
||||
/// otherwise they use the legacy per-`PpcContext` fields above. The
|
||||
/// scheduler populates this when it spawns a thread under a kernel
|
||||
/// that has `reservations` set.
|
||||
pub reservation_table: Option<std::sync::Arc<crate::ReservationTable>>,
|
||||
/// M3.7 — emulated HW slot ID this thread is bound to. Used as the
|
||||
/// reservation table's `hw_id` discriminator so two threads on
|
||||
/// different slots can't accidentally commit each other's
|
||||
/// reservations. Populated by the scheduler at spawn / migration.
|
||||
pub hw_id: u8,
|
||||
|
||||
// Thread ID (for kernel use)
|
||||
pub thread_id: u32,
|
||||
@@ -82,6 +127,12 @@ pub struct PpcContext {
|
||||
|
||||
// Time base (incremented each instruction for debugging)
|
||||
pub timebase: u64,
|
||||
|
||||
// Decrementer (SPR 22): 32-bit down-counter that fires an external
|
||||
// interrupt at underflow on real hw. Xenia-rs doesn't dispatch DEC
|
||||
// interrupts to the guest; this value is maintained so that mfspr DEC
|
||||
// returns something coherent.
|
||||
pub dec: u32,
|
||||
}
|
||||
|
||||
impl PpcContext {
|
||||
@@ -89,7 +140,9 @@ impl PpcContext {
|
||||
Self {
|
||||
gpr: [0; 32],
|
||||
ctr: 0,
|
||||
lr: 0,
|
||||
// Canary sets LR to the halt sentinel at thread start so `blr`
|
||||
// from the top-level entry falls out of the interpreter loop.
|
||||
lr: LR_HALT_SENTINEL,
|
||||
msr: 0,
|
||||
fpr: [0.0; 32],
|
||||
vr: [Vec128::ZERO; 128],
|
||||
@@ -98,14 +151,21 @@ impl PpcContext {
|
||||
xer_ca: 0,
|
||||
xer_ov: 0,
|
||||
xer_so: 0,
|
||||
vscr_sat: 0,
|
||||
// VSCR starts with NJ bit set (denormals flushed) — matches canary
|
||||
// thread_state.cc initialization.
|
||||
vscr: Vec128::from_u32x4(0, 0, 0, VSCR_NJ_MASK),
|
||||
vrsave: 0xFFFF_FFFF,
|
||||
pc: 0,
|
||||
reserved_addr: 0,
|
||||
reserved_line: 0,
|
||||
reserved_val: 0,
|
||||
has_reservation: false,
|
||||
reserved_generation: 0,
|
||||
reservation_table: None,
|
||||
hw_id: 0,
|
||||
thread_id: 0,
|
||||
cycle_count: 0,
|
||||
timebase: 0,
|
||||
dec: 0,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -182,6 +242,27 @@ impl PpcContext {
|
||||
self.xer_ov = ((val >> 30) & 1) as u8;
|
||||
self.xer_ca = ((val >> 29) & 1) as u8;
|
||||
}
|
||||
|
||||
/// Read the VSCR SAT (sticky saturation) bit.
|
||||
pub fn vscr_sat(&self) -> bool {
|
||||
(self.vscr.u32x4(3) & VSCR_SAT_MASK) != 0
|
||||
}
|
||||
|
||||
/// Set or clear VSCR SAT. Preserves the NJ bit (and any other word-3 bits).
|
||||
pub fn set_vscr_sat(&mut self, v: bool) {
|
||||
let mut w = self.vscr.u32x4(3);
|
||||
if v {
|
||||
w |= VSCR_SAT_MASK;
|
||||
} else {
|
||||
w &= !VSCR_SAT_MASK;
|
||||
}
|
||||
self.vscr.set_u32x4(3, w);
|
||||
}
|
||||
|
||||
/// Read the VSCR NJ (non-Java mode / flush-denormals) bit.
|
||||
pub fn vscr_nj(&self) -> bool {
|
||||
(self.vscr.u32x4(3) & VSCR_NJ_MASK) != 0
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for PpcContext {
|
||||
|
||||
@@ -77,6 +77,9 @@ impl DecodedInstr {
|
||||
/// OE bit (bit 21) - overflow enable
|
||||
#[inline] pub fn oe(&self) -> bool { extract_bits(self.raw, 21, 21) != 0 }
|
||||
|
||||
/// TO field (bits 6-10) for tw/twi/td/tdi trap instructions.
|
||||
#[inline] pub fn to(&self) -> u32 { extract_bits(self.raw, 6, 10) }
|
||||
|
||||
/// MB, ME fields for rotate instructions
|
||||
#[inline] pub fn mb(&self) -> u32 { extract_bits(self.raw, 21, 25) }
|
||||
#[inline] pub fn me(&self) -> u32 { extract_bits(self.raw, 26, 30) }
|
||||
@@ -142,6 +145,24 @@ impl DecodedInstr {
|
||||
#[inline] pub fn nb(&self) -> u32 { extract_bits(self.raw, 16, 20) }
|
||||
}
|
||||
|
||||
/// Extract the 5-bit `UIMM` (`VX128_3`) / `IMM` (`VX128_4`) field. Canary
|
||||
/// packs both formats with LSB-bits 16-20 holding the field, which is
|
||||
/// MSB bits 11-15 in our `extract_bits` convention. For `vpkd3d128` /
|
||||
/// `vupkd3d128` the decoded selector is `type = UIMM >> 2` (3 bits; valid
|
||||
/// values 0-6 per [`crate::vmx::D3dPackType`], 7 is undocumented /
|
||||
/// undefined in canary) and `pack = UIMM & 0x3` (output-slot layout for
|
||||
/// `vpkd3d128` only, `vupkd3d128` ignores it).
|
||||
///
|
||||
/// First-Pixels M3: the interpreter previously used a hand-rolled
|
||||
/// `(instr.raw >> 6) & 0x7` that was **LSB-numbered** and extracted
|
||||
/// bits from a completely different part of the word (the
|
||||
/// secondary-opcode region). Centralizing the extractor here matches
|
||||
/// canary's `FormatVX128_{3,4}::{UIMM,IMM}` field semantics exactly.
|
||||
#[inline]
|
||||
pub fn extract_vx128_uimm5(raw: u32) -> u32 {
|
||||
extract_bits(raw, 11, 15)
|
||||
}
|
||||
|
||||
/// Decode a 32-bit PPC instruction into its opcode.
|
||||
/// Direct translation of the C++ LookupOpcode from ppc_opcode_lookup_gen.cc.
|
||||
pub fn decode(raw: u32, addr: u32) -> DecodedInstr {
|
||||
@@ -149,6 +170,123 @@ pub fn decode(raw: u32, addr: u32) -> DecodedInstr {
|
||||
DecodedInstr { opcode, raw, addr }
|
||||
}
|
||||
|
||||
// Perf tier-2 — direct-mapped PC-keyed decode cache.
|
||||
//
|
||||
// The interpreter hot path spends ~15-25% of its time in `decode()`
|
||||
// parsing the raw u32 and walking the primary+secondary opcode tables.
|
||||
// For non-self-modifying guest code — the common case past the XEX
|
||||
// loader — `decode(raw, pc)` is purely a function of `(raw, pc)` and
|
||||
// the output is `Copy + 16B`. A direct-mapped cache indexed by
|
||||
// `(pc >> 2) & MASK` gives the interpreter a 1-comparison fast path,
|
||||
// at the cost of one branch and a 1.5 MiB region of memory.
|
||||
//
|
||||
// Invalidation piggybacks on `xenia_memory::GuestMemory::page_version`
|
||||
// (P5 texture-cache invalidation): every cache entry carries the page
|
||||
// version that was active at decode time; on lookup we compare against
|
||||
// the current version of the containing 4 KiB page. Any write to the
|
||||
// page bumps the counter, so the next decode on that PC is a miss that
|
||||
// refills.
|
||||
|
||||
/// Slot count of the direct-mapped decode cache: 2^16 = 65,536, one
/// instruction address per slot. The original author sized it so the
/// hot code of a typical Xbox 360 title stays resident without
/// collisions.
const DECODE_CACHE_SIZE: usize = 0x1_0000;
/// Mask that reduces `pc >> 2` to a slot index in
/// `0..DECODE_CACHE_SIZE`.
const DECODE_CACHE_MASK: u32 = DECODE_CACHE_SIZE as u32 - 1;
|
||||
|
||||
#[derive(Clone, Copy)]
|
||||
struct DecodeCacheEntry {
|
||||
/// Guest PC this entry was decoded at. Used as the tag on lookup; a
|
||||
/// mismatch means the slot was last populated by a different PC that
|
||||
/// shares the same low-16 index.
|
||||
pc: u32,
|
||||
/// Page version at decode time (from `GuestMemory::page_version(pc)`).
|
||||
/// Zero means "unused slot" since real page versions start at 1.
|
||||
page_version: u64,
|
||||
decoded: DecodedInstr,
|
||||
}
|
||||
|
||||
impl DecodeCacheEntry {
|
||||
const fn empty() -> Self {
|
||||
// `Invalid` is the decoder's "unrecognized opcode" sentinel; we
|
||||
// use it here as the empty-slot marker. Real misses compare `pc`,
|
||||
// not the opcode, so the sentinel choice is cosmetic.
|
||||
Self {
|
||||
pc: 0,
|
||||
page_version: 0,
|
||||
decoded: DecodedInstr {
|
||||
opcode: PpcOpcode::Invalid,
|
||||
raw: 0,
|
||||
addr: 0,
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Direct-mapped PC-keyed decode cache. One instance shared across all
|
||||
/// HW threads (PC is thread-independent; entries are read-only once
|
||||
/// filled). Not thread-safe — the single scheduler thread owns it.
|
||||
pub struct DecodeCache {
|
||||
slots: Box<[DecodeCacheEntry]>,
|
||||
hits: u64,
|
||||
misses: u64,
|
||||
invalidations: u64,
|
||||
}
|
||||
|
||||
impl Default for DecodeCache {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
impl DecodeCache {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
slots: vec![DecodeCacheEntry::empty(); DECODE_CACHE_SIZE].into_boxed_slice(),
|
||||
hits: 0,
|
||||
misses: 0,
|
||||
invalidations: 0,
|
||||
}
|
||||
}
|
||||
|
||||
/// Look up (or fill) the decoded form of the instruction at `pc`.
|
||||
/// `raw` is the fetched instruction word; `current_page_version` is
|
||||
/// `mem.page_version(pc)` — the caller has it cheaper than we do,
|
||||
/// since they're already touching `mem` to fetch `raw`.
|
||||
#[inline]
|
||||
pub fn lookup(&mut self, pc: u32, raw: u32, current_page_version: u64) -> DecodedInstr {
|
||||
let idx = ((pc >> 2) & DECODE_CACHE_MASK) as usize;
|
||||
// Safety: `idx` is masked into `[0, DECODE_CACHE_SIZE)` so the
|
||||
// slice access is always in-bounds. Opt-out of the bounds check
|
||||
// for the hot path.
|
||||
let entry = unsafe { self.slots.get_unchecked_mut(idx) };
|
||||
if entry.pc == pc && entry.page_version == current_page_version {
|
||||
self.hits += 1;
|
||||
return entry.decoded;
|
||||
}
|
||||
if entry.pc == pc && entry.page_version != current_page_version {
|
||||
self.invalidations += 1;
|
||||
}
|
||||
self.misses += 1;
|
||||
let decoded = decode(raw, pc);
|
||||
*entry = DecodeCacheEntry {
|
||||
pc,
|
||||
page_version: current_page_version,
|
||||
decoded,
|
||||
};
|
||||
decoded
|
||||
}
|
||||
|
||||
pub fn hits(&self) -> u64 {
|
||||
self.hits
|
||||
}
|
||||
pub fn misses(&self) -> u64 {
|
||||
self.misses
|
||||
}
|
||||
pub fn invalidations(&self) -> u64 {
|
||||
self.invalidations
|
||||
}
|
||||
}
|
||||
|
||||
fn lookup_opcode(code: u32) -> PpcOpcode {
|
||||
match extract_bits(code, 0, 5) {
|
||||
2 => PpcOpcode::tdi,
|
||||
@@ -781,6 +919,57 @@ mod tests {
|
||||
assert_eq!(instr.d(), 0x20);
|
||||
}
|
||||
|
||||
#[test]
fn decode_cache_miss_fills_then_hit() {
    // Primary opcode 14 (`addi`), rd = 3, ra = 1, immediate 0x10.
    const RAW_ADDI: u32 = 0x3861_0010;
    const PC: u32 = 0x8200_0000;
    let mut cache = DecodeCache::new();

    // An empty cache has to decode: one miss, no hit.
    let first = cache.lookup(PC, RAW_ADDI, 1);
    assert_eq!(first.opcode, PpcOpcode::addi);
    assert_eq!(cache.hits(), 0);
    assert_eq!(cache.misses(), 1);

    // Repeating the lookup with the same PC and page version is
    // served from the slot: the hit count rises, the miss count
    // does not.
    let second = cache.lookup(PC, RAW_ADDI, 1);
    assert_eq!(second.opcode, PpcOpcode::addi);
    assert_eq!(cache.hits(), 1);
    assert_eq!(cache.misses(), 1);
}
|
||||
|
||||
#[test]
fn decode_cache_stale_version_refills() {
    // Primary opcode 14 (`addi`), rd = 3, ra = 1, immediate 0x10.
    const RAW_ADDI: u32 = 0x3861_0010;
    // Primary opcode 32 (`lwz`), rt = 5, ra = 1, displacement 0x20.
    const RAW_LWZ: u32 = 0x80A1_0020;
    const PC: u32 = 0x8200_0000;
    let mut cache = DecodeCache::new();

    // Populate the slot with an `addi` at page version 1.
    cache.lookup(PC, RAW_ADDI, 1);

    // Simulate the guest rewriting the page: the same PC now holds
    // different bytes and the page version has moved to 2. The cache
    // must decode the new word instead of returning the old `addi`.
    let refreshed = cache.lookup(PC, RAW_LWZ, 2);
    assert_eq!(refreshed.opcode, PpcOpcode::lwz);
    assert_eq!(cache.invalidations(), 1);
    assert_eq!(cache.misses(), 2);
}
|
||||
|
||||
#[test]
fn decode_cache_pc_collision_refills() {
    // The slot index is `(pc >> 2) & 0xFFFF`, so two PCs that are
    // 4 * 2^16 = 0x4_0000 bytes apart land in the same slot. They must
    // evict each other rather than alias.
    const RAW_ADDI: u32 = 0x3861_0010;
    const RAW_LWZ: u32 = 0x80A1_0020;
    let first_pc = 0x8200_0000u32;
    let colliding_pc = first_pc.wrapping_add(0x0004_0000u32);
    let mut cache = DecodeCache::new();

    // Fill the slot from the first PC.
    cache.lookup(first_pc, RAW_ADDI, 1);
    // The colliding PC finds a foreign tag: miss, slot overwritten.
    cache.lookup(colliding_pc, RAW_LWZ, 1);
    // The first PC now finds a foreign tag as well: miss again, and
    // the result must be its own instruction.
    let back = cache.lookup(first_pc, RAW_ADDI, 1);

    assert_eq!(back.opcode, PpcOpcode::addi);
    assert_eq!(cache.misses(), 3);
}
|
||||
|
||||
#[test]
|
||||
fn test_decode_branch() {
|
||||
// b +0x100 => opcode 18, LI=0x40 (shifted left 2 = 0x100), AA=0, LK=0
|
||||
@@ -816,4 +1005,103 @@ mod tests {
|
||||
assert_eq!(extract_bits(0x8000_0000, 0, 0), 1);
|
||||
assert_eq!(extract_bits(0x0000_0001, 31, 31), 1);
|
||||
}
|
||||
|
||||
// VMX128 register-name extraction. Locks the canonical bit positions
|
||||
// (decoder.rs is the single source of truth — the analysis crate's
|
||||
// old `ppc.rs` had different positions, which produced wrong printed
|
||||
// register names; the bug was silent because the interpreter never
|
||||
// used those extractors). Each test sets exactly the bit slots the
|
||||
// accessor reads and asserts the assembled register number.
|
||||
|
||||
/// Assemble a raw instruction word from VMX128 register-field pieces.
///
/// Each value is shifted so that its least-significant bit lands on the
/// given PPC (MSB-0) bit position; PPC bit `i` is LSB position `31 - i`.
/// Note that `va21` and `vd21` are both placed at PPC bit 21, exactly as
/// callers of this helper have always seen.
fn vmx128_test_word(va21: u32, vd6_10: u32, vd21: u32, vd22: u32,
                    vb16_20: u32, vb28: u32, vb30: u32) -> u32 {
    // (field value, PPC bit position of the field's last bit)
    let placements = [
        (vd6_10, 10),
        (va21, 21),
        (vd21, 21),
        (vd22, 22),
        (vb16_20, 20),
        (vb28, 28),
        (vb30, 30),
    ];
    placements
        .iter()
        .fold(0, |word, &(value, ppc_bit)| word | (value << (31 - ppc_bit)))
}
|
||||
|
||||
#[test]
fn vmx128_vd128_low_5_bits_only() {
    // With PPC bits 21 and 22 clear, vd128 is just the 5-bit field at 6..10.
    (0..32u32).for_each(|reg| {
        let raw = reg << (31 - 10);
        let instr = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 };
        assert_eq!(instr.vd128(), reg as usize, "vd_lo={reg}");
    });
}
|
||||
|
||||
#[test]
fn vmx128_vd128_bit21_adds_32() {
    // Only PPC bit 21 set (low field zero, bit 22 clear): vd128 must be 32.
    let bit21 = 1u32 << (31 - 21);
    let instr = DecodedInstr { opcode: PpcOpcode::Invalid, raw: bit21, addr: 0 };
    assert_eq!(instr.vd128(), 32);
}
|
||||
|
||||
#[test]
fn vmx128_vd128_bit22_adds_64() {
    // Only PPC bit 22 set (low field zero, bit 21 clear): vd128 must be 64.
    let bit22 = 1u32 << (31 - 22);
    let instr = DecodedInstr { opcode: PpcOpcode::Invalid, raw: bit22, addr: 0 };
    assert_eq!(instr.vd128(), 64);
}
|
||||
|
||||
#[test]
fn vmx128_vd128_full_127() {
    // Low field = 31 plus both extension bits: 31 + 32 + 64 = 127.
    let low_field = 31u32 << (31 - 10);
    let bit21 = 1u32 << (31 - 21);
    let bit22 = 1u32 << (31 - 22);
    let instr = DecodedInstr {
        opcode: PpcOpcode::Invalid,
        raw: low_field | bit21 | bit22,
        addr: 0,
    };
    assert_eq!(instr.vd128(), 127);
}
|
||||
|
||||
#[test]
fn vmx128_va128_uses_bit29() {
    // This test sets the 5-bit field ending at PPC bit 10 to 7 and sets
    // PPC bit 29; the accessor must report 7 | 32 = 39.
    let low_field = 7u32 << (31 - 10);
    let bit29 = 1u32 << (31 - 29);
    let instr = DecodedInstr { opcode: PpcOpcode::Invalid, raw: low_field | bit29, addr: 0 };
    assert_eq!(instr.va128(), 39);
}
|
||||
|
||||
#[test]
fn vmx128_vb128_uses_bits28_and_30() {
    // vb128 = PPC bits 16..20 (low 5) + bit 28 (adds 32) + bit 30 (adds 64).
    let low_field = 5u32 << (31 - 20);
    let bit28 = 1u32 << (31 - 28);
    let bit30 = 1u32 << (31 - 30);
    let instr = DecodedInstr {
        opcode: PpcOpcode::Invalid,
        raw: low_field | bit28 | bit30,
        addr: 0,
    };
    assert_eq!(instr.vb128(), 5 | 32 | 64);
}
|
||||
|
||||
#[test]
fn vmx128_vs128_aliases_vd128() {
    // vs128 must report the same register number as vd128 for every encoding.
    for reg in [0u32, 31, 32, 64, 96, 127] {
        // Split the 7-bit register number into the three encoded pieces.
        let (low, bit21, bit22) = (reg & 0x1F, (reg >> 5) & 1, (reg >> 6) & 1);
        let raw = (low << (31 - 10)) | (bit21 << (31 - 21)) | (bit22 << (31 - 22));
        let instr = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 };
        let expected = reg as usize;

        assert_eq!(instr.vd128(), expected, "vd128 mismatch for r={reg}");
        assert_eq!(instr.vs128(), expected, "vs128 mismatch for r={reg}");
        assert_eq!(instr.vd128(), instr.vs128());
    }
}
|
||||
|
||||
#[test]
#[allow(dead_code)]
fn _vmx128_test_word_helper_compiles() {
    // Cross-check the word-building helper against the real accessor:
    // low field 5 with both extension bits set must read back as 5 + 32 + 64.
    let instr = DecodedInstr {
        opcode: PpcOpcode::Invalid,
        raw: vmx128_test_word(0, 5, 1, 1, 0, 0, 0),
        addr: 0,
    };
    assert_eq!(instr.vd128(), 5 | 32 | 64);
}
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
384
crates/xenia-cpu/src/fpscr.rs
Normal file
384
crates/xenia-cpu/src/fpscr.rs
Normal file
@@ -0,0 +1,384 @@
|
||||
//! FPSCR (Floating-Point Status and Control Register) maintenance.
|
||||
//!
|
||||
//! Scope per project plan: rounding modes honoured, plus the exception bits
|
||||
//! games actually read (FX, FEX, VX, OX, UX, ZX, XX, FI, FPRF). Enabled-
|
||||
//! exception dispatch (FE[0,1], VE/OE/UE/ZE/XE) is *not* modelled — games
|
||||
//! running on Xenon almost never take FP traps.
|
||||
//!
|
||||
//! Bit layout (PowerISA, MSB-0 numbering; stored in a u32 with bit 31 = MSB):
|
||||
//!
|
||||
//! | PPC bit | u32 mask | Name |
|
||||
//! |---------|-------------------------|-------------|
|
||||
//! | 0 | `1<<31` | FX |
|
||||
//! | 1 | `1<<30` | FEX |
|
||||
//! | 2 | `1<<29` | VX (summary)|
|
||||
//! | 3 | `1<<28` | OX |
|
||||
//! | 4 | `1<<27` | UX |
|
||||
//! | 5 | `1<<26` | ZX |
|
||||
//! | 6 | `1<<25` | XX |
|
||||
//! | 7 | `1<<24` | VXSNAN |
|
||||
//! | 8 | `1<<23` | VXISI |
|
||||
//! | 9 | `1<<22` | VXIDI |
|
||||
//! | 10 | `1<<21` | VXZDZ |
|
||||
//! | 11 | `1<<20` | VXIMZ |
|
||||
//! | 12 | `1<<19` | VXVC |
|
||||
//! | 13 | `1<<18` | FR |
|
||||
//! | 14 | `1<<17` | FI |
|
||||
//! | 15..19  | `0x1F<<12`              | FPRF (5 bits)|
|
||||
//! | 21 | `1<<10` | VXSOFT |
|
||||
//! | 22 | `1<<9` | VXSQRT |
|
||||
//! | 23 | `1<<8` | VXCVI |
|
||||
//! | 30..31 | `0x3` | RN (2 bits) |
|
||||
|
||||
use crate::context::PpcContext;
|
||||
|
||||
/// Mask for the single FPSCR flag at PowerISA (MSB-0) bit position
/// `ppc_bit`; PPC bit 0 is the most significant bit of the `u32`.
const fn msb0(ppc_bit: u32) -> u32 {
    1 << (31 - ppc_bit)
}

// Exception summary and sticky exception flags (PPC bits 0..6).
pub const FX: u32 = msb0(0);
pub const FEX: u32 = msb0(1);
pub const VX: u32 = msb0(2);
pub const OX: u32 = msb0(3);
pub const UX: u32 = msb0(4);
pub const ZX: u32 = msb0(5);
pub const XX: u32 = msb0(6);

// Invalid-operation cause flags (PPC bits 7..12).
pub const VXSNAN: u32 = msb0(7);
pub const VXISI: u32 = msb0(8);
pub const VXIDI: u32 = msb0(9);
pub const VXZDZ: u32 = msb0(10);
pub const VXIMZ: u32 = msb0(11);
pub const VXVC: u32 = msb0(12);

// Rounding status (PPC bits 13..14).
pub const FR: u32 = msb0(13);
pub const FI: u32 = msb0(14);

/// 5-bit result-class field occupying PPC bits 15..19.
pub const FPRF_MASK: u32 = 0x1F << (31 - 19);

// Remaining invalid-operation cause flags (PPC bits 21..23).
pub const VXSOFT: u32 = msb0(21);
pub const VXSQRT: u32 = msb0(22);
pub const VXCVI: u32 = msb0(23);

/// 2-bit rounding-mode field occupying PPC bits 30..31.
pub const RN_MASK: u32 = 0x3;

/// Union of every individual VX* cause bit; the VX summary bit is
/// recomputed from this set.
pub const VX_ALL: u32 =
    VXSNAN | VXISI | VXIDI | VXZDZ | VXIMZ | VXVC | VXSOFT | VXSQRT | VXCVI;
|
||||
|
||||
/// FPRF classification codes (5-bit, placed in FPSCR bits 15..19).
///
/// Each code is the class-descriptor bit `C` followed by the four
/// condition bits (FL, FG, FE, FU) = (less, greater, equal, unordered).
/// In the table below `C` is set for QNaN, both denormals and negative
/// zero, and clear for both normals, both infinities and positive zero.
pub mod fprf {
    const C: u8 = 0b1_0000;
    const FL: u8 = 0b0_1000;
    const FG: u8 = 0b0_0100;
    const FE: u8 = 0b0_0010;
    const FU: u8 = 0b0_0001;

    pub const QNAN: u8 = C | FU;
    pub const NEG_INF: u8 = FL | FU;
    pub const NEG_NORMAL: u8 = FL;
    pub const NEG_DENORMAL: u8 = C | FL;
    pub const NEG_ZERO: u8 = C | FE;
    pub const POS_ZERO: u8 = FE;
    pub const POS_DENORMAL: u8 = C | FG;
    pub const POS_NORMAL: u8 = FG;
    pub const POS_INF: u8 = FG | FU;
}
|
||||
|
||||
/// Rounding mode selected by the 2-bit FPSCR[RN] field. The discriminant
/// of each variant is its RN encoding.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RoundingMode {
    /// RN = 0b00.
    NearestEven = 0b00,
    /// RN = 0b01.
    TowardZero = 0b01,
    /// RN = 0b10.
    TowardPosInf = 0b10,
    /// RN = 0b11.
    TowardNegInf = 0b11,
}
|
||||
|
||||
pub fn rounding_mode(ctx: &PpcContext) -> RoundingMode {
|
||||
match ctx.fpscr & RN_MASK {
|
||||
0 => RoundingMode::NearestEven,
|
||||
1 => RoundingMode::TowardZero,
|
||||
2 => RoundingMode::TowardPosInf,
|
||||
_ => RoundingMode::TowardNegInf,
|
||||
}
|
||||
}
|
||||
|
||||
/// Classify a finite f64 into its FPRF 5-bit code.
|
||||
pub fn classify_fprf(v: f64) -> u8 {
|
||||
if v.is_nan() {
|
||||
fprf::QNAN
|
||||
} else if v.is_infinite() {
|
||||
if v.is_sign_negative() { fprf::NEG_INF } else { fprf::POS_INF }
|
||||
} else if v == 0.0 {
|
||||
if v.is_sign_negative() { fprf::NEG_ZERO } else { fprf::POS_ZERO }
|
||||
} else if v.is_subnormal() {
|
||||
if v.is_sign_negative() { fprf::NEG_DENORMAL } else { fprf::POS_DENORMAL }
|
||||
} else if v.is_sign_negative() { fprf::NEG_NORMAL } else { fprf::POS_NORMAL }
|
||||
}
|
||||
|
||||
/// Write FPRF into FPSCR, preserving other bits.
|
||||
pub fn set_fprf(ctx: &mut PpcContext, code: u8) {
|
||||
ctx.fpscr = (ctx.fpscr & !FPRF_MASK) | ((code as u32 & 0x1F) << 12);
|
||||
}
|
||||
|
||||
/// Set one or more exception bits on FPSCR, maintaining FX (sticky set on any
|
||||
/// new exception) and VX (summary of VX* bits).
|
||||
pub fn set_exception(ctx: &mut PpcContext, bits: u32) {
|
||||
let prev = ctx.fpscr;
|
||||
let new = prev | bits;
|
||||
// FX is sticky-set if any new non-sticky bit transitions to 1. PPC defines
|
||||
// FX as "any of OX, UX, ZX, XX, VX* newly set". Compute the transition set.
|
||||
let transition = (new & !prev) & (OX | UX | ZX | XX | VX_ALL);
|
||||
let mut updated = new;
|
||||
if transition != 0 {
|
||||
updated |= FX;
|
||||
}
|
||||
// Recompute VX summary from any VX* bits currently set.
|
||||
if (updated & VX_ALL) != 0 { updated |= VX; }
|
||||
ctx.fpscr = updated;
|
||||
}
|
||||
|
||||
/// Classify the inputs of a floating-point arithmetic op and set appropriate
|
||||
/// VX* bits. Returns true if any invalid-operation was detected (caller may
|
||||
/// want to write a default QNaN result).
|
||||
///
|
||||
/// Detected cases:
|
||||
/// * any SNaN input → VXSNAN
|
||||
/// * infinity - infinity (same sign) → VXISI
|
||||
/// * 0 / 0 → VXZDZ
|
||||
/// * infinity / infinity → VXIDI
|
||||
/// * 0 * infinity → VXIMZ
|
||||
pub fn check_invalid_add(ctx: &mut PpcContext, a: f64, b: f64, sub: bool) -> bool {
|
||||
let mut bits = 0u32;
|
||||
if is_snan(a) || is_snan(b) { bits |= VXSNAN; }
|
||||
if a.is_infinite() && b.is_infinite() {
|
||||
// For add: VXISI iff same-sign(a,b) negated — inf - inf
|
||||
// For sub: VXISI iff same-sign(a,b) — (+inf) - (+inf) or (-inf) - (-inf)
|
||||
let both_pos = a.is_sign_positive() && b.is_sign_positive();
|
||||
let both_neg = a.is_sign_negative() && b.is_sign_negative();
|
||||
if sub {
|
||||
if both_pos || both_neg { bits |= VXISI; }
|
||||
} else {
|
||||
// add: opposite signs cancel to inf-inf
|
||||
if a.is_sign_positive() != b.is_sign_positive() { bits |= VXISI; }
|
||||
}
|
||||
}
|
||||
if bits != 0 { set_exception(ctx, bits); return true; }
|
||||
false
|
||||
}
|
||||
|
||||
pub fn check_invalid_mul(ctx: &mut PpcContext, a: f64, b: f64) -> bool {
|
||||
let mut bits = 0u32;
|
||||
if is_snan(a) || is_snan(b) { bits |= VXSNAN; }
|
||||
let zero_times_inf =
|
||||
(a == 0.0 && b.is_infinite()) || (b == 0.0 && a.is_infinite());
|
||||
if zero_times_inf { bits |= VXIMZ; }
|
||||
if bits != 0 { set_exception(ctx, bits); return true; }
|
||||
false
|
||||
}
|
||||
|
||||
pub fn check_invalid_div(ctx: &mut PpcContext, a: f64, b: f64) -> bool {
|
||||
let mut bits = 0u32;
|
||||
if is_snan(a) || is_snan(b) { bits |= VXSNAN; }
|
||||
if a == 0.0 && b == 0.0 { bits |= VXZDZ; }
|
||||
if a.is_infinite() && b.is_infinite() { bits |= VXIDI; }
|
||||
if bits != 0 { set_exception(ctx, bits); return true; }
|
||||
false
|
||||
}
|
||||
|
||||
/// Divide-by-zero (finite nonzero / 0) — sets ZX but not VX.
|
||||
pub fn check_zero_divide(ctx: &mut PpcContext, a: f64, b: f64) {
|
||||
if b == 0.0 && a != 0.0 && !a.is_nan() && !a.is_infinite() {
|
||||
set_exception(ctx, ZX);
|
||||
}
|
||||
}
|
||||
|
||||
/// Post-op: classify the result and update FPRF + detect overflow/underflow/inexact.
|
||||
/// `inputs_finite` lets us suppress OX for ops whose output is infinite because
|
||||
/// an input already was.
|
||||
pub fn update_after_op(ctx: &mut PpcContext, result: f64, inputs_were_finite: bool) {
|
||||
let mut bits = 0u32;
|
||||
if result.is_infinite() && inputs_were_finite {
|
||||
bits |= OX;
|
||||
}
|
||||
if result.is_subnormal() {
|
||||
bits |= UX;
|
||||
}
|
||||
if bits != 0 { set_exception(ctx, bits); }
|
||||
set_fprf(ctx, classify_fprf(result));
|
||||
}
|
||||
|
||||
/// Report whether `x` is a signalling NaN.
///
/// In binary64 the top mantissa bit (bit 51) is the quiet bit: a NaN with
/// that bit set is quiet, a NaN with it clear is signalling. The rest of
/// the mantissa is necessarily nonzero for any NaN, so only the quiet bit
/// needs checking once `x` is known to be a NaN.
pub fn is_snan(x: f64) -> bool {
    const QUIET_BIT: u64 = 1 << 51;
    x.is_nan() && x.to_bits() & QUIET_BIT == 0
}
|
||||
|
||||
/// Round an f64 to f32 honouring FPSCR[RN]. Uses the current hardware
|
||||
/// rounding mode when RN=0 (nearest-even, the PPC default), otherwise
|
||||
/// emulates the directed rounding via bit-manipulation.
|
||||
pub fn round_to_single(ctx: &PpcContext, v: f64) -> f64 {
|
||||
match rounding_mode(ctx) {
|
||||
RoundingMode::NearestEven => (v as f32) as f64,
|
||||
RoundingMode::TowardZero => round_single_toward_zero(v) as f64,
|
||||
RoundingMode::TowardPosInf => round_single_toward_pos_inf(v) as f64,
|
||||
RoundingMode::TowardNegInf => round_single_toward_neg_inf(v) as f64,
|
||||
}
|
||||
}
|
||||
|
||||
/// Round an f64 to an i64 integer honouring FPSCR[RN]. Used by fctidx.
|
||||
pub fn round_to_i64(ctx: &PpcContext, v: f64) -> i64 {
|
||||
match rounding_mode(ctx) {
|
||||
RoundingMode::NearestEven => {
|
||||
// Round-half-to-even (banker's rounding).
|
||||
let r = v.round();
|
||||
// Rust's f64::round is round-half-away-from-zero. Correct ties to even:
|
||||
let diff = (v - v.trunc()).abs();
|
||||
if (diff - 0.5).abs() < f64::EPSILON {
|
||||
let floor = v.floor();
|
||||
if (floor as i64) & 1 == 0 { floor as i64 } else { v.ceil() as i64 }
|
||||
} else {
|
||||
r as i64
|
||||
}
|
||||
}
|
||||
RoundingMode::TowardZero => v.trunc() as i64,
|
||||
RoundingMode::TowardPosInf => v.ceil() as i64,
|
||||
RoundingMode::TowardNegInf => v.floor() as i64,
|
||||
}
|
||||
}
|
||||
|
||||
/// Round an f64 to an i32 integer honouring FPSCR[RN]. Used by fctiwx.
|
||||
pub fn round_to_i32(ctx: &PpcContext, v: f64) -> i32 {
|
||||
round_to_i64(ctx, v).clamp(i32::MIN as i64, i32::MAX as i64) as i32
|
||||
}
|
||||
|
||||
// ------ directed rounding helpers (f64 → f32) ------
|
||||
|
||||
/// Convert `v` to `f32`, rounding toward zero (truncation).
///
/// Starts from the default round-to-nearest-even conversion and, if that
/// moved away from zero, steps one ULP back toward zero. A finite `v`
/// whose magnitude exceeds the `f32` range saturates to `±f32::MAX`
/// rather than becoming infinite; true infinities and NaNs pass through.
fn round_single_toward_zero(v: f64) -> f32 {
    let rn = v as f32;
    if rn.is_nan() || rn == 0.0 {
        return rn;
    }
    if rn.is_infinite() {
        // Nearest-even overflowed; truncation of a finite value must stop
        // at the largest finite magnitude.
        return if v.is_finite() { f32::MAX.copysign(rn) } else { rn };
    }
    if f64::from(rn.abs()) <= v.abs() {
        return rn;
    }
    // The sign lives in the top bit, so decrementing the bit pattern
    // shrinks the magnitude by one ULP for either sign.
    f32::from_bits(rn.to_bits() - 1)
}
|
||||
|
||||
/// Convert `v` to `f32`, rounding toward +infinity.
///
/// Starts from the default round-to-nearest-even conversion and, if that
/// landed below `v`, steps one ULP upward. A finite `v` below the `f32`
/// range yields `-f32::MAX` (the nearest representable value that is not
/// below `v`) rather than `-inf`; positive overflow still yields `+inf`.
fn round_single_toward_pos_inf(v: f64) -> f32 {
    let rn = v as f32;
    if rn.is_nan() {
        return rn;
    }
    if rn.is_infinite() {
        // Only a negative overflow of a finite input needs correcting.
        return if rn < 0.0 && v.is_finite() { f32::MIN } else { rn };
    }
    if f64::from(rn) >= v {
        return rn;
    }
    // rn < v: move one ULP up. For negatives that means a smaller
    // magnitude (pattern - 1); for non-negatives a larger one (pattern + 1).
    let bits = rn.to_bits();
    f32::from_bits(if rn.is_sign_negative() { bits - 1 } else { bits + 1 })
}
|
||||
|
||||
/// Convert `v` to `f32`, rounding toward -infinity.
///
/// Starts from the default round-to-nearest-even conversion and, if that
/// landed above `v`, steps one ULP downward. A finite `v` above the `f32`
/// range yields `f32::MAX` (the nearest representable value that is not
/// above `v`) rather than `+inf`; negative overflow still yields `-inf`.
fn round_single_toward_neg_inf(v: f64) -> f32 {
    let rn = v as f32;
    if rn.is_nan() {
        return rn;
    }
    if rn.is_infinite() {
        // Only a positive overflow of a finite input needs correcting.
        return if rn > 0.0 && v.is_finite() { f32::MAX } else { rn };
    }
    if f64::from(rn) <= v {
        return rn;
    }
    // rn > v: move one ULP down. For negatives that means a larger
    // magnitude (pattern + 1); for positives a smaller one (pattern - 1).
    let bits = rn.to_bits();
    f32::from_bits(if rn.is_sign_negative() { bits + 1 } else { bits - 1 })
}
|
||||
|
||||
/// Drop-in replacement for the old `update_cr1_from_fpscr`. Reads the
|
||||
/// currently-maintained FPSCR bits (FX, FEX, VX, OX) into CR1.
|
||||
pub fn update_cr1(ctx: &mut PpcContext) {
|
||||
ctx.cr[1].lt = (ctx.fpscr & FX) != 0;
|
||||
ctx.cr[1].gt = (ctx.fpscr & FEX) != 0;
|
||||
ctx.cr[1].eq = (ctx.fpscr & VX) != 0;
|
||||
ctx.cr[1].so = (ctx.fpscr & OX) != 0;
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// A freshly constructed context, as the interpreter would start with.
    fn fresh() -> PpcContext {
        PpcContext::new()
    }

    #[test]
    fn rn_default_is_nearest() {
        let c = fresh();
        assert_eq!(rounding_mode(&c), RoundingMode::NearestEven);
    }

    #[test]
    fn rn_bits_decode() {
        let expectations = [
            (0x1, RoundingMode::TowardZero),
            (0x2, RoundingMode::TowardPosInf),
            (0x3, RoundingMode::TowardNegInf),
        ];
        let mut c = fresh();
        for (rn, mode) in expectations {
            c.fpscr = rn;
            assert_eq!(rounding_mode(&c), mode);
        }
    }

    #[test]
    fn fprf_classifies_correctly() {
        let expectations = [
            (1.0, fprf::POS_NORMAL),
            (-1.0, fprf::NEG_NORMAL),
            (0.0, fprf::POS_ZERO),
            (-0.0, fprf::NEG_ZERO),
            (f64::INFINITY, fprf::POS_INF),
            (f64::NEG_INFINITY, fprf::NEG_INF),
            (f64::NAN, fprf::QNAN),
            // Bit pattern 1 is the smallest positive subnormal.
            (f64::from_bits(1), fprf::POS_DENORMAL),
        ];
        for (value, class) in expectations {
            assert_eq!(classify_fprf(value), class);
        }
    }

    #[test]
    fn fx_is_sticky_on_new_exception() {
        let mut c = fresh();
        set_exception(&mut c, OX);
        assert_ne!(c.fpscr & FX, 0);

        // Software clears both FX and OX by hand ...
        c.fpscr &= !(FX | OX);

        // ... so raising OX again is a fresh 0 -> 1 transition and must
        // latch FX once more.
        set_exception(&mut c, OX);
        assert_ne!(c.fpscr & FX, 0);
    }

    #[test]
    fn vx_summary_set_on_any_vx_bit() {
        let mut c = fresh();
        set_exception(&mut c, VXSNAN);
        let fpscr = c.fpscr;
        assert_ne!(fpscr & VX, 0);
        assert_ne!(fpscr & VXSNAN, 0);
    }

    #[test]
    fn round_to_single_nearest_is_identity_on_representable() {
        let c = fresh();
        let rounded = round_to_single(&c, 1.0_f64);
        assert_eq!(rounded, 1.0_f64);
    }

    #[test]
    fn round_to_i32_clamps_out_of_range() {
        let c = fresh();
        for (input, clamped) in [(1e20_f64, i32::MAX), (-1e20_f64, i32::MIN)] {
            assert_eq!(round_to_i32(&c, input), clamped);
        }
    }

    #[test]
    fn round_to_i64_nearest_even_on_tie() {
        let c = fresh();
        for (input, rounded) in [(2.5_f64, 2), (3.5_f64, 4), (-2.5_f64, -2)] {
            assert_eq!(round_to_i64(&c, input), rounded);
        }
    }

    #[test]
    fn check_invalid_add_detects_inf_minus_inf() {
        let mut c = fresh();
        let flagged = check_invalid_add(&mut c, f64::INFINITY, f64::INFINITY, true);
        assert!(flagged);
        assert_ne!(c.fpscr & VXISI, 0);
    }

    #[test]
    fn check_invalid_div_detects_zero_over_zero() {
        let mut c = fresh();
        let flagged = check_invalid_div(&mut c, 0.0, 0.0);
        assert!(flagged);
        assert_ne!(c.fpscr & VXZDZ, 0);
    }

    #[test]
    fn snan_detection() {
        // Binary64 SNaN: sign = 0, exponent all ones, nonzero mantissa with
        // bit 51 clear.
        let signalling = f64::from_bits(0x7FF0_0000_0000_0001);
        assert!(is_snan(signalling));
        assert!(!is_snan(f64::NAN));
    }
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,9 +1,25 @@
|
||||
pub mod block_cache;
|
||||
pub mod context;
|
||||
pub mod decoder;
|
||||
pub mod disasm;
|
||||
pub mod fpscr;
|
||||
pub mod interpreter;
|
||||
pub mod opcode;
|
||||
pub mod overflow;
|
||||
pub mod phaser;
|
||||
pub mod reservation;
|
||||
pub mod scheduler;
|
||||
pub mod trap;
|
||||
pub mod vmx;
|
||||
|
||||
pub use context::PpcContext;
|
||||
pub use decoder::decode;
|
||||
pub use disasm::{DisasmItem, DisasmText, disassemble, format as disasm_format, iter_disasm};
|
||||
pub use opcode::PpcOpcode;
|
||||
pub use phaser::{Phaser, PhaserOutcome};
|
||||
pub use reservation::ReservationTable;
|
||||
pub use scheduler::{
|
||||
BlockReason, GuestThread, HwSlot, HwState, MigrationFixup, OrderMode, PcrWriter, RoundOutcome,
|
||||
Scheduler, SpawnError, SpawnParams, ThreadRef, HW_THREAD_COUNT, INITIAL_GUEST_TID,
|
||||
QUANTUM_DEFAULT,
|
||||
};
|
||||
|
||||
@@ -145,6 +145,33 @@ impl PpcOpcode {
|
||||
matches!(self, Self::sc)
|
||||
}
|
||||
|
||||
/// Returns true if this opcode unconditionally ends a basic block:
|
||||
/// any branch, system call, trap, or `Invalid` (decoder couldn't
|
||||
/// recognize the instruction — execution will hit the
|
||||
/// `Unimplemented` arm and we don't want to swallow the boundary
|
||||
/// inside a cached block).
|
||||
///
|
||||
/// Notably *not* terminating: `mtmsr`/`mtmsrd`/`isync`/`mfmsr`.
|
||||
/// On real hardware these have synchronization semantics (a context
|
||||
/// synchronizing event for `isync`, MSR rewrite for the `mt*`s) but
|
||||
/// our interpreter has no asynchronous-exception model and no
|
||||
/// out-of-order execution — they execute as plain ALU/move ops and
|
||||
/// don't change control flow synchronously. Block-cache replay is
|
||||
/// still bit-for-bit identical to per-instruction dispatch for
|
||||
/// those.
|
||||
///
|
||||
/// Used by the basic-block cache (`block_cache.rs`) to know when to
|
||||
/// stop accumulating instructions during a forward decode walk.
|
||||
pub fn terminates_block(&self) -> bool {
|
||||
matches!(
|
||||
self,
|
||||
Self::bx | Self::bcx | Self::bclrx | Self::bcctrx
|
||||
| Self::sc
|
||||
| Self::td | Self::tdi | Self::tw | Self::twi
|
||||
| Self::Invalid
|
||||
)
|
||||
}
|
||||
|
||||
/// Returns true if this is a load instruction.
|
||||
pub fn is_load(&self) -> bool {
|
||||
matches!(self,
|
||||
@@ -194,3 +221,60 @@ impl std::fmt::Display for PpcOpcode {
|
||||
std::fmt::Debug::fmt(self, f)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Assert that every opcode in `ops` reports `expected` from
    /// `terminates_block`, naming the offender on failure.
    fn assert_terminates(ops: &[PpcOpcode], expected: bool) {
        for op in ops {
            assert_eq!(op.terminates_block(), expected, "{op:?}");
        }
    }

    #[test]
    fn terminates_block_includes_all_branches() {
        assert_terminates(
            &[PpcOpcode::bx, PpcOpcode::bcx, PpcOpcode::bclrx, PpcOpcode::bcctrx],
            true,
        );
    }

    #[test]
    fn terminates_block_includes_sc_and_traps() {
        assert_terminates(
            &[
                PpcOpcode::sc,
                PpcOpcode::td,
                PpcOpcode::tdi,
                PpcOpcode::tw,
                PpcOpcode::twi,
            ],
            true,
        );
    }

    #[test]
    fn terminates_block_includes_invalid() {
        // A decode failure has to end the block; otherwise the unknown word
        // would be replayed from a cached block and bypass the
        // per-instruction Unimplemented path.
        assert_terminates(&[PpcOpcode::Invalid], true);
    }

    #[test]
    fn terminates_block_excludes_straight_line_ops() {
        // Everyday ALU and load/store opcodes must leave the block open.
        assert_terminates(
            &[
                PpcOpcode::addi,
                PpcOpcode::addis,
                PpcOpcode::addx,
                PpcOpcode::cmpi,
                PpcOpcode::cmp,
                PpcOpcode::lwz,
                PpcOpcode::stw,
                PpcOpcode::lbzx,
                PpcOpcode::ori,
                PpcOpcode::oris,
                PpcOpcode::rlwinmx,
            ],
            false,
        );
    }

    #[test]
    fn terminates_block_excludes_msr_and_sync_ops() {
        // Documented decision: synchronizing ops run as ALU ops inside a
        // block because the interpreter has no async-exception model.
        assert_terminates(
            &[
                PpcOpcode::mtmsr,
                PpcOpcode::mtmsrd,
                PpcOpcode::isync,
                PpcOpcode::sync,
                PpcOpcode::mfmsr,
            ],
            false,
        );
    }
}
|
||||
|
||||
173
crates/xenia-cpu/src/overflow.rs
Normal file
173
crates/xenia-cpu/src/overflow.rs
Normal file
@@ -0,0 +1,173 @@
|
||||
//! OE / XER[OV] / XER[SO] handling for integer arithmetic.
|
||||
//!
|
||||
//! PPC integer ops with the OE bit set update XER[OV] (overflow) and sticky-set
|
||||
//! XER[SO]. When OE is clear the instruction leaves XER untouched. Signed
|
||||
//! overflow is predicated on the operation width and operand signs per the
|
||||
//! PowerISA pseudocode. For 32-bit-word operations (`addw`, `mullw`, `divw`,
|
||||
//! `neg`, etc. — on PPC these all have `w` in the mnemonic in spec
|
||||
//! descriptions even when the assembler spells them without) the predicate
|
||||
//! uses the low 32 bits. For 64-bit operations (`add`, `mulld`, `divd`) the
|
||||
//! predicate uses the full 64 bits.
|
||||
|
||||
use crate::context::PpcContext;
|
||||
|
||||
#[inline]
|
||||
pub fn apply(ctx: &mut PpcContext, overflowed: bool) {
|
||||
if overflowed {
|
||||
ctx.xer_ov = 1;
|
||||
ctx.xer_so = 1;
|
||||
} else {
|
||||
ctx.xer_ov = 0;
|
||||
}
|
||||
}
|
||||
|
||||
/// Signed-overflow predicate for 64-bit addition (`add`, `addc`, ...).
///
/// Overflow happened iff both inputs share a sign and `result` has the
/// other sign. Callers doing subtraction should either rewrite it as an
/// addition of the complemented operand or use [`sub_ov_64`].
#[inline]
pub fn add_ov_64(a: u64, b: u64, result: u64) -> bool {
    const SIGN: u64 = 1 << 63;
    let inputs_share_sign = (a ^ b) & SIGN == 0;
    let result_sign_flipped = (a ^ result) & SIGN != 0;
    inputs_share_sign && result_sign_flipped
}
|
||||
|
||||
/// General signed-overflow predicate for 64-bit arithmetic.
///
/// `true_sum` is the mathematically exact signed value computed in `i128`
/// by the caller; `result` is the 64-bit value actually stored. Overflow
/// is reported iff the two disagree, i.e. the exact value does not fit in
/// an `i64` (or `result` is not its 64-bit image).
///
/// Intended for multi-term chains (`adde`, `addme`, `addze`, `subfe`,
/// `subfme`, `subfze`) where the carry-in makes a sign-bit predicate
/// awkward.
#[inline]
pub fn sum_overflow_64(true_sum: i128, result: u64) -> bool {
    match i64::try_from(true_sum) {
        Ok(fits) => fits as u64 != result,
        Err(_) => true,
    }
}
|
||||
|
||||
/// Signed-overflow predicate for 64-bit subtraction `RT = b - a`.
///
/// Overflow happened iff the inputs have opposite signs and `result` does
/// not carry `b`'s sign.
#[inline]
pub fn sub_ov_64(a: u64, b: u64, result: u64) -> bool {
    const SIGN: u64 = 1 << 63;
    let inputs_differ_in_sign = (a ^ b) & SIGN != 0;
    let result_left_minuend_sign = (b ^ result) & SIGN != 0;
    inputs_differ_in_sign && result_left_minuend_sign
}
|
||||
|
||||
/// Signed-overflow predicate for the `addc`/`adde` carry chain.
///
/// Applies the same sign rule as plain 64-bit addition: the carry-in is
/// already folded into `result`, so overflow happened iff the inputs share
/// a sign and `result` has the other one.
#[inline]
pub fn adde_ov_64(a: u64, b: u64, result: u64) -> bool {
    let negative = |x: u64| (x as i64) < 0;
    negative(a) == negative(b) && negative(a) != negative(result)
}
|
||||
|
||||
/// Signed 32-bit multiply overflow (`mullwo`): reports whether the full
/// 64-bit `product` falls outside the `i32` range, i.e. whether
/// sign-extending its low 32 bits fails to reproduce it.
#[inline]
pub fn mullw_ov(product: i64) -> bool {
    i32::try_from(product).is_err()
}
|
||||
|
||||
/// Signed 64-bit multiply overflow (`mulldo`): true when `a * b` does not
/// fit in an `i64`.
#[inline]
pub fn mulld_ov(a: i64, b: i64) -> bool {
    let (_, overflowed) = a.overflowing_mul(b);
    overflowed
}
|
||||
|
||||
/// Overflow predicate for signed 32-bit division (`divwo`).
///
/// The divide family (`divwo` / `divwuo` / `divdo` / `divduo`) raises OV
/// in two situations:
///   * the divisor is zero, or
///   * (signed forms only) `INT_MIN / -1`, whose quotient does not fit.
#[inline]
pub fn divw_ov_signed(ra: i32, rb: i32) -> bool {
    // `checked_div` is `None` for exactly those two situations.
    ra.checked_div(rb).is_none()
}
|
||||
|
||||
/// Overflow predicate for unsigned 32-bit division: only a zero divisor
/// overflows.
#[inline]
pub fn divw_ov_unsigned(rb: u32) -> bool {
    matches!(rb, 0)
}
|
||||
|
||||
/// Overflow predicate for signed 64-bit division: true for a zero divisor
/// or for `i64::MIN / -1`, whose quotient does not fit.
#[inline]
pub fn divd_ov_signed(ra: i64, rb: i64) -> bool {
    // `checked_div` is `None` for exactly those two situations.
    ra.checked_div(rb).is_none()
}
|
||||
|
||||
/// Overflow predicate for unsigned 64-bit division: only a zero divisor
/// overflows.
#[inline]
pub fn divd_ov_unsigned(rb: u64) -> bool {
    matches!(rb, 0)
}
|
||||
|
||||
/// `negx` overflow: `RT = -(RA)` overflows only when RA holds the most
/// negative 64-bit value, whose negation is not representable.
#[inline]
pub fn neg_ov_64(ra: u64) -> bool {
    (ra as i64).checked_neg().is_none()
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn add_no_overflow() {
        for (a, b, sum) in [(1u64, 2u64, 3u64), (u64::MAX, 0, u64::MAX)] {
            assert!(!add_ov_64(a, b, sum));
        }
    }

    #[test]
    fn add_positive_overflow() {
        // INT64_MAX + 1 wraps to INT64_MIN: signed overflow.
        let (lhs, rhs) = (i64::MAX as u64, 1u64);
        let wrapped = lhs.wrapping_add(rhs);
        assert!(add_ov_64(lhs, rhs, wrapped));
    }

    #[test]
    fn add_negative_overflow() {
        // INT64_MIN + (-1) wraps to INT64_MAX: signed overflow.
        let (lhs, rhs) = (i64::MIN as u64, (-1i64) as u64);
        let wrapped = lhs.wrapping_add(rhs);
        assert!(add_ov_64(lhs, rhs, wrapped));
    }

    #[test]
    fn sub_overflow_min_minus_pos() {
        // INT64_MIN - 1 overflows.
        let minuend = i64::MIN as u64;
        let subtrahend = 1u64;
        let wrapped = minuend.wrapping_sub(subtrahend);
        assert!(sub_ov_64(subtrahend, minuend, wrapped));
    }

    #[test]
    fn sub_no_overflow() {
        let minuend = 5u64;
        let subtrahend = 2u64;
        let difference = minuend.wrapping_sub(subtrahend);
        assert!(!sub_ov_64(subtrahend, minuend, difference));
    }

    #[test]
    fn mullw_fits_32_bits() {
        for product in [(i32::MAX as i64) * 1, -1i64] {
            assert!(!mullw_ov(product));
        }
    }

    #[test]
    fn mullw_overflows_32_bits() {
        let doubled_max = (i32::MAX as i64) * 2;
        assert!(mullw_ov(doubled_max));
    }

    #[test]
    fn mulld_overflows() {
        assert!(mulld_ov(i64::MAX, 2));
        assert!(!mulld_ov(i64::MAX, 1));
    }

    #[test]
    fn neg_ov_only_at_min() {
        assert!(neg_ov_64(i64::MIN as u64));
        for ra in [0u64, 1] {
            assert!(!neg_ov_64(ra));
        }
    }
}
|
||||
345
crates/xenia-cpu/src/phaser.rs
Normal file
345
crates/xenia-cpu/src/phaser.rs
Normal file
@@ -0,0 +1,345 @@
|
||||
//! Quantum-boundary phaser for the M3 per-HW-thread parallel scheduler.
|
||||
//!
|
||||
//! Six [`super::HW_THREAD_COUNT`] host threads run their slots' interpreters
|
||||
//! in parallel, then meet at a phaser to advance to the next quantum. This
|
||||
//! is **not** [`std::sync::Barrier`]: a Barrier needs a fixed party count,
|
||||
//! but our slots can become idle (no runnable thread) and shouldn't block
|
||||
//! the phaser arrival.
|
||||
//!
|
||||
//! ## Semantics
|
||||
//!
|
||||
//! - Each slot at the end of its quantum either calls
|
||||
//! [`Phaser::arrive_and_wait`] (it has a runnable thread to run next
|
||||
//! quantum) or [`Phaser::skip`] (it's idle this round and will wake on
|
||||
//! `slot_wake[i]`).
|
||||
//! - The phase advances when **all 6 slots have either arrived or
|
||||
//! skipped**. Arrived slots block until the advance; skipped slots
|
||||
//! return immediately and re-poll their wake state.
|
||||
//! - The phaser uses a generation counter so a slot that arrives "early"
|
||||
//! in the next phase doesn't see the prior phase's "all arrived"
|
||||
//! condition.
|
||||
//! - Defensive timeout: [`Phaser::arrive_and_wait_timeout`] returns
|
||||
//! [`PhaserOutcome::Timeout`] if a peer crashes / hangs. Callers
|
||||
//! typically convert this into a graceful shutdown rather than
|
||||
//! panicking, so the rest of the topology can tear down cleanly.
|
||||
//!
|
||||
//! ## Memory ordering
|
||||
//!
|
||||
//! - The participant counter (`Inner::arrived_or_skipped`) is a plain `u32`
//! guarded by the `inner` mutex; the lock orders the increments so the
//! last-to-arrive thread sees a consistent "everyone is here" snapshot.
|
||||
//! - The generation `phase` is read with `Acquire` in arrivers' wait
|
||||
//! loops; the advancing thread stores with `Release` after bumping.
|
||||
//! - The condvar's broadcast publishes the phase; the wait loop
|
||||
//! re-checks `phase` against its captured value to defend against
|
||||
//! spurious wakeups.
|
||||
|
||||
use std::sync::atomic::{AtomicU32, Ordering};
|
||||
use std::sync::{Condvar, Mutex};
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
/// Outcome of a phaser arrival.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum PhaserOutcome {
    /// All participants arrived/skipped — phase advanced. Caller proceeds
    /// into the next quantum.
    Advanced,
    /// Defensive timeout fired before all peers arrived. Caller should
    /// log + initiate shutdown rather than retry.
    Timeout,
    /// Phaser was shut down via [`Phaser::shutdown`]; all waiters are
    /// woken and return this. Caller exits cleanly.
    Shutdown,
}

/// Custom barrier-with-skip primitive. Construct once with the number of
/// participating slots; share via `Arc` across host threads.
#[derive(Debug)]
pub struct Phaser {
    /// Total participant count (constant after construction): the number
    /// of arrive/skip contributions required before a phase advances.
    party_count: u32,
    /// Monotonic (wrapping) phase counter, incremented every time the
    /// phase advances. Used as a generation marker so a slot that wakes
    /// "into" the next phase doesn't observe the old "everyone arrived"
    /// state. Only modified while `inner` is locked.
    phase: AtomicU32,
    /// Inner state guarded by the condvar's mutex.
    inner: Mutex<Inner>,
    /// Notified when a phase advances or shutdown fires.
    cv: Condvar,
}

/// Mutex-protected bookkeeping shared by all participants.
#[derive(Debug)]
struct Inner {
    /// Contributions (arrivals + skips) received for the current phase.
    arrived_or_skipped: u32,
    /// Set once by [`Phaser::shutdown`]; never cleared.
    shutdown: bool,
}

impl Phaser {
    /// Timeout applied by [`Self::arrive_and_wait`].
    const DEFAULT_ARRIVE_TIMEOUT: Duration = Duration::from_secs(5);

    /// Create a phaser with `party_count` participants.
    ///
    /// # Panics
    ///
    /// Panics if `party_count == 0`.
    pub fn new(party_count: u32) -> Self {
        assert!(party_count > 0, "phaser party_count must be > 0");
        Self {
            party_count,
            phase: AtomicU32::new(0),
            inner: Mutex::new(Inner {
                arrived_or_skipped: 0,
                shutdown: false,
            }),
            cv: Condvar::new(),
        }
    }

    /// Get the current phase number. Useful for tests and observability.
    pub fn current_phase(&self) -> u32 {
        self.phase.load(Ordering::Acquire)
    }

    /// Mark this slot as not participating in the current phase. Counts
    /// toward the advance threshold but does not block.
    ///
    /// `_slot_id` is informational (not stored); the parameter exists so
    /// call sites stay greppable.
    ///
    /// # Panics
    ///
    /// Panics if the internal mutex is poisoned.
    pub fn skip(&self, _slot_id: u8) {
        self.contribute_advance();
    }

    /// Block until the phase advances or the defensive 5-second timeout
    /// fires. Returns [`PhaserOutcome::Advanced`] on a clean phase
    /// transition, [`PhaserOutcome::Timeout`] if the timeout elapsed
    /// first, and [`PhaserOutcome::Shutdown`] on tear-down.
    ///
    /// `_slot_id` is informational (see [`Self::skip`]).
    ///
    /// # Panics
    ///
    /// Panics if the internal mutex is poisoned.
    pub fn arrive_and_wait(&self, _slot_id: u8) -> PhaserOutcome {
        self.arrive_and_wait_timeout(Self::DEFAULT_ARRIVE_TIMEOUT)
    }

    /// Same as [`Self::arrive_and_wait`] with a caller-supplied timeout.
    ///
    /// The arrival is counted even when the call ends in
    /// [`PhaserOutcome::Timeout`] or [`PhaserOutcome::Shutdown`]; the
    /// contribution is not withdrawn. When shutdown has been requested,
    /// `Shutdown` takes precedence over `Advanced`.
    ///
    /// A `timeout` too large to be represented as a deadline (for example
    /// `Duration::MAX`) is treated as "wait without a deadline".
    ///
    /// # Panics
    ///
    /// Panics if the internal mutex is poisoned.
    pub fn arrive_and_wait_timeout(&self, timeout: Duration) -> PhaserOutcome {
        // `None` = deadline not representable; wait untimed instead of
        // panicking on `Instant + Duration` overflow.
        let deadline = Instant::now().checked_add(timeout);

        let mut guard = self.inner.lock().unwrap();
        // Capture the generation under the same lock acquisition as the
        // contribution, so the value we compare against is exactly the
        // phase our arrival was counted toward.
        let pre_phase = self.phase.load(Ordering::Acquire);
        self.contribute_locked(&mut guard);

        loop {
            if guard.shutdown {
                return PhaserOutcome::Shutdown;
            }
            if self.phase.load(Ordering::Acquire) != pre_phase {
                return PhaserOutcome::Advanced;
            }
            // Every wakeup (notification, spurious, or timed out) goes
            // back through the predicate checks above before a timeout is
            // reported.
            guard = match deadline {
                None => self.cv.wait(guard).unwrap(),
                Some(deadline) => {
                    let remaining = deadline.saturating_duration_since(Instant::now());
                    if remaining.is_zero() {
                        return PhaserOutcome::Timeout;
                    }
                    self.cv.wait_timeout(guard, remaining).unwrap().0
                }
            };
        }
    }

    /// Wake every parked arriver and signal shutdown. After this, all
    /// future and outstanding `arrive_and_wait_*` calls return
    /// [`PhaserOutcome::Shutdown`].
    ///
    /// # Panics
    ///
    /// Panics if the internal mutex is poisoned.
    pub fn shutdown(&self) {
        let mut guard = self.inner.lock().unwrap();
        guard.shutdown = true;
        self.cv.notify_all();
    }

    /// Lock the inner state and record one contribution (used by
    /// [`Self::skip`]).
    fn contribute_advance(&self) {
        let mut guard = self.inner.lock().unwrap();
        self.contribute_locked(&mut guard);
    }

    /// Record one contribution while the caller holds the `inner` lock;
    /// if it was the last one needed, reset the counter, bump the phase
    /// and wake all waiters.
    fn contribute_locked(&self, inner: &mut Inner) {
        inner.arrived_or_skipped += 1;
        if inner.arrived_or_skipped >= self.party_count {
            inner.arrived_or_skipped = 0;
            // `Release` on the phase bump pairs with the `Acquire` loads
            // in `current_phase` and the wait-loop predicate.
            self.phase.fetch_add(1, Ordering::Release);
            self.cv.notify_all();
        }
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use std::sync::Arc;
    use std::sync::atomic::AtomicU32;
    use std::thread;

    /// Start `body` on a freshly spawned host thread carrying `name`.
    fn spawn_named<T, F>(name: String, body: F) -> thread::JoinHandle<T>
    where
        F: FnOnce() -> T + Send + 'static,
        T: Send + 'static,
    {
        thread::Builder::new().name(name).spawn(body).unwrap()
    }

    /// With all N participants arriving, the phase advances once and each
    /// arriver reports `Advanced`.
    #[test]
    fn n_arrivers_all_advance() {
        const N: u32 = 6;
        let phaser = Arc::new(Phaser::new(N));
        let workers: Vec<_> = (0..N)
            .map(|slot| {
                let phaser = Arc::clone(&phaser);
                spawn_named(format!("phaser-test-{slot}"), move || {
                    phaser.arrive_and_wait(slot as u8)
                })
            })
            .collect();
        for worker in workers {
            assert_eq!(worker.join().unwrap(), PhaserOutcome::Advanced);
        }
        assert_eq!(phaser.current_phase(), 1);
    }

    /// Five arrivals plus one skip complete the phase; the arrivers
    /// report `Advanced`.
    #[test]
    fn skip_counts_toward_advance() {
        const N: u32 = 6;
        let phaser = Arc::new(Phaser::new(N));
        let workers: Vec<_> = (0..(N - 1))
            .map(|slot| {
                let phaser = Arc::clone(&phaser);
                spawn_named(format!("phaser-arrive-{slot}"), move || {
                    phaser.arrive_and_wait(slot as u8)
                })
            })
            .collect();
        // Give the arrivers a moment to park so the skip is what releases
        // them.
        thread::sleep(Duration::from_millis(20));
        phaser.skip((N - 1) as u8);
        for worker in workers {
            assert_eq!(worker.join().unwrap(), PhaserOutcome::Advanced);
        }
        assert_eq!(phaser.current_phase(), 1);
    }

    /// Parked arrivers are released by `shutdown` and report `Shutdown`.
    #[test]
    fn shutdown_wakes_arrivers() {
        const N: u32 = 6;
        let phaser = Arc::new(Phaser::new(N));
        // One participant short, so the phase cannot advance by itself.
        let workers: Vec<_> = (0..(N - 1))
            .map(|slot| {
                let phaser = Arc::clone(&phaser);
                spawn_named(format!("phaser-arrive-shutdown-{slot}"), move || {
                    phaser.arrive_and_wait(slot as u8)
                })
            })
            .collect();
        thread::sleep(Duration::from_millis(20));
        phaser.shutdown();
        for worker in workers {
            assert_eq!(worker.join().unwrap(), PhaserOutcome::Shutdown);
        }
    }

    /// When some peers never show up, the waiting arrivers give up with
    /// `Timeout` instead of blocking forever.
    #[test]
    fn timeout_fires_when_peer_hangs() {
        const N: u32 = 4;
        let phaser = Arc::new(Phaser::new(N));
        // Two of the four parties arrive; the other two never do.
        let first = {
            let phaser = Arc::clone(&phaser);
            thread::spawn(move || phaser.arrive_and_wait_timeout(Duration::from_millis(50)))
        };
        let second = {
            let phaser = Arc::clone(&phaser);
            thread::spawn(move || phaser.arrive_and_wait_timeout(Duration::from_millis(50)))
        };
        assert_eq!(first.join().unwrap(), PhaserOutcome::Timeout);
        assert_eq!(second.join().unwrap(), PhaserOutcome::Timeout);
    }

    /// Multi-phase stress: every participant calls `arrive_and_wait` K
    /// times in a tight loop; afterwards the phase counter reads K.
    /// Guards against generation/counter resync bugs.
    #[test]
    fn multi_phase_progress() {
        const N: u32 = 6;
        const K: u32 = 1000;
        let phaser = Arc::new(Phaser::new(N));
        let finished = Arc::new(AtomicU32::new(0));
        let workers: Vec<_> = (0..N)
            .map(|slot| {
                let phaser = Arc::clone(&phaser);
                let finished = Arc::clone(&finished);
                spawn_named(format!("phaser-multi-{slot}"), move || {
                    for _ in 0..K {
                        assert_eq!(
                            phaser.arrive_and_wait(slot as u8),
                            PhaserOutcome::Advanced
                        );
                    }
                    finished.fetch_add(1, Ordering::Relaxed);
                })
            })
            .collect();
        for worker in workers {
            worker.join().unwrap();
        }
        assert_eq!(finished.load(Ordering::Relaxed), N);
        assert_eq!(phaser.current_phase(), K);
    }

    /// Interleaves skips and arrivals across many iterations, following a
    /// deterministic pseudo-random pattern per slot.
    #[test]
    fn mixed_skip_and_arrive_random() {
        const N: u32 = 6;
        const K: u32 = 200;
        let phaser = Arc::new(Phaser::new(N));
        let workers: Vec<_> = (0..N)
            .map(|slot| {
                let phaser = Arc::clone(&phaser);
                spawn_named(format!("phaser-mixed-{slot}"), move || {
                    // Small multiplicative generator seeded from the slot
                    // and stirred with the iteration number.
                    let mut state: u32 = 0x9E37_79B9u32.wrapping_add(slot);
                    for phase in 0..K {
                        state = state.wrapping_mul(0x6C8E_9CF7).wrapping_add(phase);
                        let skip_this_round = state & 0xF == 0;
                        if skip_this_round {
                            phaser.skip(slot as u8);
                        } else {
                            let _ = phaser.arrive_and_wait(slot as u8);
                        }
                    }
                })
            })
            .collect();
        for worker in workers {
            worker.join().unwrap();
        }
        // Each thread contributes exactly once per iteration (arrive or
        // skip), so the N * K contributions add up to K phase advances.
        assert_eq!(phaser.current_phase(), K);
    }
}
|
||||
424
crates/xenia-cpu/src/reservation.rs
Normal file
424
crates/xenia-cpu/src/reservation.rs
Normal file
@@ -0,0 +1,424 @@
|
||||
//! Inter-thread reservation table for `lwarx`/`stwcx.` and
|
||||
//! `ldarx`/`stdcx.`.
|
||||
//!
|
||||
//! On real Xenon, each core's `lwarx` places a reservation on a 128-byte
|
||||
//! cache line; any other CPU's store to the line invalidates the
|
||||
//! reservation. `stwcx.`'s success depends on the reservation still being
|
||||
//! valid. Under M3's per-HW-thread parallelism, we need an inter-thread
|
||||
//! mechanism for the same guarantee.
|
||||
//!
|
||||
//! M2 introduces the table behind a runtime `reservations_enabled` flag
|
||||
//! (default `false`). When the flag is `false`, the interpreter's
|
||||
//! existing per-`PpcContext` `reserved_line`/`has_reservation` fields are
|
||||
//! used as-is — no inter-thread tracking. M3 flips the flag on once the
|
||||
//! per-HW-thread host threads are spawning.
|
||||
//!
|
||||
//! ## Design
|
||||
//!
|
||||
//! - **Banked AtomicU64 array** of [`NUM_LINES`] entries (4096 × 8 B =
|
||||
//! 32 KiB total). Each entry packs `(line_address, generation,
|
||||
//! hw_id)`. A zero value means "no reservation on this bank".
|
||||
//! - **Hash function**: `(line >> 7) & (NUM_LINES - 1)`. Different lines
|
||||
//! that map to the same bank conservatively invalidate each other's
|
||||
//! reservations — sound (real Xenon's L2 has finite associativity and
|
||||
//! has the same property), at the cost of slightly more `stwcx.`
|
||||
//! failures than a perfect-mapping table would produce.
|
||||
//! - **`active_reservers: AtomicU16`** — a fast-path counter of banks that
//! currently hold a reservation. It goes up when an `lwarx` claims an
//! empty bank and down when that bank's reservation is either committed
//! or invalidated; an `lwarx` that displaces an existing reservation
//! leaves the net count unchanged. `write_u32` checks this with a single
//! `Relaxed` load; when zero (the common case in code that doesn't use
//! atomics), the invalidation hook is a one-instruction skip.
|
||||
//! - **Generation counter**: monotonic across all reservations,
|
||||
//! incremented atomically. 24 bits of generation packed in the slot
|
||||
//! means 16 M reuses per slot before wraparound; at multi-million
|
||||
//! reservations/sec sustained that's still many seconds, and a
|
||||
//! stale-gen `stwcx.` simply fails (sound, not livelocking).
|
||||
//!
|
||||
//! ## Invariants
|
||||
//!
|
||||
//! 1. A `stwcx.(addr)` succeeds only if the line slot still holds the
|
||||
//! same `(line, gen, hw_id)` triple the reserver stamped at `lwarx`.
|
||||
//! 2. Any plain store to a reserved line invalidates it (slot CASed to
//! zero). A store to a *different* line that hashes to the same bank
//! leaves the reservation alone; only another `lwarx` landing in that
//! bank displaces it. Guests that observe the resulting `stwcx.` failure
//! simply retry, so this is correctness-preserving.
|
||||
//! 3. `stwcx.` from a different `hw_id` than the reserver fails even if
|
||||
//! the line and gen would otherwise match — only the originating HW
|
||||
//! thread can commit its own reservation.
|
||||
//!
|
||||
//! Memory ordering: all CAS / store operations on the line slot use
|
||||
//! `AcqRel`; readers use `Acquire`. The store inside `stwcx.`'s payload
|
||||
//! itself (the actual data write) is the caller's responsibility — see
|
||||
//! [`crate::interpreter`]'s `stwcx.` arm.
|
||||
|
||||
use std::sync::atomic::{AtomicU16, AtomicU64, Ordering};
|
||||
|
||||
/// Real Xenon L2 cache-line size — the granule a reservation covers.
pub const LINE_BYTES: u32 = 0x80;
/// Mask to align an address to a cache-line boundary.
pub const LINE_MASK: u32 = !(LINE_BYTES - 1);
/// Number of bank entries in the reservation table. Power of two so the
/// hash is a single AND. 32 KiB total at 8 B per entry.
pub const NUM_LINES: usize = 4096;
const HASH_MASK: u32 = (NUM_LINES as u32) - 1;
/// Mask selecting the 24-bit generation field of a packed slot.
const GEN_MASK: u64 = 0xFF_FFFF;

/// Pack `(line_addr, generation, hw_id)` into a single u64. The packed
/// layout is:
///   bits 63..32: line address (line-aligned, so its low 7 bits are zero)
///   bits 31..8:  24-bit generation
///   bits 7..0:   8-bit `hw_id`
///
/// A packed value of `0` means "no reservation".
/// [`ReservationTable::reserve`] never hands out generation `0`, so a
/// reservation it stamps never packs to the sentinel.
///
/// In debug builds this asserts that `line_addr` is line-aligned and that
/// `generation` fits in 24 bits; in release builds an oversized generation
/// is masked.
#[inline]
pub fn pack(line_addr: u32, generation: u32, hw_id: u8) -> u64 {
    debug_assert!(line_addr & !LINE_MASK == 0, "line_addr must be line-aligned");
    debug_assert!(generation < (1 << 24), "generation must fit in 24 bits");
    let line_bits = u64::from(line_addr) << 32;
    let gen_bits = (u64::from(generation) & GEN_MASK) << 8;
    line_bits | gen_bits | u64::from(hw_id)
}

/// Inverse of [`pack`]. Returns `None` if the value is the zero sentinel
/// (no reservation), otherwise `Some((line_addr, generation, hw_id))`.
#[inline]
pub fn unpack(raw: u64) -> Option<(u32, u32, u8)> {
    if raw == 0 {
        return None;
    }
    let line = (raw >> 32) as u32;
    let generation = ((raw >> 8) & GEN_MASK) as u32;
    let hw_id = (raw & 0xFF) as u8;
    Some((line, generation, hw_id))
}

/// Bank index for a line address: the line number modulo [`NUM_LINES`].
#[inline]
fn hash(line_addr: u32) -> usize {
    ((line_addr >> 7) & HASH_MASK) as usize
}

/// Round `addr` down to the start of its cache line.
#[inline]
fn align_to_line(addr: u32) -> u32 {
    addr & LINE_MASK
}

/// Banked reservation table shared across all emulated HW threads. Built
/// once per emulation instance; intended to live behind an `Arc` so worker
/// host threads can hold their own clones.
pub struct ReservationTable {
    /// One packed slot per bank (see [`pack`]); `0` marks an empty bank.
    lines: Vec<AtomicU64>,
    /// Count of banks holding a reservation. May briefly over-count while
    /// a `reserve`/`try_commit` is in flight, but is bumped *before* a
    /// slot is published so it does not under-count a published slot.
    active_reservers: AtomicU16,
    /// Source of generation stamps; each reservation takes the low 24
    /// bits of the next value, skipping zero.
    next_gen: AtomicU64,
    /// Runtime activation flag. Default `false`; flipped by
    /// [`ReservationTable::enable`] / [`ReservationTable::disable`]. When
    /// `false` the interpreter is expected to use the legacy
    /// per-`PpcContext` reservation fields instead of this table.
    enabled: std::sync::atomic::AtomicBool,
}

impl Default for ReservationTable {
    fn default() -> Self {
        Self::new()
    }
}

impl ReservationTable {
    /// Construct a fresh, disabled table with all banks empty.
    pub fn new() -> Self {
        Self {
            lines: (0..NUM_LINES).map(|_| AtomicU64::new(0)).collect(),
            active_reservers: AtomicU16::new(0),
            // Start at 1 so the very first reservation gets a non-zero
            // generation.
            next_gen: AtomicU64::new(1),
            enabled: std::sync::atomic::AtomicBool::new(false),
        }
    }

    /// Activate the table. Idempotent.
    pub fn enable(&self) {
        self.enabled.store(true, Ordering::Release);
    }

    /// Deactivate the table. Idempotent.
    pub fn disable(&self) {
        self.enabled.store(false, Ordering::Release);
    }

    /// Whether the table is currently active.
    pub fn is_enabled(&self) -> bool {
        self.enabled.load(Ordering::Acquire)
    }

    /// True when at least one reservation may currently be outstanding.
    /// Plain stores consult this to skip the invalidation hook when no
    /// thread holds a reservation — the common case for non-atomic code.
    #[inline]
    pub fn has_active_reservers(&self) -> bool {
        self.active_reservers.load(Ordering::Relaxed) > 0
    }

    /// `lwarx(addr)` — claim a reservation on the line containing `addr`.
    /// Returns the (non-zero, 24-bit) generation stamped into the slot;
    /// the caller keeps it so a subsequent `stwcx.` can verify the same
    /// generation still holds.
    ///
    /// If a different reservation already occupied the bank, it is
    /// silently overwritten — that reserver's `stwcx.` will fail because
    /// the slot no longer matches its stamped generation.
    pub fn reserve(&self, addr: u32, hw_id: u8) -> u32 {
        let line = align_to_line(addr);
        let generation = self.next_generation();
        let new_raw = pack(line, generation, hw_id);
        // Count the reservation BEFORE publishing the slot. The `AcqRel`
        // swap below keeps this increment ordered ahead of the slot
        // becoming visible, so a writer never finds a live slot paired
        // with a counter that has not been bumped yet, and a racing
        // commit/invalidate can never drive the counter below zero.
        self.active_reservers.fetch_add(1, Ordering::Relaxed);
        let prev = self.lines[hash(line)].swap(new_raw, Ordering::AcqRel);
        if prev != 0 {
            // The bank was already counted for the reservation we just
            // displaced; undo our provisional increment so the counter
            // keeps tracking occupied banks.
            self.active_reservers.fetch_sub(1, Ordering::Relaxed);
        }
        generation
    }

    /// `stwcx.(addr)` — try to commit a reservation. Returns `true` if
    /// the slot still holds `(line, my_gen, my_hw_id)` (in which case
    /// it's CAS'd back to zero, releasing the bank), `false` otherwise.
    /// The data store itself is the caller's responsibility.
    pub fn try_commit(&self, addr: u32, my_gen: u32, my_hw_id: u8) -> bool {
        let line = align_to_line(addr);
        let expected = pack(line, my_gen, my_hw_id);
        let committed = self.lines[hash(line)]
            .compare_exchange(expected, 0, Ordering::AcqRel, Ordering::Relaxed)
            .is_ok();
        if committed {
            // The bank is empty again; drop it from the active count.
            self.active_reservers.fetch_sub(1, Ordering::Relaxed);
        }
        committed
    }

    /// Hook for plain (non-reserving) stores: invalidate any reservation
    /// held on the line containing `addr`. Cheap when the bank is already
    /// empty (single `Acquire` load + branch).
    ///
    /// A reservation on a *different* line that merely shares the bank is
    /// deliberately left untouched, which avoids spurious `stwcx.`
    /// failures from hash collisions.
    pub fn invalidate_for_write(&self, addr: u32) {
        let line = align_to_line(addr);
        let bank = &self.lines[hash(line)];
        let observed = bank.load(Ordering::Acquire);
        match unpack(observed) {
            // Bank holds a reservation on exactly this line: clear it.
            Some((bank_line, _generation, _hw)) if bank_line == line => {}
            // Empty bank, or a colliding reservation on another line.
            _ => return,
        }
        // CAS-clear only the value we observed. If a concurrent `stwcx.`
        // or `reserve` changed the slot in the meantime the CAS fails and
        // whoever changed it is responsible for the counter.
        if bank
            .compare_exchange(observed, 0, Ordering::AcqRel, Ordering::Relaxed)
            .is_ok()
        {
            self.active_reservers.fetch_sub(1, Ordering::Relaxed);
        }
    }

    /// Drop a reservation without committing a store. Safe to call when
    /// the table doesn't hold the reservation anymore (the CAS simply
    /// fails).
    pub fn release(&self, addr: u32, my_gen: u32, my_hw_id: u8) {
        let _ = self.try_commit(addr, my_gen, my_hw_id);
    }

    /// Draw the next 24-bit generation stamp, skipping zero so that a
    /// packed reservation can never equal the "empty bank" sentinel even
    /// after the counter wraps.
    fn next_generation(&self) -> u32 {
        loop {
            let masked = self.next_gen.fetch_add(1, Ordering::Relaxed) & GEN_MASK;
            if masked != 0 {
                return masked as u32;
            }
        }
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use std::sync::Arc;
    use std::thread;

    /// `unpack` recovers exactly the fields given to `pack`.
    #[test]
    fn pack_unpack_roundtrip() {
        let packed = pack(0x1000_0000, 42, 5);
        assert_eq!(unpack(packed), Some((0x1000_0000, 42, 5)));
    }

    /// The zero sentinel decodes to "no reservation".
    #[test]
    fn unpack_zero_is_none() {
        assert_eq!(unpack(0), None);
    }

    /// A reservation commits once; a repeated commit finds the bank empty.
    #[test]
    fn reserve_then_commit_succeeds() {
        let table = ReservationTable::new();
        let stamp = table.reserve(0x1234, 0);
        let first = table.try_commit(0x1234, stamp, 0);
        let second = table.try_commit(0x1234, stamp, 0);
        assert!(first);
        assert!(!second);
    }

    /// Only the HW thread that reserved can commit the reservation.
    #[test]
    fn other_hw_id_cannot_commit() {
        let table = ReservationTable::new();
        let stamp = table.reserve(0x1234, 0);
        let foreign_commit = table.try_commit(0x1234, stamp, 1);
        assert!(!foreign_commit, "stwcx. from a different hw_id must fail");
        // The failed foreign attempt leaves the owner's reservation intact.
        assert!(table.try_commit(0x1234, stamp, 0));
    }

    /// A second `reserve` on the same line replaces the first one.
    #[test]
    fn lwarx_displaces_prior_reservation() {
        let table = ReservationTable::new();
        let old_stamp = table.reserve(0x1234, 0);
        let new_stamp = table.reserve(0x1234, 1);
        // The displaced reserver no longer matches the slot...
        assert!(!table.try_commit(0x1234, old_stamp, 0));
        // ...while the displacing one does.
        assert!(table.try_commit(0x1234, new_stamp, 1));
    }

    /// A plain store anywhere inside the reserved line kills the
    /// reservation and empties the active count.
    #[test]
    fn invalidate_clears_matching_reservation() {
        let table = ReservationTable::new();
        let stamp = table.reserve(0x1234, 0);
        // 0x1238 lies in the same 128-byte line as 0x1234.
        table.invalidate_for_write(0x1238);
        assert!(!table.try_commit(0x1234, stamp, 0));
        assert_eq!(table.active_reservers.load(Ordering::Relaxed), 0);
    }

    /// A store to another line that merely shares the bank must not touch
    /// the reservation.
    #[test]
    fn invalidate_different_line_in_same_bank_is_noop() {
        let table = ReservationTable::new();
        // Two distinct lines exactly NUM_LINES lines apart hash to the
        // same bank.
        let line_a = 0x0000_1000;
        let line_b = line_a + ((NUM_LINES as u32) << 7);
        assert_eq!(hash(line_a), hash(line_b));
        let stamp = table.reserve(line_a, 0);
        table.invalidate_for_write(line_b);
        assert!(table.try_commit(line_a, stamp, 0));
    }

    /// `has_active_reservers` stays true until the last bank is released.
    #[test]
    fn has_active_reservers_tracks_count() {
        let table = ReservationTable::new();
        assert!(!table.has_active_reservers());
        let stamp_a = table.reserve(0x1000, 0);
        assert!(table.has_active_reservers());
        let stamp_b = table.reserve(0x2000, 1);
        assert!(table.has_active_reservers());
        table.try_commit(0x1000, stamp_a, 0);
        assert!(table.has_active_reservers());
        table.try_commit(0x2000, stamp_b, 1);
        assert!(!table.has_active_reservers());
    }

    /// Stress test: 8 host threads each loop reserve + commit on the same
    /// line. A commit only succeeds if nobody displaced the reservation in
    /// between, so some attempts fail. The test asserts that more than
    /// 10% of attempts succeed and that the active count returns to zero.
    #[test]
    fn concurrent_lwarx_stwcx_serializes() {
        const ROUNDS: u32 = 1000;
        const THREADS: u8 = 8;
        let table = Arc::new(ReservationTable::new());
        let total_successes = Arc::new(AtomicU64::new(0));

        let workers: Vec<_> = (0..THREADS)
            .map(|hw_id| {
                let table = Arc::clone(&table);
                let total_successes = Arc::clone(&total_successes);
                thread::Builder::new()
                    .name(format!("res-stress-{hw_id}"))
                    .spawn(move || {
                        let wins = (0..ROUNDS)
                            .filter(|_| {
                                let stamp = table.reserve(0x1234_5678, hw_id);
                                table.try_commit(0x1234_5678, stamp, hw_id)
                            })
                            .count() as u64;
                        total_successes.fetch_add(wins, Ordering::Relaxed);
                    })
                    .expect("spawn")
            })
            .collect();
        for worker in workers {
            worker.join().expect("join");
        }

        let total = total_successes.load(Ordering::Relaxed);
        // A round can end with no winner at all when every reservation
        // gets displaced before its commit, so only progress is asserted.
        let attempts = ROUNDS as u64 * THREADS as u64;
        assert!(
            total > attempts / 10,
            "expected at least 10% successful commits, got {total}/{attempts}"
        );
        assert_eq!(
            table.active_reservers.load(Ordering::Relaxed),
            0,
            "all reservations should have been resolved"
        );
    }
}
|
||||
1919
crates/xenia-cpu/src/scheduler.rs
Normal file
1919
crates/xenia-cpu/src/scheduler.rs
Normal file
File diff suppressed because it is too large
Load Diff
95
crates/xenia-cpu/src/trap.rs
Normal file
95
crates/xenia-cpu/src/trap.rs
Normal file
@@ -0,0 +1,95 @@
|
||||
//! TO-field evaluation for `tw`, `twi`, `td`, `tdi`.
|
||||
//!
|
||||
//! The TO field (5 bits) encodes which comparison outcomes trigger a trap:
|
||||
//!
|
||||
//! | bit | condition |
|
||||
//! |-----|-----------|
|
||||
//! | 0 | a < b (signed) |
|
||||
//! | 1 | a > b (signed) |
|
||||
//! | 2 | a == b |
|
||||
//! | 3 | a < b (unsigned) |
|
||||
//! | 4 | a > b (unsigned) |
|
||||
//!
|
||||
//! The bit numbering matches PowerISA ("MSB is bit 0"): TO[0] corresponds to
|
||||
//! the high bit of the 5-bit field, i.e. (to >> 4) & 1.
|
||||
//!
|
||||
//! `tw` / `twi` compare the low 32 bits of the operands (sign-extended back to
|
||||
//! 64 for the signed comparison); `td` / `tdi` compare the full 64 bits.
|
||||
|
||||
/// Operand width used by a trap comparison.
#[derive(Clone, Copy, Debug)]
pub enum TrapWidth {
    /// `tw`, `twi`: compare the low 32 bits of each operand.
    Word,
    /// `td`, `tdi`: compare all 64 bits.
    Doubleword,
}

// TO-field masks. PowerISA numbers TO[0] as the most significant of the
// five bits, hence the descending values.
const TO_SLT: u32 = 0b1_0000; // a < b signed
const TO_SGT: u32 = 0b0_1000; // a > b signed
const TO_EQ: u32 = 0b0_0100; // a == b
const TO_ULT: u32 = 0b0_0010; // a < b unsigned
const TO_UGT: u32 = 0b0_0001; // a > b unsigned

/// Returns true when the trap should fire.
///
/// `to` is the 5-bit TO field, `a` and `b` are the operands, and `width`
/// selects a 32-bit or 64-bit comparison. The trap fires when at least
/// one condition enabled in `to` holds for the operands; bits of `to`
/// above the low five are ignored.
pub fn evaluate(to: u32, a: u64, b: u64, width: TrapWidth) -> bool {
    // Signed and unsigned views of both operands at the requested width.
    let (lhs_signed, rhs_signed, lhs_unsigned, rhs_unsigned) = match width {
        TrapWidth::Doubleword => (a as i64, b as i64, a, b),
        TrapWidth::Word => {
            let (low_a, low_b) = (a as u32, b as u32);
            (
                i64::from(low_a as i32),
                i64::from(low_b as i32),
                u64::from(low_a),
                u64::from(low_b),
            )
        }
    };

    // Collect every condition that holds, then intersect with the
    // conditions the instruction asked for.
    let mut satisfied = 0u32;
    if lhs_signed < rhs_signed {
        satisfied |= TO_SLT;
    }
    if lhs_signed > rhs_signed {
        satisfied |= TO_SGT;
    }
    if lhs_unsigned == rhs_unsigned {
        satisfied |= TO_EQ;
    }
    if lhs_unsigned < rhs_unsigned {
        satisfied |= TO_ULT;
    }
    if lhs_unsigned > rhs_unsigned {
        satisfied |= TO_UGT;
    }
    (to & satisfied) != 0
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// An all-zero TO field enables no condition, so nothing traps.
    #[test]
    fn to_zero_never_traps() {
        for (a, b) in [(0u64, 0u64), (5, 3), (!0, 0)] {
            assert!(!evaluate(0, a, b, TrapWidth::Doubleword));
        }
    }

    /// With every condition enabled (TO = 0b11111), any ordering of the
    /// operands traps.
    #[test]
    fn to_31_always_traps_when_any_condition_holds() {
        let all = 0b11111;
        assert!(evaluate(all, 1, 2, TrapWidth::Doubleword)); // less-than
        assert!(evaluate(all, 2, 1, TrapWidth::Doubleword)); // greater-than
        assert!(evaluate(all, 7, 7, TrapWidth::Doubleword)); // equal
    }

    /// TO[2] alone (0b00100) traps on equality and nothing else.
    #[test]
    fn to_eq_only() {
        let eq_only = 0b00100;
        assert!(evaluate(eq_only, 5, 5, TrapWidth::Doubleword));
        assert!(!evaluate(eq_only, 5, 6, TrapWidth::Doubleword));
    }

    /// An all-ones operand is below zero when read as signed but not
    /// when read as unsigned.
    #[test]
    fn to_signed_vs_unsigned_on_negative() {
        let all_ones = (-1i64) as u64;
        // TO[0] = signed less-than = 0b10000: -1 < 0 holds.
        assert!(evaluate(0b10000, all_ones, 0, TrapWidth::Doubleword));
        // TO[3] = unsigned less-than = 0b00010: all-ones < 0 does not hold.
        assert!(!evaluate(0b00010, all_ones, 0, TrapWidth::Doubleword));
    }

    /// Word-width comparisons look only at the low 32 bits.
    #[test]
    fn word_width_ignores_high_32_bits() {
        // Low word is 1, high word is unrelated noise.
        let noisy = 0xDEAD_BEEF_0000_0001u64;
        let eq_only = 0b00100;
        assert!(evaluate(eq_only, noisy, 1, TrapWidth::Word));
        // At full width the operands differ.
        assert!(!evaluate(eq_only, noisy, 1, TrapWidth::Doubleword));
    }
}
|
||||
918
crates/xenia-cpu/src/vmx.rs
Normal file
918
crates/xenia-cpu/src/vmx.rs
Normal file
@@ -0,0 +1,918 @@
|
||||
//! VMX / AltiVec helper routines shared by the interpreter's 150+ vector
|
||||
//! opcode handlers.
|
||||
//!
|
||||
//! Big-endian lane indexing throughout: `Vec128::bytes[0]` is the most
|
||||
//! significant byte, which corresponds to PowerPC lane 0. Operations that
|
||||
//! care about "even" vs "odd" lanes follow the PPC convention (lane 0 = most
|
||||
//! significant = "even" for multiply-even/odd purposes).
|
||||
|
||||
use xenia_memory::MemoryAccess;
|
||||
use xenia_types::Vec128;
|
||||
|
||||
// ─── Lane accessors ────────────────────────────────────────────────────────
|
||||
|
||||
#[inline] pub fn as_i8x16(v: Vec128) -> [i8; 16] {
|
||||
let b = v.as_bytes();
|
||||
let mut r = [0i8; 16];
|
||||
for i in 0..16 { r[i] = b[i] as i8; }
|
||||
r
|
||||
}
|
||||
|
||||
#[inline] pub fn as_i16x8(v: Vec128) -> [i16; 8] {
|
||||
let u = v.as_u16x8();
|
||||
[u[0] as i16, u[1] as i16, u[2] as i16, u[3] as i16,
|
||||
u[4] as i16, u[5] as i16, u[6] as i16, u[7] as i16]
|
||||
}
|
||||
|
||||
#[inline] pub fn as_i32x4(v: Vec128) -> [i32; 4] {
|
||||
let u = v.as_u32x4();
|
||||
[u[0] as i32, u[1] as i32, u[2] as i32, u[3] as i32]
|
||||
}
|
||||
|
||||
#[inline] pub fn from_i8x16(r: [i8; 16]) -> Vec128 {
|
||||
let mut b = [0u8; 16];
|
||||
for i in 0..16 { b[i] = r[i] as u8; }
|
||||
Vec128::from_bytes(b)
|
||||
}
|
||||
|
||||
#[inline] pub fn from_i16x8(r: [i16; 8]) -> Vec128 {
|
||||
Vec128::from_u16x8_array([
|
||||
r[0] as u16, r[1] as u16, r[2] as u16, r[3] as u16,
|
||||
r[4] as u16, r[5] as u16, r[6] as u16, r[7] as u16,
|
||||
])
|
||||
}
|
||||
|
||||
#[inline] pub fn from_i32x4(r: [i32; 4]) -> Vec128 {
|
||||
Vec128::from_u32x4_array([r[0] as u32, r[1] as u32, r[2] as u32, r[3] as u32])
|
||||
}
|
||||
|
||||
// ─── Saturation helpers ────────────────────────────────────────────────────
|
||||
// Each returns (clamped_value, saturated_flag). Handlers OR the flags together
|
||||
// and call `ctx.set_vscr_sat(true)` once per instruction.
|
||||
|
||||
/// Unsigned 8-bit saturating add; returns `(result, saturated)`.
#[inline]
pub fn sat_add_u8(a: u8, b: u8) -> (u8, bool) {
    match a.checked_add(b) {
        Some(sum) => (sum, false),
        None => (u8::MAX, true),
    }
}
|
||||
/// Unsigned 8-bit saturating subtract; returns `(result, saturated)`.
#[inline]
pub fn sat_sub_u8(a: u8, b: u8) -> (u8, bool) {
    match a.checked_sub(b) {
        Some(diff) => (diff, false),
        None => (0, true),
    }
}
|
||||
/// Signed 8-bit saturating add; returns `(result, saturated)`.
#[inline]
pub fn sat_add_i8(a: i8, b: i8) -> (i8, bool) {
    match a.checked_add(b) {
        Some(sum) => (sum, false),
        // On overflow the saturating form yields MIN or MAX as appropriate.
        None => (a.saturating_add(b), true),
    }
}
|
||||
/// Signed 8-bit saturating subtract; returns `(result, saturated)`.
#[inline]
pub fn sat_sub_i8(a: i8, b: i8) -> (i8, bool) {
    match a.checked_sub(b) {
        Some(diff) => (diff, false),
        None => (a.saturating_sub(b), true),
    }
}
|
||||
|
||||
/// Unsigned 16-bit saturating add; returns `(result, saturated)`.
#[inline]
pub fn sat_add_u16(a: u16, b: u16) -> (u16, bool) {
    match a.checked_add(b) {
        Some(sum) => (sum, false),
        None => (u16::MAX, true),
    }
}
|
||||
/// Unsigned 16-bit saturating subtract; returns `(result, saturated)`.
#[inline]
pub fn sat_sub_u16(a: u16, b: u16) -> (u16, bool) {
    match a.checked_sub(b) {
        Some(diff) => (diff, false),
        None => (0, true),
    }
}
|
||||
/// Signed 16-bit saturating add; returns `(result, saturated)`.
#[inline]
pub fn sat_add_i16(a: i16, b: i16) -> (i16, bool) {
    match a.checked_add(b) {
        Some(sum) => (sum, false),
        None => (a.saturating_add(b), true),
    }
}
|
||||
/// Signed 16-bit saturating subtract; returns `(result, saturated)`.
#[inline]
pub fn sat_sub_i16(a: i16, b: i16) -> (i16, bool) {
    match a.checked_sub(b) {
        Some(diff) => (diff, false),
        None => (a.saturating_sub(b), true),
    }
}
|
||||
|
||||
/// Unsigned 32-bit saturating add; returns `(result, saturated)`.
#[inline]
pub fn sat_add_u32(a: u32, b: u32) -> (u32, bool) {
    match a.checked_add(b) {
        Some(sum) => (sum, false),
        None => (u32::MAX, true),
    }
}
|
||||
/// Unsigned 32-bit saturating subtract; returns `(result, saturated)`.
#[inline]
pub fn sat_sub_u32(a: u32, b: u32) -> (u32, bool) {
    match a.checked_sub(b) {
        Some(diff) => (diff, false),
        None => (0, true),
    }
}
|
||||
/// Signed 32-bit saturating add; returns `(result, saturated)`.
#[inline]
pub fn sat_add_i32(a: i32, b: i32) -> (i32, bool) {
    match a.checked_add(b) {
        Some(sum) => (sum, false),
        None => (a.saturating_add(b), true),
    }
}
|
||||
/// Signed 32-bit saturating subtract; returns `(result, saturated)`.
#[inline]
pub fn sat_sub_i32(a: i32, b: i32) -> (i32, bool) {
    match a.checked_sub(b) {
        Some(diff) => (diff, false),
        None => (a.saturating_sub(b), true),
    }
}
|
||||
|
||||
// Pack-with-saturation helpers — clamp a wider integer to the narrower type.
|
||||
/// Narrows `v` to `i8`, clamping to the `i8` range; returns
/// `(result, saturated)`.
#[inline]
pub fn sat_i16_to_i8(v: i16) -> (i8, bool) {
    let clamped = v.clamp(i8::MIN as i16, i8::MAX as i16);
    (clamped as i8, clamped != v)
}
|
||||
/// Narrows signed `v` to `u8`, clamping to `0..=255`; returns
/// `(result, saturated)`.
#[inline]
pub fn sat_i16_to_u8(v: i16) -> (u8, bool) {
    let clamped = v.clamp(0, u8::MAX as i16);
    (clamped as u8, clamped != v)
}
|
||||
/// Narrows unsigned `v` to `u8`, clamping at 255; returns
/// `(result, saturated)`.
#[inline]
pub fn sat_u16_to_u8(v: u16) -> (u8, bool) {
    let clamped = v.min(u8::MAX as u16);
    (clamped as u8, clamped != v)
}
|
||||
/// Narrows `v` to `i16`, clamping to the `i16` range; returns
/// `(result, saturated)`.
#[inline]
pub fn sat_i32_to_i16(v: i32) -> (i16, bool) {
    let clamped = v.clamp(i16::MIN as i32, i16::MAX as i32);
    (clamped as i16, clamped != v)
}
|
||||
/// Narrows signed `v` to `u16`, clamping to `0..=65535`; returns
/// `(result, saturated)`.
#[inline]
pub fn sat_i32_to_u16(v: i32) -> (u16, bool) {
    let clamped = v.clamp(0, u16::MAX as i32);
    (clamped as u16, clamped != v)
}
|
||||
/// Narrows unsigned `v` to `u16`, clamping at 65535; returns
/// `(result, saturated)`.
#[inline]
pub fn sat_u32_to_u16(v: u32) -> (u16, bool) {
    let clamped = v.min(u16::MAX as u32);
    (clamped as u16, clamped != v)
}
|
||||
/// Narrows `v` to `i32`, clamping to the `i32` range; returns
/// `(result, saturated)`.
#[inline]
pub fn sat_i64_to_i32(v: i64) -> (i32, bool) {
    let clamped = v.clamp(i32::MIN as i64, i32::MAX as i64);
    (clamped as i32, clamped != v)
}
|
||||
/// Narrows signed `v` to `u32`, clamping to `0..=u32::MAX`; returns
/// `(result, saturated)`.
#[inline]
pub fn sat_i64_to_u32(v: i64) -> (u32, bool) {
    let clamped = v.clamp(0, u32::MAX as i64);
    (clamped as u32, clamped != v)
}
|
||||
|
||||
// ─── Averages ──────────────────────────────────────────────────────────────
|
||||
// PPC avg is rounded up: (a + b + 1) / 2.
|
||||
/// Rounded-up average of two unsigned bytes, computed without overflow.
#[inline]
pub fn avg_u8(a: u8, b: u8) -> u8 {
    let sum = u16::from(a) + u16::from(b);
    ((sum + 1) / 2) as u8
}
|
||||
/// Rounded-up average of two unsigned halfwords, computed without overflow.
#[inline]
pub fn avg_u16(a: u16, b: u16) -> u16 {
    let sum = u32::from(a) + u32::from(b);
    ((sum + 1) / 2) as u16
}
|
||||
/// Rounded-up average of two unsigned words, computed without overflow.
#[inline]
pub fn avg_u32(a: u32, b: u32) -> u32 {
    let sum = u64::from(a) + u64::from(b);
    ((sum + 1) / 2) as u32
}
|
||||
/// Average of two signed bytes as `floor((a + b + 1) / 2)`.
#[inline]
pub fn avg_i8(a: i8, b: i8) -> i8 {
    let sum = i32::from(a) + i32::from(b) + 1;
    // Euclidean division by a positive divisor floors, like an arithmetic
    // right shift.
    sum.div_euclid(2) as i8
}
|
||||
/// Average of two signed halfwords as `floor((a + b + 1) / 2)`.
#[inline]
pub fn avg_i16(a: i16, b: i16) -> i16 {
    let sum = i32::from(a) + i32::from(b) + 1;
    sum.div_euclid(2) as i16
}
|
||||
/// Average of two signed words as `floor((a + b + 1) / 2)`.
#[inline]
pub fn avg_i32(a: i32, b: i32) -> i32 {
    let sum = i64::from(a) + i64::from(b) + 1;
    sum.div_euclid(2) as i32
}
|
||||
|
||||
// ─── NaN-aware f32 min/max for vmaxfp / vminfp ────────────────────────────
|
||||
//
|
||||
// Altivec PEM: "If either element of vA or vB is a NaN, the corresponding
|
||||
// element of vD is set to the quiet NaN form of that NaN". Rust's `>` / `<`
|
||||
// comparison with NaN always returns false, so `if a > b { a } else { b }`
|
||||
// would silently pick `b` whenever `a` is NaN — losing NaN propagation.
|
||||
|
||||
#[inline]
|
||||
pub fn max_nan(a: f32, b: f32) -> f32 {
|
||||
if a.is_nan() { quiet_nan(a) }
|
||||
else if b.is_nan() { quiet_nan(b) }
|
||||
else if a > b { a } else { b }
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn min_nan(a: f32, b: f32) -> f32 {
|
||||
if a.is_nan() { quiet_nan(a) }
|
||||
else if b.is_nan() { quiet_nan(b) }
|
||||
else if a < b { a } else { b }
|
||||
}
|
||||
|
||||
/// Returns the quiet form of `x` when it is a NaN, by setting the most
/// significant mantissa bit. Non-NaN values and already-quiet NaNs come back
/// unchanged.
#[inline]
pub fn quiet_nan(x: f32) -> f32 {
    const QUIET_BIT: u32 = 0x0040_0000;
    if x.is_nan() {
        f32::from_bits(x.to_bits() | QUIET_BIT)
    } else {
        x
    }
}
|
||||
|
||||
/// Replaces a subnormal `x` with a zero of the same sign; any other value is
/// returned untouched. Used where the instruction definition flushes
/// denormal inputs regardless of VSCR[NJ] (vmaddfp family, vctsxs / vctuxs).
#[inline]
pub fn flush_denorm(x: f32) -> f32 {
    if !x.is_subnormal() {
        return x;
    }
    0.0f32.copysign(x)
}
|
||||
|
||||
// ─── Float ⇄ fixed-point conversions (scaled by 2^scale_bits) ─────────────
|
||||
//
|
||||
// vctsxs / vctuxs flush denormal inputs to 0 before scaling, per Altivec.
|
||||
#[inline] pub fn cvt_f32_to_i32_sat(x: f32, scale_bits: u32) -> (i32, bool) {
|
||||
if x.is_nan() { return (0, true); }
|
||||
let x = flush_denorm(x);
|
||||
let scaled = (x as f64) * ((1u64 << scale_bits) as f64);
|
||||
if scaled >= i32::MAX as f64 { return (i32::MAX, true); }
|
||||
if scaled <= i32::MIN as f64 { return (i32::MIN, true); }
|
||||
(scaled.trunc() as i32, false)
|
||||
}
|
||||
#[inline] pub fn cvt_f32_to_u32_sat(x: f32, scale_bits: u32) -> (u32, bool) {
|
||||
if x.is_nan() { return (0, true); }
|
||||
let x = flush_denorm(x);
|
||||
let scaled = (x as f64) * ((1u64 << scale_bits) as f64);
|
||||
if scaled < 0.0 { return (0, true); }
|
||||
if scaled > u32::MAX as f64 { return (u32::MAX, true); }
|
||||
(scaled.trunc() as u32, false)
|
||||
}
|
||||
/// Converts a signed fixed-point value with `scale_bits` fractional bits to
/// `f32` (i.e. `v / 2^scale_bits`), doing the division in `f64`.
#[inline]
pub fn cvt_i32_to_f32(v: i32, scale_bits: u32) -> f32 {
    let divisor = (1u64 << scale_bits) as f64;
    (f64::from(v) / divisor) as f32
}
|
||||
/// Converts an unsigned fixed-point value with `scale_bits` fractional bits
/// to `f32` (i.e. `v / 2^scale_bits`), doing the division in `f64`.
#[inline]
pub fn cvt_u32_to_f32(v: u32, scale_bits: u32) -> f32 {
    let divisor = (1u64 << scale_bits) as f64;
    (f64::from(v) / divisor) as f32
}
|
||||
|
||||
// ─── Unaligned vector load/store ──────────────────────────────────────────
|
||||
//
|
||||
// lvlx/lvrx and stvlx/stvrx combine to perform any unaligned 16-byte access:
|
||||
// lvlx(EA) | lvrx(EA + 16) loads 16 bytes starting at unaligned EA.
|
||||
// stvlx(EA); stvrx(EA + 16) stores 16 bytes starting at unaligned EA.
|
||||
//
|
||||
// Semantics per the AltiVec manual (and xenia-canary ppc_emit_memory.cc):
|
||||
// lvlx: shift = EA & 0xF, n = 16 - shift. Loads mem[EA..EA+n] into
|
||||
// lanes VR[0..n], zeros VR[n..16].
|
||||
// lvrx: shift = EA & 0xF. If shift == 0, VR = 0. Otherwise loads
|
||||
// mem[EA-shift..EA] into lanes VR[16-shift..16], zeros VR[0..16-shift].
|
||||
// stvlx / stvrx are the symmetric stores.
|
||||
//
|
||||
// `Vec128::bytes[0]` is the most significant byte (PPC lane 0 in BE view).
|
||||
|
||||
pub fn load_vector_left(mem: &dyn MemoryAccess, ea: u32) -> Vec128 {
|
||||
let shift = (ea & 0xF) as usize;
|
||||
let n = 16 - shift;
|
||||
let mut bytes = [0u8; 16];
|
||||
for i in 0..n {
|
||||
bytes[i] = mem.read_u8(ea.wrapping_add(i as u32));
|
||||
}
|
||||
Vec128::from_bytes(bytes)
|
||||
}
|
||||
|
||||
pub fn load_vector_right(mem: &dyn MemoryAccess, ea: u32) -> Vec128 {
|
||||
let shift = (ea & 0xF) as usize;
|
||||
if shift == 0 { return Vec128::ZERO; }
|
||||
let base = ea & !0xFu32;
|
||||
let mut bytes = [0u8; 16];
|
||||
for i in 0..shift {
|
||||
bytes[16 - shift + i] = mem.read_u8(base.wrapping_add(i as u32));
|
||||
}
|
||||
Vec128::from_bytes(bytes)
|
||||
}
|
||||
|
||||
/// `stvlx`: writes the leading lanes of `v` to memory from `ea` up to the end
/// of its 16-byte line.
pub fn store_vector_left(mem: &dyn MemoryAccess, ea: u32, v: Vec128) {
    let count = 16 - (ea & 0xF) as usize;
    let bytes = v.as_bytes();
    for (offset, &byte) in bytes.iter().take(count).enumerate() {
        mem.write_u8(ea.wrapping_add(offset as u32), byte);
    }
}
|
||||
|
||||
/// `stvrx`: writes the trailing lanes of `v` to memory from the start of
/// `ea`'s 16-byte line up to (not including) `ea`. An aligned `ea` writes
/// nothing.
pub fn store_vector_right(mem: &dyn MemoryAccess, ea: u32, v: Vec128) {
    let tail = (ea & 0xF) as usize;
    if tail == 0 {
        return;
    }
    let line_start = ea & !0xFu32;
    let bytes = v.as_bytes();
    for (offset, &byte) in bytes[16 - tail..].iter().enumerate() {
        mem.write_u8(line_start.wrapping_add(offset as u32), byte);
    }
}
|
||||
|
||||
// ─── 1-5-5-5 pixel pack (vpkpx / vupkhpx / vupklpx) ───────────────────────
|
||||
// PPC vpkpx takes a 32-bit RGB lane and packs it into a 16-bit 1-5-5-5 pixel.
|
||||
// vupkhpx / vupklpx reverse the operation.
|
||||
//
|
||||
// Format: input 32-bit word holds
|
||||
// bits 0-6: unused (0)
|
||||
// bit 7: alpha-select (→ bit 15 of output)
|
||||
// bits 8-15: R (top 5 bits kept)
|
||||
// bits 16-23: G (top 5 bits kept)
|
||||
// bits 24-31: B (top 5 bits kept)
|
||||
// Output 16-bit word:
|
||||
// bit 15: A (from input bit 7)
|
||||
// bits 10-14: R
|
||||
// bits 5-9: G
|
||||
// bits 0-4: B
|
||||
|
||||
/// Packs one 32-bit lane into a 16-bit 1-5-5-5 pixel.
///
/// As implemented, with bit 0 the least significant bit of `input`:
/// * output bit 15     <- input bit 7
/// * output bits 10-14 <- input bits 11-15
/// * output bits 5-9   <- input bits 19-23
/// * output bits 0-4   <- input bits 27-31
///
/// NOTE(review): the positions 7 / 8-15 / 16-23 / 24-31 look like PowerISA
/// MSB-0 bit numbers applied here as LSB-0 shifts. If `input` is the
/// big-endian lane value, `vpkpx` would take alpha from `input >> 24` and the
/// channels from the opposite bytes. Confirm against the ISA and the callers
/// before relying on the channel order.
#[inline]
pub fn pack_pixel_555(input: u32) -> u16 {
    let alpha = (input >> 7) & 0x1;
    let red5 = (input >> 11) & 0x1F;
    let green5 = (input >> 19) & 0x1F;
    let blue5 = (input >> 27) & 0x1F;
    ((alpha << 15) | (red5 << 10) | (green5 << 5) | blue5) as u16
}
|
||||
|
||||
/// Unpacks a 16-bit 1-5-5-5 pixel into a 32-bit 8-8-8-8 lane, as `vupkhpx` /
/// `vupklpx` do for each element.
///
/// * bits 24-31: the 1-bit alpha sign-extended to a full byte (0x00 or 0xFF)
/// * bits 16-23: the 5-bit red field, zero-extended
/// * bits 8-15:  the 5-bit green field, zero-extended
/// * bits 0-7:   the 5-bit blue field, zero-extended
///
/// The colour channels are zero-extended, not bit-replicated to fill the
/// byte: the instruction definition uses EXTZ on each 5-bit field.
#[inline]
pub fn unpack_pixel_555(input: u16) -> u32 {
    let pixel = u32::from(input);
    let alpha = if pixel & 0x8000 != 0 { 0xFFu32 } else { 0 };
    let red = (pixel >> 10) & 0x1F;
    let green = (pixel >> 5) & 0x1F;
    let blue = pixel & 0x1F;
    (alpha << 24) | (red << 16) | (green << 8) | blue
}
|
||||
|
||||
// ─── VMX128 D3D pack/unpack dispatch ──────────────────────────────────────
|
||||
// `vpkd3d128` / `vupkd3d128` encode a small enum in the instruction word
|
||||
// (VX128_4 immediate field). The exact enum lives in canary's
|
||||
// ppc_emit_altivec.cc under PACK_TYPE_*; titles usually touch D3DCOLOR
|
||||
// (type 0) and a handful of texture-coordinate variants.
|
||||
//
|
||||
// Rather than risk getting a rarely-used sub-case wrong, we implement the
|
||||
// common types and fall back to a warning + pass-through for unknown types.
|
||||
// Returning the VB register value unchanged is always preferable to emitting
|
||||
// StepResult::Unimplemented because it keeps the interpreter running.
|
||||
|
||||
/// Pack-type encoding of `vpkd3d128` / `vupkd3d128`.
///
/// The immediate field lives at PPC bits 16-22 (VX128_3/4 IMM, 7 bits).
/// Canary decodes `type = IMM >> 2` (top 5 bits) and `pack = IMM & 0x3`
/// (low 2 bits, used only by `vpkd3d128` to select output-slot layout).
/// Valid `type` values are 0..=6 per `ppc_emit_altivec.cc:2095-2118`:
///
/// | id | canary name        | format                                             |
/// |----|--------------------|----------------------------------------------------|
/// | 0  | VPACK_D3DCOLOR     | 4 f32 [0,1] ↔ ARGB8                                |
/// | 1  | VPACK_NORMSHORT2   | 2 f32 [-1,1] ↔ 2× signed-normalized i16            |
/// | 2  | VPACK_NORMPACKED32 | 4 f32 [-1,1] ↔ UINT_2101010 (w:2,z:10,y:10,x:10)   |
/// | 3  | VPACK_FLOAT16_2    | 2 f32 ↔ 2× fp16                                    |
/// | 4  | VPACK_NORMSHORT4   | 4 f32 [-1,1] ↔ 4× signed-normalized i16            |
/// | 5  | VPACK_FLOAT16_4    | 4 f32 ↔ 4× fp16                                    |
/// | 6  | VPACK_NORMPACKED64 | 4 f32 [-1,1] ↔ ULONG_4202020 (w:4,z:20,y:20,x:20)  |
///
/// Prior (M3-pre) this enum listed made-up "Normal16"/"Normal8"/"UByteN4"
/// variants that didn't match canary; the immediate extraction was also
/// wrong (LSB-numbered `>>6 & 0x7` instead of MSB-numbered `>>11 & 0x1F`
/// against a 7-bit IMM field). M3 fixes both.
///
/// Derives `PartialEq`/`Eq` so decoded types can be compared directly.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum D3dPackType {
    /// Type 0 (`VPACK_D3DCOLOR`).
    D3dColor,
    /// Type 1 (`VPACK_NORMSHORT2`).
    NormShort2,
    /// Type 2 (`VPACK_NORMPACKED32`).
    NormPacked32,
    /// Type 3 (`VPACK_FLOAT16_2`).
    Float16_2,
    /// Type 4 (`VPACK_NORMSHORT4`).
    NormShort4,
    /// Type 5 (`VPACK_FLOAT16_4`).
    Float16_4,
    /// Type 6 (`VPACK_NORMPACKED64`).
    NormPacked64,
    /// Any type value outside 0..=6, carried verbatim.
    Other(u32),
}
|
||||
|
||||
impl D3dPackType {
|
||||
/// Decode the `type` bits extracted from the VX128_3/4 IMM field via
|
||||
/// canary's `IMM >> 2` convention (i.e. the caller has already divided
|
||||
/// out the 2-bit `pack` subfield).
|
||||
pub fn from_immediate(type_bits: u32) -> Self {
|
||||
match type_bits {
|
||||
0 => Self::D3dColor,
|
||||
1 => Self::NormShort2,
|
||||
2 => Self::NormPacked32,
|
||||
3 => Self::Float16_2,
|
||||
4 => Self::NormShort4,
|
||||
5 => Self::Float16_4,
|
||||
6 => Self::NormPacked64,
|
||||
other => Self::Other(other),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Pack an f32x4 vector of [R, G, B, A] in [0.0, 1.0] into a single D3DCOLOR
|
||||
/// value in lane 3 of the output.
|
||||
pub fn pack_d3dcolor(v: Vec128) -> Vec128 {
|
||||
let f = v.as_f32x4();
|
||||
let to_byte = |x: f32| -> u32 {
|
||||
let c = x.clamp(0.0, 1.0) * 255.0;
|
||||
(c + 0.5) as u32 & 0xFF
|
||||
};
|
||||
// D3DCOLOR is A,R,G,B in that byte order inside a u32.
|
||||
let word = (to_byte(f[3]) << 24) | (to_byte(f[0]) << 16) | (to_byte(f[1]) << 8) | to_byte(f[2]);
|
||||
Vec128::from_u32x4(0, 0, 0, word)
|
||||
}
|
||||
|
||||
/// Unpack a D3DCOLOR value (in lane 3 of the input) into an f32x4 [R, G, B, A].
|
||||
pub fn unpack_d3dcolor(v: Vec128) -> Vec128 {
|
||||
let word = v.u32x4(3);
|
||||
let a = ((word >> 24) & 0xFF) as f32 / 255.0;
|
||||
let r = ((word >> 16) & 0xFF) as f32 / 255.0;
|
||||
let g = ((word >> 8) & 0xFF) as f32 / 255.0;
|
||||
let b = (word & 0xFF) as f32 / 255.0;
|
||||
Vec128::from_f32x4(r, g, b, a)
|
||||
}
|
||||
|
||||
// ───────────────────────────────────────────────────────────────────────
|
||||
// First-Pixels M3 — pack/unpack for the remaining canary pack types.
|
||||
//
|
||||
// Conventions shared across all helpers:
|
||||
// * Input-to-`unpack_*` (packed data) lives in the *source* lane position
|
||||
// canary's HIR assumes: canonically the 32-bit word is in lane 3 and
|
||||
// the 64-bit value straddles lanes 2-3. We match that so the existing
|
||||
// D3DCOLOR helpers' lane-3 convention is preserved across the whole
|
||||
// pack-type family.
|
||||
// * Output-from-`pack_*` sits in the same lane(s). The caller usually
|
||||
// follows with a permute to move it elsewhere (the VX128_4 `pack`
|
||||
// subfield controls that in `vpkd3d128`).
|
||||
// * Range semantics match canary: normalized types use `max` = (1 << (N-1)) - 1
|
||||
// for signed, clamp before rounding.
|
||||
// ───────────────────────────────────────────────────────────────────────
|
||||
|
||||
/// Quantizes `x` to a signed-normalized 16-bit value: clamp to `[-1, 1]`,
/// scale by 32767, then round half away from zero. NaN maps to 0.
#[inline]
fn norm_to_i16(x: f32) -> i16 {
    let scaled = x.clamp(-1.0, 1.0) * 32767.0;
    // Bias by half a step away from zero; the cast then truncates.
    let biased = if scaled < 0.0 { scaled - 0.5 } else { scaled + 0.5 };
    (biased as i32).clamp(-32768, 32767) as i16
}
|
||||
|
||||
/// Maps a signed 16-bit value back to a float by dividing by 32767.
#[inline]
fn i16_to_norm(s: i16) -> f32 {
    f32::from(s) / 32767.0
}
|
||||
|
||||
/// **NORMSHORT2** — 2 f32s in [-1, 1] → two 16-bit signed-normalized
|
||||
/// shorts packed as `(x << 16) | y` in lane 3 (high 32 bits of the word
|
||||
/// hold X; low 16 hold Y). Output lanes 0..=2 are zero-filled.
|
||||
pub fn pack_normshort2(v: Vec128) -> Vec128 {
|
||||
let f = v.as_f32x4();
|
||||
let x = norm_to_i16(f[0]) as u16 as u32;
|
||||
let y = norm_to_i16(f[1]) as u16 as u32;
|
||||
Vec128::from_u32x4(0, 0, 0, (x << 16) | y)
|
||||
}
|
||||
|
||||
pub fn unpack_normshort2(v: Vec128) -> Vec128 {
|
||||
let word = v.u32x4(3);
|
||||
let x = i16_to_norm((word >> 16) as i16);
|
||||
let y = i16_to_norm(word as i16);
|
||||
Vec128::from_f32x4(x, y, 0.0, 1.0)
|
||||
}
|
||||
|
||||
/// **NORMSHORT4** — 4 f32s in [-1, 1] → four 16-bit signed-normalized
|
||||
/// shorts packed across lanes 2-3 (big-endian dword order: X in the
|
||||
/// high word of lane 2, Y low of lane 2, Z high of lane 3, W low of lane
|
||||
/// 3).
|
||||
pub fn pack_normshort4(v: Vec128) -> Vec128 {
|
||||
let f = v.as_f32x4();
|
||||
let x = norm_to_i16(f[0]) as u16 as u32;
|
||||
let y = norm_to_i16(f[1]) as u16 as u32;
|
||||
let z = norm_to_i16(f[2]) as u16 as u32;
|
||||
let w = norm_to_i16(f[3]) as u16 as u32;
|
||||
Vec128::from_u32x4(0, 0, (x << 16) | y, (z << 16) | w)
|
||||
}
|
||||
|
||||
pub fn unpack_normshort4(v: Vec128) -> Vec128 {
|
||||
let hi = v.u32x4(2);
|
||||
let lo = v.u32x4(3);
|
||||
let x = i16_to_norm((hi >> 16) as i16);
|
||||
let y = i16_to_norm(hi as i16);
|
||||
let z = i16_to_norm((lo >> 16) as i16);
|
||||
let w = i16_to_norm(lo as i16);
|
||||
Vec128::from_f32x4(x, y, z, w)
|
||||
}
|
||||
|
||||
/// **NORMPACKED32** — UINT_2101010 layout, 4 f32s in [-1, 1] packed into
|
||||
/// 32 bits in lane 3. Per canary's comment `2_10_10_10 w_z_y_x`: the
|
||||
/// high 2 bits hold W (signed 2-bit, -2..=1), then Z/Y/X each use 10
|
||||
/// signed-normalized bits.
|
||||
pub fn pack_normpacked32(v: Vec128) -> Vec128 {
|
||||
let f = v.as_f32x4();
|
||||
#[inline]
|
||||
fn n10(x: f32) -> u32 {
|
||||
let c = x.clamp(-1.0, 1.0) * 511.0;
|
||||
let r = if c >= 0.0 { (c + 0.5) as i32 } else { (c - 0.5) as i32 };
|
||||
(r.clamp(-512, 511) as i32 as u32) & 0x3FF
|
||||
}
|
||||
#[inline]
|
||||
fn n2(x: f32) -> u32 {
|
||||
let c = x.clamp(-1.0, 1.0) * 1.0;
|
||||
let r = if c >= 0.0 { (c + 0.5) as i32 } else { (c - 0.5) as i32 };
|
||||
(r.clamp(-2, 1) as i32 as u32) & 0x3
|
||||
}
|
||||
let x = n10(f[0]);
|
||||
let y = n10(f[1]);
|
||||
let z = n10(f[2]);
|
||||
let w = n2(f[3]);
|
||||
let word = (w << 30) | (z << 20) | (y << 10) | x;
|
||||
Vec128::from_u32x4(0, 0, 0, word)
|
||||
}
|
||||
|
||||
pub fn unpack_normpacked32(v: Vec128) -> Vec128 {
|
||||
let word = v.u32x4(3);
|
||||
#[inline]
|
||||
fn u10_to_norm(bits: u32) -> f32 {
|
||||
// Sign-extend the 10-bit field then normalize.
|
||||
let s = ((bits & 0x3FF) as i32) << 22 >> 22;
|
||||
(s as f32) / 511.0
|
||||
}
|
||||
#[inline]
|
||||
fn u2_to_norm(bits: u32) -> f32 {
|
||||
let s = ((bits & 0x3) as i32) << 30 >> 30;
|
||||
(s as f32).clamp(-1.0, 1.0)
|
||||
}
|
||||
let x = u10_to_norm(word);
|
||||
let y = u10_to_norm(word >> 10);
|
||||
let z = u10_to_norm(word >> 20);
|
||||
let w = u2_to_norm(word >> 30);
|
||||
Vec128::from_f32x4(x, y, z, w)
|
||||
}
|
||||
|
||||
/// **NORMPACKED64** — ULONG_4202020, 4 f32s in [-1, 1] packed into 64
|
||||
/// bits across lanes 2-3. Per canary's comment `4_20_20_20 w_z_y_x`:
|
||||
/// the high 4 bits of the dword hold W (signed 4-bit); the remaining 60
|
||||
/// bits hold 3× 20-bit signed-normalized Z/Y/X. Rare outside very few
|
||||
/// titles (canary notes 54540829).
|
||||
pub fn pack_normpacked64(v: Vec128) -> Vec128 {
|
||||
let f = v.as_f32x4();
|
||||
#[inline]
|
||||
fn n20(x: f32) -> u64 {
|
||||
let c = x.clamp(-1.0, 1.0) * 524287.0; // 2^19 - 1
|
||||
let r = if c >= 0.0 { (c + 0.5) as i64 } else { (c - 0.5) as i64 };
|
||||
(r.clamp(-524288, 524287) as i64 as u64) & 0xF_FFFF
|
||||
}
|
||||
#[inline]
|
||||
fn n4(x: f32) -> u64 {
|
||||
let c = x.clamp(-1.0, 1.0) * 7.0;
|
||||
let r = if c >= 0.0 { (c + 0.5) as i64 } else { (c - 0.5) as i64 };
|
||||
(r.clamp(-8, 7) as i64 as u64) & 0xF
|
||||
}
|
||||
let x = n20(f[0]);
|
||||
let y = n20(f[1]);
|
||||
let z = n20(f[2]);
|
||||
let w = n4(f[3]);
|
||||
let dw: u64 = (w << 60) | (z << 40) | (y << 20) | x;
|
||||
Vec128::from_u32x4(0, 0, (dw >> 32) as u32, dw as u32)
|
||||
}
|
||||
|
||||
pub fn unpack_normpacked64(v: Vec128) -> Vec128 {
|
||||
let hi = v.u32x4(2) as u64;
|
||||
let lo = v.u32x4(3) as u64;
|
||||
let dw = (hi << 32) | lo;
|
||||
#[inline]
|
||||
fn u20_to_norm(bits: u64) -> f32 {
|
||||
let s = ((bits & 0xF_FFFF) as i64) << 44 >> 44;
|
||||
(s as f32) / 524287.0
|
||||
}
|
||||
#[inline]
|
||||
fn u4_to_norm(bits: u64) -> f32 {
|
||||
let s = ((bits & 0xF) as i64) << 60 >> 60;
|
||||
(s as f32) / 7.0
|
||||
}
|
||||
let x = u20_to_norm(dw);
|
||||
let y = u20_to_norm(dw >> 20);
|
||||
let z = u20_to_norm(dw >> 40);
|
||||
let w = u4_to_norm(dw >> 60);
|
||||
Vec128::from_f32x4(x, y, z, w)
|
||||
}
|
||||
|
||||
/// Converts an `f32` to IEEE 754 half-precision bits; shared by FLOAT16_2
/// and FLOAT16_4. Done by hand on the bit pattern because a stable `f16`
/// type is not available.
///
/// * NaN becomes a quiet half NaN (mantissa 0x200); infinity stays infinity.
/// * Magnitudes with an unbiased exponent of 16 or more become infinity.
/// * Magnitudes below the smallest half subnormal become signed zero.
/// * Everything else is truncated (excess mantissa bits are dropped, not
///   rounded).
#[inline]
fn f32_to_f16_bits(f: f32) -> u16 {
    const HALF_EXP_ALL_ONES: u16 = 0x1F << 10;

    let raw = f.to_bits();
    let sign_bit = ((raw >> 16) & 0x8000) as u16;
    let biased_exp = ((raw >> 23) & 0xFF) as i32;
    let fraction = raw & 0x007F_FFFF;

    if biased_exp == 0xFF {
        // Infinity keeps a zero mantissa; any NaN collapses to one quiet NaN.
        let payload: u16 = if fraction == 0 { 0 } else { 0x200 };
        return sign_bit | HALF_EXP_ALL_ONES | payload;
    }

    match biased_exp - 127 {
        // Too large for a half: overflow to infinity.
        e if e >= 16 => sign_bit | HALF_EXP_ALL_ONES,
        // Too small even for a half subnormal: signed zero.
        e if e < -24 => sign_bit,
        // Half subnormal: shift the 24-bit significand (implicit 1 restored)
        // down so that it lines up with the fixed 2^-14 exponent.
        e if e <= -15 => {
            let significand = 0x0080_0000 | fraction;
            let shift = (-14 - e) + 13;
            sign_bit | (significand >> shift) as u16
        }
        // Normal half: re-bias the exponent, keep the top 10 mantissa bits.
        e => {
            let half_exp = ((e + 15) as u16) & 0x1F;
            sign_bit | (half_exp << 10) | (fraction >> 13) as u16
        }
    }
}
|
||||
|
||||
/// Expands IEEE 754 half-precision bits to an `f32`.
///
/// Infinity and signed zero map to their `f32` counterparts, any half NaN
/// becomes a quiet `f32` NaN with mantissa 0x400000, and half subnormals are
/// renormalized into ordinary `f32` values.
#[inline]
fn f16_bits_to_f32(h: u16) -> f32 {
    let sign_bit = u32::from(h & 0x8000) << 16;
    let exp_field = u32::from(h >> 10) & 0x1F;
    let fraction = u32::from(h) & 0x3FF;

    let magnitude = match (exp_field, fraction) {
        (0x1F, 0) => 0x7F80_0000,
        (0x1F, _) => 0x7FC0_0000,
        (0, 0) => 0,
        (0, _) => {
            // Subnormal: shift the leading 1 up to bit 10 (where it becomes
            // the implicit bit) and lower the exponent by the same amount.
            let shift = fraction.leading_zeros() - 21;
            let exponent = 127 - 14 - shift;
            (exponent << 23) | (((fraction << shift) & 0x3FF) << 13)
        }
        // Normal: re-bias from 15 to 127 and widen the mantissa.
        _ => ((exp_field + 112) << 23) | (fraction << 13),
    };
    f32::from_bits(sign_bit | magnitude)
}
|
||||
|
||||
/// **FLOAT16_2** — two 32-bit floats → two half-floats packed into one
|
||||
/// 32-bit word (X in high 16 bits of lane 3, Y in low 16).
|
||||
pub fn pack_float16_2(v: Vec128) -> Vec128 {
|
||||
let f = v.as_f32x4();
|
||||
let x = f32_to_f16_bits(f[0]) as u32;
|
||||
let y = f32_to_f16_bits(f[1]) as u32;
|
||||
Vec128::from_u32x4(0, 0, 0, (x << 16) | y)
|
||||
}
|
||||
|
||||
pub fn unpack_float16_2(v: Vec128) -> Vec128 {
|
||||
let word = v.u32x4(3);
|
||||
let x = f16_bits_to_f32((word >> 16) as u16);
|
||||
let y = f16_bits_to_f32(word as u16);
|
||||
Vec128::from_f32x4(x, y, 0.0, 1.0)
|
||||
}
|
||||
|
||||
/// **FLOAT16_4** — four 32-bit floats → four half-floats packed across
|
||||
/// 64 bits (lanes 2-3).
|
||||
pub fn pack_float16_4(v: Vec128) -> Vec128 {
|
||||
let f = v.as_f32x4();
|
||||
let x = f32_to_f16_bits(f[0]) as u32;
|
||||
let y = f32_to_f16_bits(f[1]) as u32;
|
||||
let z = f32_to_f16_bits(f[2]) as u32;
|
||||
let w = f32_to_f16_bits(f[3]) as u32;
|
||||
Vec128::from_u32x4(0, 0, (x << 16) | y, (z << 16) | w)
|
||||
}
|
||||
|
||||
pub fn unpack_float16_4(v: Vec128) -> Vec128 {
|
||||
let hi = v.u32x4(2);
|
||||
let lo = v.u32x4(3);
|
||||
let x = f16_bits_to_f32((hi >> 16) as u16);
|
||||
let y = f16_bits_to_f32(hi as u16);
|
||||
let z = f16_bits_to_f32((lo >> 16) as u16);
|
||||
let w = f16_bits_to_f32(lo as u16);
|
||||
Vec128::from_f32x4(x, y, z, w)
|
||||
}
|
||||
|
||||
// ─── CR6 helpers used by integer compares ─────────────────────────────────
|
||||
// vcmp*. (record-form) updates CR6 in a compressed form:
|
||||
// CR6 = {all-true, 0, all-false, 0}
|
||||
// where each bit reflects the per-lane mask across the whole register.
|
||||
|
||||
#[inline] pub fn cr6_flags_from_mask(mask: Vec128) -> (bool, bool) {
|
||||
let b = mask.as_bytes();
|
||||
let mut any_set = false;
|
||||
let mut any_clear = false;
|
||||
for &byte in b.iter() {
|
||||
if byte != 0 { any_set = true; }
|
||||
if byte != 0xFF { any_clear = true; }
|
||||
}
|
||||
let all_true = !any_clear;
|
||||
let all_false = !any_set;
|
||||
(all_true, all_false)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
use std::cell::Cell;
|
||||
|
||||
struct TestMem { data: Box<[Cell<u8>]> }
|
||||
impl TestMem {
|
||||
fn new(size: usize) -> Self {
|
||||
Self { data: (0..size).map(|_| Cell::new(0)).collect() }
|
||||
}
|
||||
}
|
||||
impl MemoryAccess for TestMem {
|
||||
fn read_u8(&self, a: u32) -> u8 { self.data[a as usize].get() }
|
||||
fn read_u16(&self, a: u32) -> u16 {
|
||||
u16::from_be_bytes([self.data[a as usize].get(), self.data[a as usize + 1].get()])
|
||||
}
|
||||
fn read_u32(&self, a: u32) -> u32 {
|
||||
let a = a as usize;
|
||||
u32::from_be_bytes([
|
||||
self.data[a].get(), self.data[a+1].get(),
|
||||
self.data[a+2].get(), self.data[a+3].get(),
|
||||
])
|
||||
}
|
||||
fn read_u64(&self, a: u32) -> u64 {
|
||||
let a = a as usize;
|
||||
u64::from_be_bytes([
|
||||
self.data[a].get(), self.data[a+1].get(),
|
||||
self.data[a+2].get(), self.data[a+3].get(),
|
||||
self.data[a+4].get(), self.data[a+5].get(),
|
||||
self.data[a+6].get(), self.data[a+7].get(),
|
||||
])
|
||||
}
|
||||
fn write_u8(&self, a: u32, v: u8) { self.data[a as usize].set(v); }
|
||||
fn write_u16(&self, a: u32, v: u16) {
|
||||
let b = v.to_be_bytes();
|
||||
self.data[a as usize].set(b[0]);
|
||||
self.data[a as usize + 1].set(b[1]);
|
||||
}
|
||||
fn write_u32(&self, a: u32, v: u32) {
|
||||
let b = v.to_be_bytes(); let a = a as usize;
|
||||
for (i, byte) in b.iter().enumerate() { self.data[a+i].set(*byte); }
|
||||
}
|
||||
fn write_u64(&self, a: u32, v: u64) {
|
||||
let b = v.to_be_bytes(); let a = a as usize;
|
||||
for (i, byte) in b.iter().enumerate() { self.data[a+i].set(*byte); }
|
||||
}
|
||||
fn translate(&self, _a: u32) -> Option<*const u8> { None }
|
||||
fn translate_mut(&self, _a: u32) -> Option<*mut u8> { None }
|
||||
}
|
||||
|
||||
/// An unaligned 16-byte read at 0x13 must equal `lvlx(0x13) | lvrx(0x23)`.
///
/// The first 0x30 bytes of memory hold `offset + 0x10`, so every lane of
/// the expected result is distinct and non-zero.
#[test]
fn lvlx_lvrx_round_trip() {
    let mem = TestMem::new(0x40);
    for (offset, cell) in mem.data.iter().enumerate().take(0x30) {
        cell.set((offset as u8).wrapping_add(0x10));
    }

    let left = load_vector_left(&mem, 0x13);
    let right = load_vector_right(&mem, 0x23);
    let left_bytes = left.as_bytes();
    let right_bytes = right.as_bytes();

    // OR the two partial loads lane by lane and compare against the bytes
    // that actually sit at 0x13..0x23.
    for lane in 0..16 {
        let merged = left_bytes[lane] | right_bytes[lane];
        assert_eq!(merged, mem.data[0x13 + lane].get(), "lane {}", lane);
    }
}
|
||||
|
||||
/// `lvlx` at a 16-byte-aligned address must return all 16 bytes stored
/// there.
///
/// Memory holds `byte[i] == i`, so the load at 0x10 is expected to yield
/// 0x10, 0x11, ..., 0x1F in lane order.
#[test]
fn lvlx_aligned_is_full_load() {
    let mem = TestMem::new(0x20);
    for (offset, cell) in mem.data.iter().enumerate().take(0x20) {
        cell.set(offset as u8);
    }

    let loaded = load_vector_left(&mem, 0x10);
    let bytes = loaded.as_bytes();
    for (lane, expected) in (0x10u8..0x20).enumerate() {
        assert_eq!(bytes[lane], expected);
    }
}
|
||||
|
||||
/// `lvrx` at a 16-byte-aligned address must produce an all-zero vector.
///
/// The buffer is pre-filled with `0xFF` so that a zero result can only come
/// from the load itself. With freshly constructed (zero-filled) memory the
/// assertion would also hold if the aligned case wrongly read bytes from
/// memory, making the test unable to fail.
#[test]
fn lvrx_aligned_is_zero() {
    let m = TestMem::new(0x20);
    // Poison both quadwords (0x00..0x10 and 0x10..0x20) so that any byte
    // leaking into the result shows up as non-zero.
    for cell in m.data.iter() {
        cell.set(0xFF);
    }
    let v = load_vector_right(&m, 0x10);
    assert_eq!(v.as_bytes(), [0u8; 16]);
}
|
||||
|
||||
/// `sat_add_i8` clamps at both ends of the signed range and reports it.
///
/// Each case is `((lhs, rhs), (expected sum, expected saturation flag))`.
#[test]
fn sat_add_signed_overflow() {
    let cases = [
        // Positive overflow clamps to 127.
        ((120, 10), (127, true)),
        // Negative overflow clamps to -128.
        ((-120, -10), (-128, true)),
        // In-range sum passes through with the flag clear.
        ((1, 2), (3, false)),
    ];
    for ((lhs, rhs), expected) in cases {
        assert_eq!(sat_add_i8(lhs, rhs), expected);
    }
}
|
||||
|
||||
/// `sat_sub_u8` clamps to zero on underflow and reports it; an in-range
/// difference passes through with the flag clear.
#[test]
fn sat_sub_unsigned_underflow() {
    let underflowed = sat_sub_u8(5, 10);
    assert_eq!(underflowed, (0, true));

    let in_range = sat_sub_u8(10, 5);
    assert_eq!(in_range, (5, false));
}
|
||||
|
||||
/// Spot-checks the top bits of the 1-5-5-5 pixel pack/unpack helpers.
///
/// Only the high bit of the packed value and the high byte of the unpacked
/// value are inspected; the colour channels are not checked here.
#[test]
fn pack_unpack_pixel_555() {
    // Packing 0x80F8F8F8 must set bit 15 of the result.
    let packed = pack_pixel_555(0x80_F8_F8_F8);
    let packed_top_bit = packed & 0x8000;
    assert_eq!(packed_top_bit, 0x8000);

    // Unpacking a value with bit 15 and all three 5-bit fields set must
    // yield 0xFF in the top byte.
    let all_fields_set = 0x8000 | (0x1F << 10) | (0x1F << 5) | 0x1F;
    let unpacked = unpack_pixel_555(all_fields_set);
    let unpacked_top_byte = unpacked & 0xFF00_0000;
    assert_eq!(unpacked_top_byte, 0xFF00_0000);
}
|
||||
|
||||
// ─── First-Pixels M3 pack/unpack roundtrip tests ───
|
||||
|
||||
/// Round-trip error tolerance for a `bits`-wide signed normalized field.
///
/// Evaluates to `1.0 / (2^(bits - 1) - 1)`: the reciprocal of the largest
/// positive code, which the tests use as the quantization step size.
/// Meaningful for `bits` in `2..=32`; a width of 1 divides by zero and
/// yields `inf`, and a width of 0 underflows `bits - 1`.
fn tol_normalized(bits: u32) -> f32 {
    let max_code = (1u32 << (bits - 1)) - 1;
    1.0 / max_code as f32
}
|
||||
|
||||
/// Packing then unpacking as NORMSHORT2 preserves x and y to within one
/// 16-bit step, and the unpacked z and w come back as exactly 0.0 and 1.0.
#[test]
fn normshort2_roundtrip() {
    let source = Vec128::from_f32x4(0.5, -0.75, 0.0, 0.0);
    let back = unpack_normshort2(pack_normshort2(source)).as_f32x4();

    let tol = tol_normalized(16);
    let within = |got: f32, want: f32| (got - want).abs() < tol;

    assert!(within(back[0], 0.5), "x got {}", back[0]);
    assert!(within(back[1], -0.75), "y got {}", back[1]);
    // The input w was 0.0, yet 1.0 is expected back: z and w are not taken
    // from the packed x/y data.
    assert_eq!(back[2], 0.0);
    assert_eq!(back[3], 1.0);
}
|
||||
|
||||
/// NORMSHORT4 round-trips all four lanes to within one 16-bit step,
/// including the +1.0 and -1.0 endpoints.
#[test]
fn normshort4_roundtrip_extremes() {
    let expected = [1.0f32, -1.0, 0.0, 0.25];
    let source = Vec128::from_f32x4(expected[0], expected[1], expected[2], expected[3]);
    let back = unpack_normshort4(pack_normshort4(source)).as_f32x4();

    let tol = tol_normalized(16);
    for (lane, want) in expected.iter().copied().enumerate() {
        assert!((back[lane] - want).abs() < tol);
    }
}
|
||||
|
||||
/// NORMPACKED32 round-trip: x, y and z are held to the 10-bit tolerance,
/// w to twice the 2-bit tolerance.
#[test]
fn normpacked32_roundtrip() {
    let source = Vec128::from_f32x4(0.5, -0.5, 0.9, -1.0);
    let back = unpack_normpacked32(pack_normpacked32(source)).as_f32x4();

    let xyz_tol = tol_normalized(10);
    // The 2-bit field is very coarse, so w is only required to land within
    // two of its quantization steps of the input.
    let w_tol = 2.0 * tol_normalized(2);

    assert!((back[0] - 0.5).abs() < xyz_tol, "x got {}", back[0]);
    assert!((back[1] - -0.5).abs() < xyz_tol, "y got {}", back[1]);
    assert!((back[2] - 0.9).abs() < xyz_tol, "z got {}", back[2]);
    assert!((back[3] - -1.0).abs() < w_tol, "w got {}", back[3]);
}
|
||||
|
||||
/// NORMPACKED64 round-trip: x, y and z are held to the 20-bit tolerance,
/// w to the 4-bit tolerance.
#[test]
fn normpacked64_roundtrip() {
    let source = Vec128::from_f32x4(0.5, -0.25, 0.75, 0.5);
    let back = unpack_normpacked64(pack_normpacked64(source)).as_f32x4();

    let xyz_tol = tol_normalized(20);
    let w_tol = tol_normalized(4);

    assert!((back[0] - 0.5).abs() < xyz_tol, "x got {}", back[0]);
    assert!((back[1] - -0.25).abs() < xyz_tol, "y got {}", back[1]);
    assert!((back[2] - 0.75).abs() < xyz_tol, "z got {}", back[2]);
    assert!((back[3] - 0.5).abs() < w_tol, "w got {}", back[3]);
}
|
||||
|
||||
/// FLOAT16_2 round-trips x and y exactly for values a half-float can hold,
/// and the unpacked z and w come back as exactly 0.0 and 1.0.
#[test]
fn float16_2_roundtrip_normals() {
    // Inputs are chosen to be exactly representable in half precision
    // (a power of two and a short binary fraction), so equality is exact.
    let source = Vec128::from_f32x4(1.0, -2.5, 0.0, 0.0);
    let back = unpack_float16_2(pack_float16_2(source)).as_f32x4();

    let expected = [1.0, -2.5, 0.0, 1.0];
    for (lane, want) in expected.iter().enumerate() {
        assert_eq!(back[lane], *want);
    }
}
|
||||
|
||||
/// FLOAT16_4 round-trips all four lanes exactly for values a half-float
/// can hold without rounding.
#[test]
fn float16_4_roundtrip_normals() {
    let expected = [0.5, -3.0, 16.0, -0.125];
    let source = Vec128::from_f32x4(expected[0], expected[1], expected[2], expected[3]);
    let back = unpack_float16_4(pack_float16_4(source)).as_f32x4();

    for (lane, want) in expected.iter().enumerate() {
        assert_eq!(back[lane], *want);
    }
}
|
||||
|
||||
/// Scalar f32 -> f16 -> f32 conversion keeps signed zeros and +infinity,
/// and turns a value too large for half precision into an infinity.
#[test]
fn float16_handles_zero_and_infinity() {
    let roundtrip = |value: f32| f16_bits_to_f32(f32_to_f16_bits(value));

    // `0.0 == -0.0` under float comparison, so the first check accepts
    // either zero; the second compares raw bits to pin the sign of -0.0.
    assert_eq!(roundtrip(0.0), 0.0);
    assert_eq!(roundtrip(-0.0).to_bits(), (-0.0f32).to_bits());

    // +inf survives with its sign.
    let inf_back = roundtrip(f32::INFINITY);
    assert!(inf_back == f32::INFINITY);

    // 65536.0 must come back infinite.
    let overflow_back = roundtrip(65536.0);
    assert!(overflow_back.is_infinite());
}
|
||||
|
||||
/// Pins the immediate -> `D3dPackType` mapping for 0 through 6, and checks
/// that 7 is carried through as `Other(7)`.
// NOTE(review): "canary" presumably refers to the reference emulator these
// numeric values were taken from — confirm.
#[test]
fn pack_type_enum_maps_canary_values() {
    assert!(matches!(D3dPackType::from_immediate(0), D3dPackType::D3dColor));
    assert!(matches!(D3dPackType::from_immediate(1), D3dPackType::NormShort2));
    assert!(matches!(D3dPackType::from_immediate(2), D3dPackType::NormPacked32));
    assert!(matches!(D3dPackType::from_immediate(3), D3dPackType::Float16_2));
    assert!(matches!(D3dPackType::from_immediate(4), D3dPackType::NormShort4));
    assert!(matches!(D3dPackType::from_immediate(5), D3dPackType::Float16_4));
    assert!(matches!(D3dPackType::from_immediate(6), D3dPackType::NormPacked64));
    assert!(matches!(D3dPackType::from_immediate(7), D3dPackType::Other(7)));
}
|
||||
}
|
||||
Reference in New Issue
Block a user