xenia-cpu: VMX128, FPSCR, decoder split, scheduler, decode/block caches

Split the monolithic interpreter into cohesive modules: dedicated decoder (decoder.rs) producing 8-byte DecodedInstr; opcode tables (opcode.rs); explicit traps (trap.rs); FPSCR helpers (fpscr.rs); overflow/carry helpers (overflow.rs); a 4 KiB-page-versioned decode cache and basic-block cache (block_cache.rs); and a full VMX/VMX128 implementation (vmx.rs) covering AltiVec + Xenon's 128-bit extensions. Add the parallel-execution substrate behind --parallel: a 7-party phaser (phaser.rs) for round-based barrier sync, ReservationTable (reservation.rs) for guest LL/SC, and the per-HW-thread scheduler core (scheduler.rs) that owns ThreadRefs, runqueues, and pending IRQs. Disassembler is now the single source of truth: disasm.rs gains the full base + extended + VMX128 mnemonic set, with golden JSON fixtures and a disasm_goldens test suite. Add a criterion-style interpreter bench. context.rs grows the per-thread state the new modules need (reservation slot, FPSCR, vector regs). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-01 16:27:43 +02:00
parent e9b2b57a44
commit c36cca14f9
20 changed files with 12284 additions and 458 deletions
--- a/crates/xenia-cpu/src/block_cache.rs
+++ b/crates/xenia-cpu/src/block_cache.rs
@@ -0,0 +1,423 @@
+//! Tier-4 perf — basic-block cache for the PPC interpreter.
+//!
+//! `DecodeCache` (in [`crate::decoder`]) caches one decoded instruction
+//! per slot, indexed by PC. The hot loop still pays the per-instruction
+//! cost of fetching the raw word, hashing the PC into a slot, and
+//! comparing tags. For straight-line code — common in the asset/inflate
+//! loops where Sylpheed boot is currently CPU-bound — the savings of
+//! batching N decoded instructions per slot lookup are linear in block
+//! length.
+//!
+//! ## Shape
+//!
+//! A `DecodedBlock` is a contiguous run of decoded instructions starting
+//! at `start_pc`, ending at the first *block terminator* (any branch,
+//! `sc`, trap, or `Invalid`) or at one of two safety limits:
+//!
+//!  - [`MAX_BLOCK_INSTRS`] caps memory growth and re-build cost.
+//!  - 4 KiB page boundary stop. A block is fully contained inside a
+//!    single 4 KiB guest page; that means `mem.page_version(start_pc)`
+//!    is sufficient to detect any code-page rewrite that should
+//!    invalidate the block. Without this rule the cache would have to
+//!    walk every spanned page on every hit, which would erase the win.
+//!
+//! ## Invalidation
+//!
+//! Each block stamps the page version at build time. On lookup, if
+//! `mem.page_version(start_pc)` differs from `block.page_version`, the
+//! slot is rebuilt. Same mechanism `DecodeCache` uses, just at
+//! block granularity.
+//!
+//! ## Debugger semantics
+//!
+//! Block dispatch is **opt-in** by the caller. The hot loop in
+//! `xenia-app/src/main.rs` selects the per-instruction path whenever
+//! `Debugger::wants_hooks()` is true or any `--trace-*` flag is set.
+//! That's how single-step, breakpoints, in-memory trace, instruction
+//! trace, and branch trace continue to observe every PC: the block
+//! cache simply never runs in those modes.
+
+use crate::decoder::{decode, DecodedInstr};
+use xenia_memory::MemoryAccess;
+
+/// Direct-mapped block-cache slot count. Same shape as
+/// [`crate::decoder::DECODE_CACHE_SIZE`] — 64 K slots indexed by the
+/// low 16 bits of `start_pc >> 2`. With Sylpheed-class workloads the
+/// slot collision rate is negligible.
+const BLOCK_CACHE_SIZE: usize = 1 << 16;
+const BLOCK_CACHE_MASK: u32 = (BLOCK_CACHE_SIZE - 1) as u32;
+
+/// Hard cap on instructions per block. Keeps the worst-case memory
+/// footprint bounded and limits the rebuild cost when a code page
+/// gets bumped. 32 instructions is generous for most basic blocks
+/// (real-world average across Sylpheed boot is ~6 between branches).
+pub const MAX_BLOCK_INSTRS: usize = 32;
+
+/// Guest page size — duplicated here to avoid pulling
+/// `xenia-memory::heap` internals into `xenia-cpu`. Must stay in sync
+/// with the memory crate. Both refer to the architectural PowerPC 4 KiB
+/// page granule, so this constant is locked.
+const GUEST_PAGE_SIZE: u32 = 4096;
+const GUEST_PAGE_MASK: u32 = !(GUEST_PAGE_SIZE - 1);
+
+/// One cached basic block. Owned by [`BlockCache`]; a `&DecodedBlock`
+/// is handed to the interpreter via [`BlockCache::lookup_or_build`] and
+/// stays valid until the next `lookup_or_build` on the same slot.
+#[derive(Debug)]
+pub struct DecodedBlock {
+    /// Guest PC at which this block starts. Used as the slot tag.
+    pub start_pc: u32,
+    /// Guest PC immediately after the last instruction in `instrs`.
+    /// Equal to `instrs.last().addr + 4` whether or not the block
+    /// ended on a terminator. Useful for tracing / disassembly.
+    pub end_pc: u32,
+    /// `mem.page_version(start_pc)` at build time. Mismatch on lookup
+    /// invalidates the block. Single value because every block is
+    /// page-bounded by construction.
+    pub page_version: u64,
+    /// Decoded instructions in execution order. Always non-empty after
+    /// a successful build (`MAX_BLOCK_INSTRS >= 1` and the build walk
+    /// pushes the first decoded word unconditionally).
+    pub instrs: Vec<DecodedInstr>,
+}
+
+/// Per-slot status from a `lookup_or_build` probe. Internal only.
+enum CacheStatus {
+    /// Block at this slot matches `pc` and the page version at build
+    /// time matches `mem.page_version(pc)` — return as-is.
+    Hit,
+    /// Block at this slot matched `pc` but the page version has
+    /// advanced — rebuild and bump `invalidations`.
+    Stale,
+    /// Slot is empty or holds a block keyed at a different `start_pc`.
+    /// Build a fresh block and bump `misses`.
+    Miss,
+}
+
+/// Direct-mapped block cache. One instance shared across all HW slots
+/// (block contents are PC-only and read-only after fill). Not
+/// thread-safe — owner is the single scheduler thread, same as
+/// `DecodeCache`.
+pub struct BlockCache {
+    slots: Box<[Option<Box<DecodedBlock>>]>,
+    hits: u64,
+    misses: u64,
+    invalidations: u64,
+}
+
+impl Default for BlockCache {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl BlockCache {
+    pub fn new() -> Self {
+        // `Option<Box<T>>` is a niche-optimized 8-byte slot; 64 K of
+        // them cost ~512 KiB of cold storage. Live blocks beyond that
+        // sit on the heap.
+        let mut v: Vec<Option<Box<DecodedBlock>>> = Vec::with_capacity(BLOCK_CACHE_SIZE);
+        v.resize_with(BLOCK_CACHE_SIZE, || None);
+        Self {
+            slots: v.into_boxed_slice(),
+            hits: 0,
+            misses: 0,
+            invalidations: 0,
+        }
+    }
+
+    pub fn hits(&self) -> u64 {
+        self.hits
+    }
+    pub fn misses(&self) -> u64 {
+        self.misses
+    }
+    pub fn invalidations(&self) -> u64 {
+        self.invalidations
+    }
+
+    /// Return the cached block starting at `pc`, building it if absent
+    /// or stale. The returned reference is borrowed from the cache and
+    /// stays valid until the next `lookup_or_build` call.
+    pub fn lookup_or_build(&mut self, pc: u32, mem: &dyn MemoryAccess) -> &DecodedBlock {
+        let idx = ((pc >> 2) & BLOCK_CACHE_MASK) as usize;
+        let cur_pv = mem.page_version(pc);
+
+        // Phase 1: classify the slot. Borrow ends before fill so the
+        // mutable update below doesn't conflict.
+        let status = match &self.slots[idx] {
+            Some(b) if b.start_pc == pc && b.page_version == cur_pv => CacheStatus::Hit,
+            Some(b) if b.start_pc == pc => CacheStatus::Stale,
+            _ => CacheStatus::Miss,
+        };
+
+        // Phase 2: fill on miss/stale, account.
+        match status {
+            CacheStatus::Hit => {
+                self.hits += 1;
+            }
+            CacheStatus::Stale => {
+                self.invalidations += 1;
+                self.misses += 1;
+                let block = build_block(pc, mem, cur_pv);
+                self.slots[idx] = Some(Box::new(block));
+            }
+            CacheStatus::Miss => {
+                self.misses += 1;
+                let block = build_block(pc, mem, cur_pv);
+                self.slots[idx] = Some(Box::new(block));
+            }
+        }
+
+        // Slot is guaranteed populated at this point — Hit returned a
+        // pre-existing block, Miss/Stale just wrote a new one.
+        self.slots[idx]
+            .as_deref()
+            .expect("block freshly built or hit")
+    }
+}
+
+/// Walk forward from `pc`, decoding instructions and collecting them
+/// into a `DecodedBlock`. The walk stops on the first of:
+///   - a [`PpcOpcode::terminates_block`] true (the terminator IS
+///     included as the last instruction),
+///   - reaching [`MAX_BLOCK_INSTRS`],
+///   - the next PC would cross a 4 KiB guest page boundary.
+fn build_block(start_pc: u32, mem: &dyn MemoryAccess, page_version: u64) -> DecodedBlock {
+    let mut instrs: Vec<DecodedInstr> = Vec::with_capacity(8);
+    let page_base = start_pc & GUEST_PAGE_MASK;
+    let mut cur = start_pc;
+
+    loop {
+        let raw = mem.read_u32(cur);
+        let decoded = decode(raw, cur);
+        let terminates = decoded.opcode.terminates_block();
+        instrs.push(decoded);
+
+        if terminates {
+            break;
+        }
+        if instrs.len() >= MAX_BLOCK_INSTRS {
+            break;
+        }
+        let next = cur.wrapping_add(4);
+        if (next & GUEST_PAGE_MASK) != page_base {
+            break;
+        }
+        cur = next;
+    }
+
+    let last = instrs.last().expect("build pushes at least one instruction");
+    let end_pc = last.addr.wrapping_add(4);
+
+    DecodedBlock {
+        start_pc,
+        end_pc,
+        page_version,
+        instrs,
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::opcode::PpcOpcode;
+
+    use std::cell::Cell;
+
+    /// 64 KiB byte-array memory, big-endian word reads.
+    /// Mirrors `interpreter::tests::TestMem` but lives here so block_cache
+    /// tests don't depend on interpreter internals.
+    struct BlockTestMem {
+        data: Box<[Cell<u8>]>,
+        version_a: u64,
+        version_b: u64,
+        // Address of the page whose version is `version_b` instead of
+        // `version_a`. Used to model an out-of-band page-version bump in
+        // the invalidation test without going through write_*.
+        bumped_page: Cell<Option<u32>>,
+    }
+
+    impl BlockTestMem {
+        fn new() -> Self {
+            Self {
+                data: (0..0x10000u32).map(|_| Cell::new(0)).collect(),
+                version_a: 1,
+                version_b: 2,
+                bumped_page: Cell::new(None),
+            }
+        }
+        fn put(&self, addr: u32, raw: u32) {
+            let a = addr as usize;
+            for (i, byte) in raw.to_be_bytes().iter().enumerate() {
+                self.data[a + i].set(*byte);
+            }
+        }
+    }
+
+    impl MemoryAccess for BlockTestMem {
+        fn read_u8(&self, a: u32) -> u8 { self.data[a as usize].get() }
+        fn read_u16(&self, a: u32) -> u16 {
+            let i = a as usize;
+            u16::from_be_bytes([self.data[i].get(), self.data[i + 1].get()])
+        }
+        fn read_u32(&self, a: u32) -> u32 {
+            let i = a as usize;
+            u32::from_be_bytes([
+                self.data[i].get(), self.data[i + 1].get(),
+                self.data[i + 2].get(), self.data[i + 3].get(),
+            ])
+        }
+        fn read_u64(&self, a: u32) -> u64 {
+            let i = a as usize;
+            u64::from_be_bytes([
+                self.data[i].get(), self.data[i + 1].get(),
+                self.data[i + 2].get(), self.data[i + 3].get(),
+                self.data[i + 4].get(), self.data[i + 5].get(),
+                self.data[i + 6].get(), self.data[i + 7].get(),
+            ])
+        }
+        fn write_u8(&self, a: u32, v: u8) { self.data[a as usize].set(v); }
+        fn write_u16(&self, a: u32, v: u16) {
+            let i = a as usize;
+            let b = v.to_be_bytes();
+            self.data[i].set(b[0]);
+            self.data[i + 1].set(b[1]);
+        }
+        fn write_u32(&self, a: u32, v: u32) {
+            let i = a as usize;
+            for (k, byte) in v.to_be_bytes().iter().enumerate() {
+                self.data[i + k].set(*byte);
+            }
+        }
+        fn write_u64(&self, a: u32, v: u64) {
+            let i = a as usize;
+            for (k, byte) in v.to_be_bytes().iter().enumerate() {
+                self.data[i + k].set(*byte);
+            }
+        }
+        fn translate(&self, _: u32) -> Option<*const u8> { None }
+        fn translate_mut(&self, _: u32) -> Option<*mut u8> { None }
+        fn page_version(&self, addr: u32) -> u64 {
+            if Some(addr & GUEST_PAGE_MASK) == self.bumped_page.get() {
+                self.version_b
+            } else {
+                self.version_a
+            }
+        }
+    }
+
+    // PPC encodings — minimal subset for these tests.
+    fn enc_addi(rd: u32, ra: u32, simm: i16) -> u32 {
+        (14 << 26) | (rd << 21) | (ra << 16) | (simm as u16 as u32)
+    }
+    fn enc_b_self() -> u32 {
+        // b 0 — branch to self (LI=0). Opcode=18, AA=0, LK=0.
+        18 << 26
+    }
+    fn enc_unimplemented() -> u32 {
+        // Use opcode 0 raw = 0; decoder maps to Invalid.
+        0
+    }
+
+    #[test]
+    fn block_built_to_terminator() {
+        let mem = BlockTestMem::new();
+        mem.put(0x100, enc_addi(3, 3, 1));
+        mem.put(0x104, enc_addi(3, 3, 1));
+        mem.put(0x108, enc_addi(3, 3, 1));
+        mem.put(0x10C, enc_b_self()); // terminator
+        let mut bc = BlockCache::new();
+        let b = bc.lookup_or_build(0x100, &mem);
+        assert_eq!(b.start_pc, 0x100);
+        assert_eq!(b.instrs.len(), 4);
+        assert_eq!(b.instrs.last().unwrap().opcode, PpcOpcode::bx);
+        assert_eq!(b.end_pc, 0x110);
+    }
+
+    #[test]
+    fn block_stops_at_page_boundary() {
+        // Build from 0x1FFC. The next PC (0x2000) is in a different
+        // 4 KiB page — block must contain only the one instruction.
+        let mem = BlockTestMem::new();
+        mem.put(0x1FFC, enc_addi(3, 3, 1));
+        mem.put(0x2000, enc_addi(3, 3, 1));
+        let mut bc = BlockCache::new();
+        let b = bc.lookup_or_build(0x1FFC, &mem);
+        assert_eq!(b.instrs.len(), 1);
+        assert_eq!(b.end_pc, 0x2000);
+    }
+
+    #[test]
+    fn block_stops_at_max_len() {
+        // 64 consecutive non-terminator instructions on one page —
+        // block must clamp at MAX_BLOCK_INSTRS.
+        let mem = BlockTestMem::new();
+        for i in 0..64u32 {
+            mem.put(0x100 + i * 4, enc_addi(3, 3, 1));
+        }
+        let mut bc = BlockCache::new();
+        let b = bc.lookup_or_build(0x100, &mem);
+        assert_eq!(b.instrs.len(), MAX_BLOCK_INSTRS);
+        assert_eq!(b.end_pc, 0x100 + (MAX_BLOCK_INSTRS as u32) * 4);
+    }
+
+    #[test]
+    fn block_stops_at_invalid_opcode() {
+        // Decoder mapping `Invalid` is treated as a block terminator
+        // so the per-instruction Unimplemented path is preserved.
+        let mem = BlockTestMem::new();
+        mem.put(0x100, enc_addi(3, 3, 1));
+        mem.put(0x104, enc_unimplemented());
+        mem.put(0x108, enc_addi(3, 3, 1));
+        let mut bc = BlockCache::new();
+        let b = bc.lookup_or_build(0x100, &mem);
+        assert_eq!(b.instrs.len(), 2);
+        assert_eq!(b.instrs.last().unwrap().opcode, PpcOpcode::Invalid);
+    }
+
+    #[test]
+    fn block_invalidates_on_page_version_bump() {
+        let mem = BlockTestMem::new();
+        mem.put(0x100, enc_addi(3, 3, 1));
+        mem.put(0x104, enc_b_self());
+        let mut bc = BlockCache::new();
+        let _ = bc.lookup_or_build(0x100, &mem);
+        assert_eq!(bc.misses(), 1);
+        assert_eq!(bc.hits(), 0);
+
+        // Same call → hit.
+        let _ = bc.lookup_or_build(0x100, &mem);
+        assert_eq!(bc.hits(), 1);
+        assert_eq!(bc.invalidations(), 0);
+
+        // Bump the page version on the page containing 0x100. Next
+        // lookup must invalidate and rebuild.
+        mem.bumped_page.set(Some(0x100 & GUEST_PAGE_MASK));
+        let _ = bc.lookup_or_build(0x100, &mem);
+        assert_eq!(bc.invalidations(), 1);
+        assert_eq!(bc.misses(), 2);
+    }
+
+    #[test]
+    fn block_hit_returns_same_contents() {
+        // Sanity: cache hit returns a block whose contents reflect the
+        // ORIGINAL instruction stream, even after a non-version-bumping
+        // poke to the underlying bytes. (No real workload would do
+        // this, but it confirms we're returning cached data, not
+        // re-reading.)
+        let mem = BlockTestMem::new();
+        mem.put(0x100, enc_addi(3, 3, 7));
+        mem.put(0x104, enc_b_self());
+        let mut bc = BlockCache::new();
+        let first_simm = bc.lookup_or_build(0x100, &mem).instrs[0].simm16();
+        // Rewrite without bumping version (test-only path).
+        let bytes = enc_addi(3, 3, 99).to_be_bytes();
+        for (i, b) in bytes.iter().enumerate() {
+            mem.data[0x100 + i].set(*b);
+        }
+        let cached_simm = bc.lookup_or_build(0x100, &mem).instrs[0].simm16();
+        assert_eq!(first_simm, 7);
+        assert_eq!(cached_simm, 7, "cache must serve original decoded form");
+    }
+}
--- a/crates/xenia-cpu/src/context.rs
+++ b/crates/xenia-cpu/src/context.rs
@@ -29,16 +29,37 @@ pub mod spr {
    pub const XER: u32 = 1;
    pub const LR: u32 = 8;
    pub const CTR: u32 = 9;
-    pub const TBL: u32 = 268;
-    pub const TBU: u32 = 269;
+    pub const DSISR: u32 = 18;
+    pub const DAR: u32 = 19;
+    /// Decrementer (hypervisor-visible, 32-bit down-counter).
+    pub const DEC: u32 = 22;
+    pub const TBL: u32 = 268;  // Read (user)
+    pub const TBU: u32 = 269;  // Read (user)
+    /// Time-base write (supervisor). Separate SPR number from TBL (268) for
+    /// access-control reasons.
+    pub const TBL_WRITE: u32 = 284;
+    pub const TBU_WRITE: u32 = 285;
    pub const SPRG0: u32 = 272;
    pub const SPRG1: u32 = 273;
    pub const SPRG2: u32 = 274;
    pub const SPRG3: u32 = 275;
+    pub const VRSAVE: u32 = 256;
    pub const PVR: u32 = 287;
+    pub const HID0: u32 = 1008;
+    pub const HID1: u32 = 1009;
    pub const PIR: u32 = 1023;
 }

+/// LR halt sentinel. When `bclr` returns to this address, the interpreter
+/// loop halts cleanly (matches the "entry returned" convention).
+pub const LR_HALT_SENTINEL: u64 = 0xBCBC_BCBC;
+
+/// VSCR NJ (Non-Java mode) bit. Stored in word 3 at bit 16 (mask 0x0001_0000).
+/// Set at startup; when clear, denormals are flushed to zero following IEEE-754.
+pub const VSCR_NJ_MASK: u32 = 0x0001_0000;
+/// VSCR SAT (saturation sticky) bit. Stored in word 3 at bit 31 (mask 0x0000_0001).
+pub const VSCR_SAT_MASK: u32 = 0x0000_0001;
+
 /// PowerPC processor context. Holds all register state for one guest thread.
 /// Mirrors PPCContext from ppc_context.h, minus JIT-specific fields.
 #[repr(C, align(64))]
@@ -64,15 +85,39 @@ pub struct PpcContext {
    pub xer_ca: u8,
    pub xer_ov: u8,
    pub xer_so: u8,
-    // Altivec VSCR saturation bit
-    pub vscr_sat: u8,
+    // Altivec VSCR. Only bits 16 (NJ) and 31 (SAT) of word 3 are meaningful.
+    pub vscr: Vec128,
+    // VRSAVE (SPR 256). Bitmask of which VRs need saving across context switches.
+    pub vrsave: u32,

    // Program counter
    pub pc: u32,
-    // Reservation address/value for lwarx/stwcx
-    pub reserved_addr: u32,
+    // Reservation for lwarx/ldarx/stwcx/stdcx. Xenon's reservation granule is
+    // one L2 cache line (128 bytes) — `reserved_line` is stored as the base
+    // address of that line (`ea & !0x7F`). `has_reservation` gates the
+    // validity; stwcx./stdcx. check that both match before committing.
+    // `reserved_val` is retained for possible future use by a coherency
+    // observer; the store-conditional logic itself does not compare it.
+    pub reserved_line: u32,
    pub reserved_val: u64,
    pub has_reservation: bool,
+    /// M3.7 — generation stamp returned by [`crate::ReservationTable::reserve`]
+    /// at the most recent `lwarx`/`ldarx`. Paired with `reserved_line`;
+    /// `stwcx.`/`stdcx.` pass this back to `try_commit`. Meaningful only
+    /// when `reservation_table` is `Some` and the table is enabled.
+    pub reserved_generation: u32,
+    /// M3.7 — optional handle to the inter-thread reservation table.
+    /// When `Some(table)` *and* `table.is_enabled()`, the interpreter's
+    /// `lwarx`/`stwcx.`/`ldarx`/`stdcx.` arms route through the table;
+    /// otherwise they use the legacy per-`PpcContext` fields above. The
+    /// scheduler populates this when it spawns a thread under a kernel
+    /// that has `reservations` set.
+    pub reservation_table: Option<std::sync::Arc<crate::ReservationTable>>,
+    /// M3.7 — emulated HW slot ID this thread is bound to. Used as the
+    /// reservation table's `hw_id` discriminator so two threads on
+    /// different slots can't accidentally commit each other's
+    /// reservations. Populated by the scheduler at spawn / migration.
+    pub hw_id: u8,

    // Thread ID (for kernel use)
    pub thread_id: u32,
@@ -82,6 +127,12 @@ pub struct PpcContext {

    // Time base (incremented each instruction for debugging)
    pub timebase: u64,
+
+    // Decrementer (SPR 22): 32-bit down-counter that fires an external
+    // interrupt at underflow on real hw. Xenia-rs doesn't dispatch DEC
+    // interrupts to the guest; this value is maintained so that mfspr DEC
+    // returns something coherent.
+    pub dec: u32,
 }

 impl PpcContext {
@@ -89,7 +140,9 @@ impl PpcContext {
        Self {
            gpr: [0; 32],
            ctr: 0,
-            lr: 0,
+            // Canary sets LR to the halt sentinel at thread start so `blr`
+            // from the top-level entry falls out of the interpreter loop.
+            lr: LR_HALT_SENTINEL,
            msr: 0,
            fpr: [0.0; 32],
            vr: [Vec128::ZERO; 128],
@@ -98,14 +151,21 @@ impl PpcContext {
            xer_ca: 0,
            xer_ov: 0,
            xer_so: 0,
-            vscr_sat: 0,
+            // VSCR starts with NJ bit set (denormals flushed) — matches canary
+            // thread_state.cc initialization.
+            vscr: Vec128::from_u32x4(0, 0, 0, VSCR_NJ_MASK),
+            vrsave: 0xFFFF_FFFF,
            pc: 0,
-            reserved_addr: 0,
+            reserved_line: 0,
            reserved_val: 0,
            has_reservation: false,
+            reserved_generation: 0,
+            reservation_table: None,
+            hw_id: 0,
            thread_id: 0,
            cycle_count: 0,
            timebase: 0,
+            dec: 0,
        }
    }

@@ -182,6 +242,27 @@ impl PpcContext {
        self.xer_ov = ((val >> 30) & 1) as u8;
        self.xer_ca = ((val >> 29) & 1) as u8;
    }
+
+    /// Read the VSCR SAT (sticky saturation) bit.
+    pub fn vscr_sat(&self) -> bool {
+        (self.vscr.u32x4(3) & VSCR_SAT_MASK) != 0
+    }
+
+    /// Set or clear VSCR SAT. Preserves the NJ bit (and any other word-3 bits).
+    pub fn set_vscr_sat(&mut self, v: bool) {
+        let mut w = self.vscr.u32x4(3);
+        if v {
+            w |= VSCR_SAT_MASK;
+        } else {
+            w &= !VSCR_SAT_MASK;
+        }
+        self.vscr.set_u32x4(3, w);
+    }
+
+    /// Read the VSCR NJ (non-Java mode / flush-denormals) bit.
+    pub fn vscr_nj(&self) -> bool {
+        (self.vscr.u32x4(3) & VSCR_NJ_MASK) != 0
+    }
 }

 impl Default for PpcContext {
--- a/crates/xenia-cpu/src/decoder.rs
+++ b/crates/xenia-cpu/src/decoder.rs
@@ -77,6 +77,9 @@ impl DecodedInstr {
    /// OE bit (bit 21) - overflow enable
    #[inline] pub fn oe(&self) -> bool { extract_bits(self.raw, 21, 21) != 0 }

+    /// TO field (bits 6-10) for tw/twi/td/tdi trap instructions.
+    #[inline] pub fn to(&self) -> u32 { extract_bits(self.raw, 6, 10) }
+
    /// MB, ME fields for rotate instructions
    #[inline] pub fn mb(&self) -> u32 { extract_bits(self.raw, 21, 25) }
    #[inline] pub fn me(&self) -> u32 { extract_bits(self.raw, 26, 30) }
@@ -142,6 +145,24 @@ impl DecodedInstr {
    #[inline] pub fn nb(&self) -> u32 { extract_bits(self.raw, 16, 20) }
 }

+/// Extract the 5-bit `UIMM` (`VX128_3`) / `IMM` (`VX128_4`) field. Canary
+/// packs both formats with LSB-bits 16-20 holding the field, which is
+/// MSB bits 11-15 in our `extract_bits` convention. For `vpkd3d128` /
+/// `vupkd3d128` the decoded selector is `type = UIMM >> 2` (3 bits; valid
+/// values 0-6 per [`crate::vmx::D3dPackType`], 7 is undocumented /
+/// undefined in canary) and `pack = UIMM & 0x3` (output-slot layout for
+/// `vpkd3d128` only, `vupkd3d128` ignores it).
+///
+/// First-Pixels M3: the interpreter previously used a hand-rolled
+/// `(instr.raw >> 6) & 0x7` that was **LSB-numbered** and extracted
+/// bits from a completely different part of the word (the
+/// secondary-opcode region). Centralizing the extractor here matches
+/// canary's `FormatVX128_{3,4}::{UIMM,IMM}` field semantics exactly.
+#[inline]
+pub fn extract_vx128_uimm5(raw: u32) -> u32 {
+    extract_bits(raw, 11, 15)
+}
+
 /// Decode a 32-bit PPC instruction into its opcode.
 /// Direct translation of the C++ LookupOpcode from ppc_opcode_lookup_gen.cc.
 pub fn decode(raw: u32, addr: u32) -> DecodedInstr {
@@ -149,6 +170,123 @@ pub fn decode(raw: u32, addr: u32) -> DecodedInstr {
    DecodedInstr { opcode, raw, addr }
 }

+// Perf tier-2 — direct-mapped PC-keyed decode cache.
+//
+// The interpreter hot path spends ~15-25% of its time in `decode()`
+// parsing the raw u32 and walking the primary+secondary opcode tables.
+// For non-self-modifying guest code — the common case past the XEX
+// loader — `decode(raw, pc)` is purely a function of `(raw, pc)` and
+// the output is `Copy + 16B`. A direct-mapped cache indexed by
+// `(pc >> 2) & MASK` gives the interpreter a 1-comparison fast path,
+// at the cost of one branch and a 1.5 MiB region of memory.
+//
+// Invalidation piggybacks on `xenia_memory::GuestMemory::page_version`
+// (P5 texture-cache invalidation): every cache entry carries the page
+// version that was active at decode time; on lookup we compare against
+// the current version of the containing 4 KiB page. Any write to the
+// page bumps the counter, so the next decode on that PC is a miss that
+// refills.
+
+/// Number of direct-mapped entries. 2^16 = 65,536 slots, one PPC
+/// instruction address per slot — enough for every hot code path in a
+/// typical Xbox 360 title to stay resident without collision.
+const DECODE_CACHE_SIZE: usize = 1 << 16;
+const DECODE_CACHE_MASK: u32 = (DECODE_CACHE_SIZE - 1) as u32;
+
+#[derive(Clone, Copy)]
+struct DecodeCacheEntry {
+    /// Guest PC this entry was decoded at. Used as the tag on lookup; a
+    /// mismatch means the slot was last populated by a different PC that
+    /// shares the same low-16 index.
+    pc: u32,
+    /// Page version at decode time (from `GuestMemory::page_version(pc)`).
+    /// Zero means "unused slot" since real page versions start at 1.
+    page_version: u64,
+    decoded: DecodedInstr,
+}
+
+impl DecodeCacheEntry {
+    const fn empty() -> Self {
+        // `Invalid` is the decoder's "unrecognized opcode" sentinel; we
+        // use it here as the empty-slot marker. Real misses compare `pc`,
+        // not the opcode, so the sentinel choice is cosmetic.
+        Self {
+            pc: 0,
+            page_version: 0,
+            decoded: DecodedInstr {
+                opcode: PpcOpcode::Invalid,
+                raw: 0,
+                addr: 0,
+            },
+        }
+    }
+}
+
+/// Direct-mapped PC-keyed decode cache. One instance shared across all
+/// HW threads (PC is thread-independent; entries are read-only once
+/// filled). Not thread-safe — the single scheduler thread owns it.
+pub struct DecodeCache {
+    slots: Box<[DecodeCacheEntry]>,
+    hits: u64,
+    misses: u64,
+    invalidations: u64,
+}
+
+impl Default for DecodeCache {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl DecodeCache {
+    pub fn new() -> Self {
+        Self {
+            slots: vec![DecodeCacheEntry::empty(); DECODE_CACHE_SIZE].into_boxed_slice(),
+            hits: 0,
+            misses: 0,
+            invalidations: 0,
+        }
+    }
+
+    /// Look up (or fill) the decoded form of the instruction at `pc`.
+    /// `raw` is the fetched instruction word; `current_page_version` is
+    /// `mem.page_version(pc)` — the caller has it cheaper than we do,
+    /// since they're already touching `mem` to fetch `raw`.
+    #[inline]
+    pub fn lookup(&mut self, pc: u32, raw: u32, current_page_version: u64) -> DecodedInstr {
+        let idx = ((pc >> 2) & DECODE_CACHE_MASK) as usize;
+        // Safety: `idx` is masked into `[0, DECODE_CACHE_SIZE)` so the
+        // slice access is always in-bounds. Opt-out of the bounds check
+        // for the hot path.
+        let entry = unsafe { self.slots.get_unchecked_mut(idx) };
+        if entry.pc == pc && entry.page_version == current_page_version {
+            self.hits += 1;
+            return entry.decoded;
+        }
+        if entry.pc == pc && entry.page_version != current_page_version {
+            self.invalidations += 1;
+        }
+        self.misses += 1;
+        let decoded = decode(raw, pc);
+        *entry = DecodeCacheEntry {
+            pc,
+            page_version: current_page_version,
+            decoded,
+        };
+        decoded
+    }
+
+    pub fn hits(&self) -> u64 {
+        self.hits
+    }
+    pub fn misses(&self) -> u64 {
+        self.misses
+    }
+    pub fn invalidations(&self) -> u64 {
+        self.invalidations
+    }
+}
+
 fn lookup_opcode(code: u32) -> PpcOpcode {
    match extract_bits(code, 0, 5) {
        2 => PpcOpcode::tdi,
@@ -781,6 +919,57 @@ mod tests {
        assert_eq!(instr.d(), 0x20);
    }

+    #[test]
+    fn decode_cache_miss_fills_then_hit() {
+        let mut cache = DecodeCache::new();
+        let raw: u32 = (14 << 26) | (3 << 21) | (1 << 16) | 0x10;
+        let pc = 0x8200_0000u32;
+        let first = cache.lookup(pc, raw, 1);
+        assert_eq!(first.opcode, PpcOpcode::addi);
+        assert_eq!(cache.hits(), 0);
+        assert_eq!(cache.misses(), 1);
+        // Same pc, same version → cache hit, no new decode.
+        let second = cache.lookup(pc, raw, 1);
+        assert_eq!(second.opcode, PpcOpcode::addi);
+        assert_eq!(cache.hits(), 1);
+        assert_eq!(cache.misses(), 1);
+    }
+
+    #[test]
+    fn decode_cache_stale_version_refills() {
+        let mut cache = DecodeCache::new();
+        // First fill with an `addi`.
+        let raw_addi: u32 = (14 << 26) | (3 << 21) | (1 << 16) | 0x10;
+        let pc = 0x8200_0000u32;
+        cache.lookup(pc, raw_addi, 1);
+        // Guest rewrote the page: same pc, different raw + bumped version.
+        // Cache must refill — not return the stale `addi`.
+        let raw_lwz: u32 = (32 << 26) | (5 << 21) | (1 << 16) | 0x20;
+        let refreshed = cache.lookup(pc, raw_lwz, 2);
+        assert_eq!(refreshed.opcode, PpcOpcode::lwz);
+        assert_eq!(cache.invalidations(), 1);
+        assert_eq!(cache.misses(), 2);
+    }
+
+    #[test]
+    fn decode_cache_pc_collision_refills() {
+        // Two PCs that hash to the same slot (pc >> 2 low 16 bits equal)
+        // must not alias. Slot index = ((pc >> 2) & 0xFFFF) — pick two
+        // PCs 4 * 2^16 bytes apart.
+        let mut cache = DecodeCache::new();
+        let pc_a = 0x8200_0000u32;
+        let pc_b = pc_a.wrapping_add(0x0004_0000u32); // (>> 2) differs by 2^16
+        let raw_addi: u32 = (14 << 26) | (3 << 21) | (1 << 16) | 0x10;
+        let raw_lwz: u32 = (32 << 26) | (5 << 21) | (1 << 16) | 0x20;
+        cache.lookup(pc_a, raw_addi, 1);
+        // Different pc but same slot → miss + refill.
+        cache.lookup(pc_b, raw_lwz, 1);
+        // First pc comes back → miss + refill (slot was taken by pc_b).
+        let back = cache.lookup(pc_a, raw_addi, 1);
+        assert_eq!(back.opcode, PpcOpcode::addi);
+        assert_eq!(cache.misses(), 3);
+    }
+
    #[test]
    fn test_decode_branch() {
        // b +0x100 => opcode 18, LI=0x40 (shifted left 2 = 0x100), AA=0, LK=0
@@ -816,4 +1005,103 @@ mod tests {
        assert_eq!(extract_bits(0x8000_0000, 0, 0), 1);
        assert_eq!(extract_bits(0x0000_0001, 31, 31), 1);
    }
+
+    // VMX128 register-name extraction. Locks the canonical bit positions
+    // (decoder.rs is the single source of truth — the analysis crate's
+    // old `ppc.rs` had different positions, which produced wrong printed
+    // register names; the bug was silent because the interpreter never
+    // used those extractors). Each test poke-bits exactly the slots the
+    // accessor reads and asserts the assembled register number.
+
+    fn vmx128_test_word(va21: u32, vd6_10: u32, vd21: u32, vd22: u32,
+                       vb16_20: u32, vb28: u32, vb30: u32) -> u32 {
+        // PPC bit i -> LSB position 31-i.
+        (vd6_10 << (31 - 10))
+            | (va21  << (31 - 21))   // va128 high bit at PPC 29 in some forms — kept 0 here
+            | (vd21  << (31 - 21))
+            | (vd22  << (31 - 22))
+            | (vb16_20 << (31 - 20))
+            | (vb28  << (31 - 28))
+            | (vb30  << (31 - 30))
+    }
+
+    #[test]
+    fn vmx128_vd128_low_5_bits_only() {
+        // vd_lo = 0..31, vd_b21 = 0, vd_b22 = 0 → vd128 = vd_lo
+        for r in 0..32u32 {
+            let raw = (r as u32) << (31 - 10);
+            let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 };
+            assert_eq!(d.vd128(), r as usize, "vd_lo={r}");
+        }
+    }
+
+    #[test]
+    fn vmx128_vd128_bit21_adds_32() {
+        // vd_lo = 0, vd_b21 = 1, vd_b22 = 0 → vd128 = 32
+        let raw = (0u32 << (31 - 10)) | (1u32 << (31 - 21));
+        let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 };
+        assert_eq!(d.vd128(), 32);
+    }
+
+    #[test]
+    fn vmx128_vd128_bit22_adds_64() {
+        // vd_lo = 0, vd_b21 = 0, vd_b22 = 1 → vd128 = 64
+        let raw = (0u32 << (31 - 10)) | (1u32 << (31 - 22));
+        let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 };
+        assert_eq!(d.vd128(), 64);
+    }
+
+    #[test]
+    fn vmx128_vd128_full_127() {
+        // vd_lo = 31, vd_b21 = 1, vd_b22 = 1 → vd128 = 127
+        let raw = (31u32 << (31 - 10))
+            | (1u32 << (31 - 21))
+            | (1u32 << (31 - 22));
+        let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 };
+        assert_eq!(d.vd128(), 127);
+    }
+
+    #[test]
+    fn vmx128_va128_uses_bit29() {
+        // va128 = bits 6-10 + bit 29. va_lo = 7, bit 29 = 1 → va128 = 7 | 32 = 39.
+        let raw = (7u32 << (31 - 10)) | (1u32 << (31 - 29));
+        let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 };
+        assert_eq!(d.va128(), 39);
+    }
+
+    #[test]
+    fn vmx128_vb128_uses_bits28_and_30() {
+        // vb128 = bits 16-20 + bit 28 + bit 30. Low 5 = 5, bit 28 = 1 → +32, bit 30 = 1 → +64.
+        let raw = (5u32 << (31 - 20))
+            | (1u32 << (31 - 28))
+            | (1u32 << (31 - 30));
+        let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 };
+        assert_eq!(d.vb128(), 5 | 32 | 64);
+    }
+
+    #[test]
+    fn vmx128_vs128_aliases_vd128() {
+        // vs128 must always equal vd128.
+        for r in [0u32, 31, 32, 64, 96, 127] {
+            let lo = r & 0x1F;
+            let b21 = (r >> 5) & 1;
+            let b22 = (r >> 6) & 1;
+            let raw = (lo << (31 - 10))
+                | (b21 << (31 - 21))
+                | (b22 << (31 - 22));
+            let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 };
+            assert_eq!(d.vd128(), r as usize, "vd128 mismatch for r={r}");
+            assert_eq!(d.vs128(), r as usize, "vs128 mismatch for r={r}");
+            assert_eq!(d.vd128(), d.vs128());
+        }
+    }
+
+    #[test]
+    #[allow(dead_code)]
+    fn _vmx128_test_word_helper_compiles() {
+        // Keep the helper validated against the real accessor.
+        let raw = vmx128_test_word(0, 5, 1, 1, 0, 0, 0);
+        let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 };
+        assert_eq!(d.vd128(), 5 | 32 | 64);
+    }
 }
--- a/crates/xenia-cpu/src/disasm.rs
+++ b/crates/xenia-cpu/src/disasm.rs
--- a/crates/xenia-cpu/src/fpscr.rs
+++ b/crates/xenia-cpu/src/fpscr.rs
@@ -0,0 +1,384 @@
+//! FPSCR (Floating-Point Status and Control Register) maintenance.
+//!
+//! Scope per project plan: rounding modes honoured, plus the exception bits
+//! games actually read (FX, FEX, VX, OX, UX, ZX, XX, FI, FPRF). Enabled-
+//! exception dispatch (FE[0,1], VE/OE/UE/ZE/XE) is *not* modelled — games
+//! running on Xenon almost never take FP traps.
+//!
+//! Bit layout (PowerISA, MSB-0 numbering; stored in a u32 with bit 31 = MSB):
+//!
+//! | PPC bit | u32 mask                | Name        |
+//! |---------|-------------------------|-------------|
+//! | 0       | `1<<31`                 | FX          |
+//! | 1       | `1<<30`                 | FEX         |
+//! | 2       | `1<<29`                 | VX (summary)|
+//! | 3       | `1<<28`                 | OX          |
+//! | 4       | `1<<27`                 | UX          |
+//! | 5       | `1<<26`                 | ZX          |
+//! | 6       | `1<<25`                 | XX          |
+//! | 7       | `1<<24`                 | VXSNAN      |
+//! | 8       | `1<<23`                 | VXISI       |
+//! | 9       | `1<<22`                 | VXIDI       |
+//! | 10      | `1<<21`                 | VXZDZ       |
+//! | 11      | `1<<20`                 | VXIMZ       |
+//! | 12      | `1<<19`                 | VXVC        |
+//! | 13      | `1<<18`                 | FR          |
+//! | 14      | `1<<17`                 | FI          |
+//! | 15..19  | `0xF8000 >> 15` @ 15..19 | FPRF (5 bits)|
+//! | 21      | `1<<10`                 | VXSOFT      |
+//! | 22      | `1<<9`                  | VXSQRT      |
+//! | 23      | `1<<8`                  | VXCVI       |
+//! | 30..31  | `0x3`                   | RN (2 bits) |
+
+use crate::context::PpcContext;
+
+pub const FX: u32      = 1 << 31;
+pub const FEX: u32     = 1 << 30;
+pub const VX: u32      = 1 << 29;
+pub const OX: u32      = 1 << 28;
+pub const UX: u32      = 1 << 27;
+pub const ZX: u32      = 1 << 26;
+pub const XX: u32      = 1 << 25;
+pub const VXSNAN: u32  = 1 << 24;
+pub const VXISI: u32   = 1 << 23;
+pub const VXIDI: u32   = 1 << 22;
+pub const VXZDZ: u32   = 1 << 21;
+pub const VXIMZ: u32   = 1 << 20;
+pub const VXVC: u32    = 1 << 19;
+pub const FR: u32      = 1 << 18;
+pub const FI: u32      = 1 << 17;
+pub const FPRF_MASK: u32 = 0x1F << 12;  // bits 15..19
+pub const VXSOFT: u32  = 1 << 10;
+pub const VXSQRT: u32  = 1 << 9;
+pub const VXCVI: u32   = 1 << 8;
+pub const RN_MASK: u32 = 0x3;
+
+/// Union of all VX* bits (used for the VX summary recomputation).
+pub const VX_ALL: u32 = VXSNAN | VXISI | VXIDI | VXZDZ | VXIMZ | VXVC | VXSOFT | VXSQRT | VXCVI;
+
+/// FPRF classification codes (5-bit, placed in FPSCR bits 15..19).
+/// The high bit ("C" in PowerISA) distinguishes ±zero/±denormal/QNaN from
+/// ±normal/±inf. The next 4 bits are (FL, FG, FE, FU) = (less, greater, equal, unordered).
+pub mod fprf {
+    pub const QNAN: u8         = 0b1_0001;
+    pub const NEG_INF: u8      = 0b0_1001;
+    pub const NEG_NORMAL: u8   = 0b0_1000;
+    pub const NEG_DENORMAL: u8 = 0b1_1000;
+    pub const NEG_ZERO: u8     = 0b1_0010;
+    pub const POS_ZERO: u8     = 0b0_0010;
+    pub const POS_DENORMAL: u8 = 0b1_0100;
+    pub const POS_NORMAL: u8   = 0b0_0100;
+    pub const POS_INF: u8      = 0b0_0101;
+}
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub enum RoundingMode {
+    NearestEven,    // RN=00
+    TowardZero,     // RN=01
+    TowardPosInf,   // RN=10
+    TowardNegInf,   // RN=11
+}
+
+pub fn rounding_mode(ctx: &PpcContext) -> RoundingMode {
+    match ctx.fpscr & RN_MASK {
+        0 => RoundingMode::NearestEven,
+        1 => RoundingMode::TowardZero,
+        2 => RoundingMode::TowardPosInf,
+        _ => RoundingMode::TowardNegInf,
+    }
+}
+
+/// Classify a finite f64 into its FPRF 5-bit code.
+pub fn classify_fprf(v: f64) -> u8 {
+    if v.is_nan() {
+        fprf::QNAN
+    } else if v.is_infinite() {
+        if v.is_sign_negative() { fprf::NEG_INF } else { fprf::POS_INF }
+    } else if v == 0.0 {
+        if v.is_sign_negative() { fprf::NEG_ZERO } else { fprf::POS_ZERO }
+    } else if v.is_subnormal() {
+        if v.is_sign_negative() { fprf::NEG_DENORMAL } else { fprf::POS_DENORMAL }
+    } else if v.is_sign_negative() { fprf::NEG_NORMAL } else { fprf::POS_NORMAL }
+}
+
+/// Write FPRF into FPSCR, preserving other bits.
+pub fn set_fprf(ctx: &mut PpcContext, code: u8) {
+    ctx.fpscr = (ctx.fpscr & !FPRF_MASK) | ((code as u32 & 0x1F) << 12);
+}
+
+/// Set one or more exception bits on FPSCR, maintaining FX (sticky set on any
+/// new exception) and VX (summary of VX* bits).
+pub fn set_exception(ctx: &mut PpcContext, bits: u32) {
+    let prev = ctx.fpscr;
+    let new = prev | bits;
+    // FX is sticky-set if any new non-sticky bit transitions to 1. PPC defines
+    // FX as "any of OX, UX, ZX, XX, VX* newly set". Compute the transition set.
+    let transition = (new & !prev) & (OX | UX | ZX | XX | VX_ALL);
+    let mut updated = new;
+    if transition != 0 {
+        updated |= FX;
+    }
+    // Recompute VX summary from any VX* bits currently set.
+    if (updated & VX_ALL) != 0 { updated |= VX; }
+    ctx.fpscr = updated;
+}
+
+/// Classify the inputs of a floating-point arithmetic op and set appropriate
+/// VX* bits. Returns true if any invalid-operation was detected (caller may
+/// want to write a default QNaN result).
+///
+/// Detected cases:
+///   * any SNaN input → VXSNAN
+///   * infinity - infinity (same sign) → VXISI
+///   * 0 / 0 → VXZDZ
+///   * infinity / infinity → VXIDI
+///   * 0 * infinity → VXIMZ
+pub fn check_invalid_add(ctx: &mut PpcContext, a: f64, b: f64, sub: bool) -> bool {
+    let mut bits = 0u32;
+    if is_snan(a) || is_snan(b) { bits |= VXSNAN; }
+    if a.is_infinite() && b.is_infinite() {
+        // For add: VXISI iff same-sign(a,b) negated — inf - inf
+        // For sub: VXISI iff same-sign(a,b) — (+inf) - (+inf) or (-inf) - (-inf)
+        let both_pos = a.is_sign_positive() && b.is_sign_positive();
+        let both_neg = a.is_sign_negative() && b.is_sign_negative();
+        if sub {
+            if both_pos || both_neg { bits |= VXISI; }
+        } else {
+            // add: opposite signs cancel to inf-inf
+            if a.is_sign_positive() != b.is_sign_positive() { bits |= VXISI; }
+        }
+    }
+    if bits != 0 { set_exception(ctx, bits); return true; }
+    false
+}
+
+pub fn check_invalid_mul(ctx: &mut PpcContext, a: f64, b: f64) -> bool {
+    let mut bits = 0u32;
+    if is_snan(a) || is_snan(b) { bits |= VXSNAN; }
+    let zero_times_inf =
+        (a == 0.0 && b.is_infinite()) || (b == 0.0 && a.is_infinite());
+    if zero_times_inf { bits |= VXIMZ; }
+    if bits != 0 { set_exception(ctx, bits); return true; }
+    false
+}
+
+pub fn check_invalid_div(ctx: &mut PpcContext, a: f64, b: f64) -> bool {
+    let mut bits = 0u32;
+    if is_snan(a) || is_snan(b) { bits |= VXSNAN; }
+    if a == 0.0 && b == 0.0 { bits |= VXZDZ; }
+    if a.is_infinite() && b.is_infinite() { bits |= VXIDI; }
+    if bits != 0 { set_exception(ctx, bits); return true; }
+    false
+}
+
+/// Divide-by-zero (finite nonzero / 0) — sets ZX but not VX.
+pub fn check_zero_divide(ctx: &mut PpcContext, a: f64, b: f64) {
+    if b == 0.0 && a != 0.0 && !a.is_nan() && !a.is_infinite() {
+        set_exception(ctx, ZX);
+    }
+}
+
+/// Post-op: classify the result and update FPRF + detect overflow/underflow/inexact.
+/// `inputs_finite` lets us suppress OX for ops whose output is infinite because
+/// an input already was.
+pub fn update_after_op(ctx: &mut PpcContext, result: f64, inputs_were_finite: bool) {
+    let mut bits = 0u32;
+    if result.is_infinite() && inputs_were_finite {
+        bits |= OX;
+    }
+    if result.is_subnormal() {
+        bits |= UX;
+    }
+    if bits != 0 { set_exception(ctx, bits); }
+    set_fprf(ctx, classify_fprf(result));
+}
+
+/// Test whether an f64 is a signalling NaN.
+/// In IEEE 754-2008 (binary64), the signalling bit is the high bit of the
+/// mantissa. SNaN has it clear, QNaN has it set. NaN with high mantissa bit
+/// clear (and mantissa nonzero) is an SNaN.
+pub fn is_snan(x: f64) -> bool {
+    if !x.is_nan() { return false; }
+    let bits = x.to_bits();
+    // Highest mantissa bit (bit 51) clear ⇒ SNaN. Mantissa nonzero always true for NaN.
+    (bits & (1u64 << 51)) == 0
+}
+
+/// Round an f64 to f32 honouring FPSCR[RN]. Uses the current hardware
+/// rounding mode when RN=0 (nearest-even, the PPC default), otherwise
+/// emulates the directed rounding via bit-manipulation.
+pub fn round_to_single(ctx: &PpcContext, v: f64) -> f64 {
+    match rounding_mode(ctx) {
+        RoundingMode::NearestEven => (v as f32) as f64,
+        RoundingMode::TowardZero => round_single_toward_zero(v) as f64,
+        RoundingMode::TowardPosInf => round_single_toward_pos_inf(v) as f64,
+        RoundingMode::TowardNegInf => round_single_toward_neg_inf(v) as f64,
+    }
+}
+
+/// Round an f64 to an i64 integer honouring FPSCR[RN]. Used by fctidx.
+pub fn round_to_i64(ctx: &PpcContext, v: f64) -> i64 {
+    match rounding_mode(ctx) {
+        RoundingMode::NearestEven => {
+            // Round-half-to-even (banker's rounding).
+            let r = v.round();
+            // Rust's f64::round is round-half-away-from-zero. Correct ties to even:
+            let diff = (v - v.trunc()).abs();
+            if (diff - 0.5).abs() < f64::EPSILON {
+                let floor = v.floor();
+                if (floor as i64) & 1 == 0 { floor as i64 } else { v.ceil() as i64 }
+            } else {
+                r as i64
+            }
+        }
+        RoundingMode::TowardZero => v.trunc() as i64,
+        RoundingMode::TowardPosInf => v.ceil() as i64,
+        RoundingMode::TowardNegInf => v.floor() as i64,
+    }
+}
+
+/// Round an f64 to an i32 integer honouring FPSCR[RN]. Used by fctiwx.
+pub fn round_to_i32(ctx: &PpcContext, v: f64) -> i32 {
+    round_to_i64(ctx, v).clamp(i32::MIN as i64, i32::MAX as i64) as i32
+}
+
+// ------ directed rounding helpers (f64 → f32) ------
+
+fn round_single_toward_zero(v: f64) -> f32 {
+    // Default f64→f32 is round-to-nearest-even. Emulate truncation:
+    // take the default rounded value; if the absolute rounded magnitude
+    // exceeds |v|, bump down by one ULP toward zero.
+    let rn = v as f32;
+    if rn.is_nan() || rn.is_infinite() || rn == 0.0 { return rn; }
+    if rn.abs() as f64 <= v.abs() { return rn; }
+    let adj_bits = rn.to_bits();
+    let lower = if rn.is_sign_positive() { adj_bits - 1 } else { adj_bits - 1 };
+    f32::from_bits(lower)
+}
+
+fn round_single_toward_pos_inf(v: f64) -> f32 {
+    let rn = v as f32;
+    if rn.is_nan() || rn.is_infinite() { return rn; }
+    if (rn as f64) >= v { return rn; }
+    // rn < v — bump up by one ULP in the +direction.
+    let b = rn.to_bits();
+    let nb = if rn.is_sign_negative() { b - 1 } else { b + 1 };
+    f32::from_bits(nb)
+}
+
+fn round_single_toward_neg_inf(v: f64) -> f32 {
+    let rn = v as f32;
+    if rn.is_nan() || rn.is_infinite() { return rn; }
+    if (rn as f64) <= v { return rn; }
+    // rn > v — bump down.
+    let b = rn.to_bits();
+    let nb = if rn.is_sign_negative() { b + 1 } else { b - 1 };
+    f32::from_bits(nb)
+}
+
+/// Drop-in replacement for the old `update_cr1_from_fpscr`. Reads the
+/// currently-maintained FPSCR bits (FX, FEX, VX, OX) into CR1.
+pub fn update_cr1(ctx: &mut PpcContext) {
+    ctx.cr[1].lt = (ctx.fpscr & FX) != 0;
+    ctx.cr[1].gt = (ctx.fpscr & FEX) != 0;
+    ctx.cr[1].eq = (ctx.fpscr & VX) != 0;
+    ctx.cr[1].so = (ctx.fpscr & OX) != 0;
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn ctx() -> PpcContext { PpcContext::new() }
+
+    #[test]
+    fn rn_default_is_nearest() {
+        assert_eq!(rounding_mode(&ctx()), RoundingMode::NearestEven);
+    }
+
+    #[test]
+    fn rn_bits_decode() {
+        let mut c = ctx();
+        c.fpscr = 0x1;
+        assert_eq!(rounding_mode(&c), RoundingMode::TowardZero);
+        c.fpscr = 0x2;
+        assert_eq!(rounding_mode(&c), RoundingMode::TowardPosInf);
+        c.fpscr = 0x3;
+        assert_eq!(rounding_mode(&c), RoundingMode::TowardNegInf);
+    }
+
+    #[test]
+    fn fprf_classifies_correctly() {
+        assert_eq!(classify_fprf(1.0), fprf::POS_NORMAL);
+        assert_eq!(classify_fprf(-1.0), fprf::NEG_NORMAL);
+        assert_eq!(classify_fprf(0.0), fprf::POS_ZERO);
+        assert_eq!(classify_fprf(-0.0), fprf::NEG_ZERO);
+        assert_eq!(classify_fprf(f64::INFINITY), fprf::POS_INF);
+        assert_eq!(classify_fprf(f64::NEG_INFINITY), fprf::NEG_INF);
+        assert_eq!(classify_fprf(f64::NAN), fprf::QNAN);
+        assert_eq!(classify_fprf(f64::from_bits(1)), fprf::POS_DENORMAL);
+    }
+
+    #[test]
+    fn fx_is_sticky_on_new_exception() {
+        let mut c = ctx();
+        set_exception(&mut c, OX);
+        assert_ne!(c.fpscr & FX, 0);
+        // Clear FX/OX manually.
+        c.fpscr &= !(FX | OX);
+        // Re-set OX; FX should re-latch.
+        set_exception(&mut c, OX);
+        assert_ne!(c.fpscr & FX, 0);
+    }
+
+    #[test]
+    fn vx_summary_set_on_any_vx_bit() {
+        let mut c = ctx();
+        set_exception(&mut c, VXSNAN);
+        assert_ne!(c.fpscr & VX, 0);
+        assert_ne!(c.fpscr & VXSNAN, 0);
+    }
+
+    #[test]
+    fn round_to_single_nearest_is_identity_on_representable() {
+        let c = ctx();
+        assert_eq!(round_to_single(&c, 1.0_f64), 1.0_f64);
+    }
+
+    #[test]
+    fn round_to_i32_clamps_out_of_range() {
+        let c = ctx();
+        assert_eq!(round_to_i32(&c, 1e20_f64), i32::MAX);
+        assert_eq!(round_to_i32(&c, -1e20_f64), i32::MIN);
+    }
+
+    #[test]
+    fn round_to_i64_nearest_even_on_tie() {
+        let c = ctx();
+        assert_eq!(round_to_i64(&c, 2.5_f64), 2);
+        assert_eq!(round_to_i64(&c, 3.5_f64), 4);
+        assert_eq!(round_to_i64(&c, -2.5_f64), -2);
+    }
+
+    #[test]
+    fn check_invalid_add_detects_inf_minus_inf() {
+        let mut c = ctx();
+        assert!(check_invalid_add(&mut c, f64::INFINITY, f64::INFINITY, true));
+        assert_ne!(c.fpscr & VXISI, 0);
+    }
+
+    #[test]
+    fn check_invalid_div_detects_zero_over_zero() {
+        let mut c = ctx();
+        assert!(check_invalid_div(&mut c, 0.0, 0.0));
+        assert_ne!(c.fpscr & VXZDZ, 0);
+    }
+
+    #[test]
+    fn snan_detection() {
+        // SNaN in binary64: sign=0, exp=all-ones, mantissa nonzero with bit 51 clear.
+        let snan = f64::from_bits(0x7FF0_0000_0000_0001);
+        assert!(is_snan(snan));
+        assert!(!is_snan(f64::NAN));
+    }
+}
--- a/crates/xenia-cpu/src/interpreter.rs
+++ b/crates/xenia-cpu/src/interpreter.rs
--- a/crates/xenia-cpu/src/lib.rs
+++ b/crates/xenia-cpu/src/lib.rs
@@ -1,9 +1,25 @@
+pub mod block_cache;
 pub mod context;
 pub mod decoder;
 pub mod disasm;
+pub mod fpscr;
 pub mod interpreter;
 pub mod opcode;
+pub mod overflow;
+pub mod phaser;
+pub mod reservation;
+pub mod scheduler;
+pub mod trap;
+pub mod vmx;

 pub use context::PpcContext;
 pub use decoder::decode;
+pub use disasm::{DisasmItem, DisasmText, disassemble, format as disasm_format, iter_disasm};
 pub use opcode::PpcOpcode;
+pub use phaser::{Phaser, PhaserOutcome};
+pub use reservation::ReservationTable;
+pub use scheduler::{
+    BlockReason, GuestThread, HwSlot, HwState, MigrationFixup, OrderMode, PcrWriter, RoundOutcome,
+    Scheduler, SpawnError, SpawnParams, ThreadRef, HW_THREAD_COUNT, INITIAL_GUEST_TID,
+    QUANTUM_DEFAULT,
+};
--- a/crates/xenia-cpu/src/opcode.rs
+++ b/crates/xenia-cpu/src/opcode.rs
@@ -145,6 +145,33 @@ impl PpcOpcode {
        matches!(self, Self::sc)
    }

+    /// Returns true if this opcode unconditionally ends a basic block:
+    /// any branch, system call, trap, or `Invalid` (decoder couldn't
+    /// recognize the instruction — execution will hit the
+    /// `Unimplemented` arm and we don't want to swallow the boundary
+    /// inside a cached block).
+    ///
+    /// Notably *not* terminating: `mtmsr`/`mtmsrd`/`isync`/`mfmsr`.
+    /// On real hardware these have synchronization semantics (a context
+    /// synchronizing event for `isync`, MSR rewrite for the `mt*`s) but
+    /// our interpreter has no asynchronous-exception model and no
+    /// out-of-order execution — they execute as plain ALU/move ops and
+    /// don't change control flow synchronously. Block-cache replay is
+    /// still bit-for-bit identical to per-instruction dispatch for
+    /// those.
+    ///
+    /// Used by the basic-block cache (`block_cache.rs`) to know when to
+    /// stop accumulating instructions during a forward decode walk.
+    pub fn terminates_block(&self) -> bool {
+        matches!(
+            self,
+            Self::bx | Self::bcx | Self::bclrx | Self::bcctrx
+                | Self::sc
+                | Self::td | Self::tdi | Self::tw | Self::twi
+                | Self::Invalid
+        )
+    }
+
    /// Returns true if this is a load instruction.
    pub fn is_load(&self) -> bool {
        matches!(self,
@@ -194,3 +221,60 @@ impl std::fmt::Display for PpcOpcode {
        std::fmt::Debug::fmt(self, f)
    }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn terminates_block_includes_all_branches() {
+        assert!(PpcOpcode::bx.terminates_block());
+        assert!(PpcOpcode::bcx.terminates_block());
+        assert!(PpcOpcode::bclrx.terminates_block());
+        assert!(PpcOpcode::bcctrx.terminates_block());
+    }
+
+    #[test]
+    fn terminates_block_includes_sc_and_traps() {
+        assert!(PpcOpcode::sc.terminates_block());
+        assert!(PpcOpcode::td.terminates_block());
+        assert!(PpcOpcode::tdi.terminates_block());
+        assert!(PpcOpcode::tw.terminates_block());
+        assert!(PpcOpcode::twi.terminates_block());
+    }
+
+    #[test]
+    fn terminates_block_includes_invalid() {
+        // Decoder failure must end the block — otherwise an unknown
+        // opcode would be replayed inside a cached block without going
+        // through the per-instruction Unimplemented path.
+        assert!(PpcOpcode::Invalid.terminates_block());
+    }
+
+    #[test]
+    fn terminates_block_excludes_straight_line_ops() {
+        // Common ALU and load/store ops must NOT terminate a block.
+        assert!(!PpcOpcode::addi.terminates_block());
+        assert!(!PpcOpcode::addis.terminates_block());
+        assert!(!PpcOpcode::addx.terminates_block());
+        assert!(!PpcOpcode::cmpi.terminates_block());
+        assert!(!PpcOpcode::cmp.terminates_block());
+        assert!(!PpcOpcode::lwz.terminates_block());
+        assert!(!PpcOpcode::stw.terminates_block());
+        assert!(!PpcOpcode::lbzx.terminates_block());
+        assert!(!PpcOpcode::ori.terminates_block());
+        assert!(!PpcOpcode::oris.terminates_block());
+        assert!(!PpcOpcode::rlwinmx.terminates_block());
+    }
+
+    #[test]
+    fn terminates_block_excludes_msr_and_sync_ops() {
+        // Documented decision: synchronizing ops execute as ALU within
+        // a block since the interpreter has no async-exception model.
+        assert!(!PpcOpcode::mtmsr.terminates_block());
+        assert!(!PpcOpcode::mtmsrd.terminates_block());
+        assert!(!PpcOpcode::isync.terminates_block());
+        assert!(!PpcOpcode::sync.terminates_block());
+        assert!(!PpcOpcode::mfmsr.terminates_block());
+    }
+}
--- a/crates/xenia-cpu/src/overflow.rs
+++ b/crates/xenia-cpu/src/overflow.rs
@@ -0,0 +1,173 @@
+//! OE / XER[OV] / XER[SO] handling for integer arithmetic.
+//!
+//! PPC integer ops with the OE bit set update XER[OV] (overflow) and sticky-set
+//! XER[SO]. When OE is clear the instruction leaves XER untouched. Signed
+//! overflow is predicated on the operation width and operand signs per the
+//! PowerISA pseudocode. For 32-bit-word operations (`addw`, `mullw`, `divw`,
+//! `neg`, etc. — on PPC these all have `w` in the mnemonic in spec
+//! descriptions even when the assembler spells them without) the predicate
+//! uses the low 32 bits. For 64-bit operations (`add`, `mulld`, `divd`) the
+//! predicate uses the full 64 bits.
+
+use crate::context::PpcContext;
+
+#[inline]
+pub fn apply(ctx: &mut PpcContext, overflowed: bool) {
+    if overflowed {
+        ctx.xer_ov = 1;
+        ctx.xer_so = 1;
+    } else {
+        ctx.xer_ov = 0;
+    }
+}
+
+/// Signed addition overflow at width-64 (plain `add`, `addc`, `subf`, `subfc`).
+///
+/// Predicate: same-sign inputs with opposite-sign result.
+/// For sub callers, rewrite as `a + b'` first (see `_sub`).
+#[inline]
+pub fn add_ov_64(a: u64, b: u64, result: u64) -> bool {
+    ((!(a ^ b)) & (a ^ result)) >> 63 != 0
+}
+
+/// Universal signed-overflow predicate for 64-bit arithmetic.
+///
+/// Caller computes the mathematical (infinite-precision) signed sum as i128,
+/// plus the stored 64-bit result. Overflow iff the two disagree — i.e. the
+/// true value doesn't fit in i64.
+///
+/// Use this for multi-term chains (`adde`, `addme`, `addze`, `subfe`, `subfme`,
+/// `subfze`) where the carry-in makes the bit-predicate above awkward.
+#[inline]
+pub fn sum_overflow_64(true_sum: i128, result: u64) -> bool {
+    true_sum != (result as i64) as i128
+}
+
+/// Signed subtraction: RT = b - a. Overflow iff opposite-sign inputs with
+/// result sign != b's sign. Equivalently, reduce to addition with `!a + 1`.
+#[inline]
+pub fn sub_ov_64(a: u64, b: u64, result: u64) -> bool {
+    ((a ^ b) & (b ^ result)) >> 63 != 0
+}
+
+/// Signed `addc`/`adde` chain overflow. Same rule as `add_ov_64` — the carry
+/// in doesn't alter the sign predicate directly because it's already folded
+/// into the stored result.
+#[inline]
+pub fn adde_ov_64(a: u64, b: u64, result: u64) -> bool {
+    add_ov_64(a, b, result)
+}
+
+/// Signed 32-bit multiply overflow (`mullwo`): result fits in 32 bits signed
+/// iff bit 32 equals bits 33..63 of the 64-bit product.
+#[inline]
+pub fn mullw_ov(product: i64) -> bool {
+    let lo = product as i32 as i64;
+    lo != product
+}
+
+/// Signed 64-bit multiply overflow (`mulldo`). Detected via checked_mul.
+#[inline]
+pub fn mulld_ov(a: i64, b: i64) -> bool {
+    a.checked_mul(b).is_none()
+}
+
+/// `divwo` / `divwuo` / `divdo` / `divduo` raise OV in two cases:
+///   * divisor is zero, or
+///   * signed division of `INT_MIN / -1` (quotient doesn't fit).
+#[inline]
+pub fn divw_ov_signed(ra: i32, rb: i32) -> bool {
+    rb == 0 || (ra == i32::MIN && rb == -1)
+}
+
+#[inline]
+pub fn divw_ov_unsigned(rb: u32) -> bool {
+    rb == 0
+}
+
+#[inline]
+pub fn divd_ov_signed(ra: i64, rb: i64) -> bool {
+    rb == 0 || (ra == i64::MIN && rb == -1)
+}
+
+#[inline]
+pub fn divd_ov_unsigned(rb: u64) -> bool {
+    rb == 0
+}
+
+/// `negx`: RT = -(RA). Overflow only when RA = INT_MIN (the negation doesn't fit).
+#[inline]
+pub fn neg_ov_64(ra: u64) -> bool {
+    ra == 0x8000_0000_0000_0000
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn add_no_overflow() {
+        assert!(!add_ov_64(1, 2, 3));
+        assert!(!add_ov_64(u64::MAX, 0, u64::MAX));
+    }
+
+    #[test]
+    fn add_positive_overflow() {
+        // INT64_MAX + 1 = INT64_MIN — signed overflow
+        let a = i64::MAX as u64;
+        let b = 1u64;
+        let r = a.wrapping_add(b);
+        assert!(add_ov_64(a, b, r));
+    }
+
+    #[test]
+    fn add_negative_overflow() {
+        // INT64_MIN + -1 = INT64_MAX — signed overflow
+        let a = i64::MIN as u64;
+        let b = (-1i64) as u64;
+        let r = a.wrapping_add(b);
+        assert!(add_ov_64(a, b, r));
+    }
+
+    #[test]
+    fn sub_overflow_min_minus_pos() {
+        // INT64_MIN - 1 overflows
+        let b = i64::MIN as u64;
+        let a = 1u64;
+        let r = b.wrapping_sub(a);
+        assert!(sub_ov_64(a, b, r));
+    }
+
+    #[test]
+    fn sub_no_overflow() {
+        let b = 5u64;
+        let a = 2u64;
+        let r = b.wrapping_sub(a);
+        assert!(!sub_ov_64(a, b, r));
+    }
+
+    #[test]
+    fn mullw_fits_32_bits() {
+        assert!(!mullw_ov((i32::MAX as i64) * 1));
+        assert!(!mullw_ov(-1i64));
+    }
+
+    #[test]
+    fn mullw_overflows_32_bits() {
+        let p = (i32::MAX as i64) * 2;
+        assert!(mullw_ov(p));
+    }
+
+    #[test]
+    fn mulld_overflows() {
+        assert!(mulld_ov(i64::MAX, 2));
+        assert!(!mulld_ov(i64::MAX, 1));
+    }
+
+    #[test]
+    fn neg_ov_only_at_min() {
+        assert!(neg_ov_64(i64::MIN as u64));
+        assert!(!neg_ov_64(0));
+        assert!(!neg_ov_64(1));
+    }
+}
--- a/crates/xenia-cpu/src/phaser.rs
+++ b/crates/xenia-cpu/src/phaser.rs
@@ -0,0 +1,345 @@
+//! Quantum-boundary phaser for the M3 per-HW-thread parallel scheduler.
+//!
+//! Six [`super::HW_THREAD_COUNT`] host threads run their slots' interpreters
+//! in parallel, then meet at a phaser to advance to the next quantum. This
+//! is **not** [`std::sync::Barrier`]: a Barrier needs a fixed party count,
+//! but our slots can become idle (no runnable thread) and shouldn't block
+//! the phaser arrival.
+//!
+//! ## Semantics
+//!
+//! - Each slot at the end of its quantum either calls
+//!   [`Phaser::arrive_and_wait`] (it has a runnable thread to run next
+//!   quantum) or [`Phaser::skip`] (it's idle this round and will wake on
+//!   `slot_wake[i]`).
+//! - The phase advances when **all 6 slots have either arrived or
+//!   skipped**. Arrived slots block until the advance; skipped slots
+//!   return immediately and re-poll their wake state.
+//! - The phaser uses a generation counter so a slot that arrives "early"
+//!   in the next phase doesn't see the prior phase's "all arrived"
+//!   condition.
+//! - Defensive timeout: [`Phaser::arrive_and_wait_timeout`] returns
+//!   [`PhaserOutcome::Timeout`] if a peer crashes / hangs. Callers
+//!   typically convert this into a graceful shutdown rather than
+//!   panicking, so the rest of the topology can tear down cleanly.
+//!
+//! ## Memory ordering
+//!
+//! - The participant counter (`arrived` + `skipped`) uses `AcqRel` on
+//!   the increment so the last-to-arrive thread sees a consistent
+//!   "everyone is here" snapshot.
+//! - The generation `phase` is read with `Acquire` in arrivers' wait
+//!   loops; the advancing thread stores with `Release` after bumping.
+//! - The condvar's broadcast publishes the phase; the wait loop
+//!   re-checks `phase` against its captured value to defend against
+//!   spurious wakeups.
+
+use std::sync::atomic::{AtomicU32, Ordering};
+use std::sync::{Condvar, Mutex};
+use std::time::{Duration, Instant};
+
+/// Outcome of a phaser arrival.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum PhaserOutcome {
+    /// All participants arrived/skipped — phase advanced. Caller proceeds
+    /// into the next quantum.
+    Advanced,
+    /// Defensive timeout fired before all peers arrived. Caller should
+    /// log + initiate shutdown rather than retry.
+    Timeout,
+    /// Phaser was shut down via [`Phaser::shutdown`]; all waiters are
+    /// woken and return this. Caller exits cleanly.
+    Shutdown,
+}
+
+/// Custom barrier-with-skip primitive. Construct once with the number of
+/// participating slots; share via `Arc` across host threads.
+pub struct Phaser {
+    /// Total participant count (constant after construction). For our
+    /// scheduler this is `HW_THREAD_COUNT = 6`.
+    party_count: u32,
+    /// Monotonic phase counter, incremented every time the phase
+    /// advances. Used as a generation marker so a slot that wakes "into"
+    /// the next phase doesn't observe the old "everyone arrived" state.
+    phase: AtomicU32,
+    /// Inner state guarded by the condvar's mutex.
+    inner: Mutex<Inner>,
+    /// Notified when a phase advances or shutdown fires.
+    cv: Condvar,
+}
+
+#[derive(Debug)]
+struct Inner {
+    arrived_or_skipped: u32,
+    shutdown: bool,
+}
+
+impl Phaser {
+    /// Create a phaser with `party_count` participants. Panics if
+    /// `party_count == 0`.
+    pub fn new(party_count: u32) -> Self {
+        assert!(party_count > 0, "phaser party_count must be > 0");
+        Self {
+            party_count,
+            phase: AtomicU32::new(0),
+            inner: Mutex::new(Inner {
+                arrived_or_skipped: 0,
+                shutdown: false,
+            }),
+            cv: Condvar::new(),
+        }
+    }
+
+    /// Get the current phase number. Useful for tests and observability.
+    pub fn current_phase(&self) -> u32 {
+        self.phase.load(Ordering::Acquire)
+    }
+
+    /// Mark this slot as not participating in the current phase. Counts
+    /// toward the advance threshold but does not block. Used when a slot
+    /// has no runnable thread and is parked waiting on
+    /// `slot_wake[i].unpark()`.
+    ///
+    /// `_slot_id` is informational (not stored); the parameter exists so
+    /// call sites stay greppable.
+    pub fn skip(&self, _slot_id: u8) {
+        self.contribute_advance();
+    }
+
+    /// Block until the phase advances or the defensive 5-second timeout
+    /// fires. Returns [`PhaserOutcome::Advanced`] on a clean phase
+    /// transition; [`Timeout`] if a peer hung; [`Shutdown`] on tear-down.
+    ///
+    /// `_slot_id` is informational (see [`Self::skip`]).
+    pub fn arrive_and_wait(&self, _slot_id: u8) -> PhaserOutcome {
+        self.arrive_and_wait_timeout(Duration::from_secs(5))
+    }
+
+    /// Same as [`Self::arrive_and_wait`] with a caller-supplied timeout.
+    pub fn arrive_and_wait_timeout(&self, timeout: Duration) -> PhaserOutcome {
+        let pre_phase = self.phase.load(Ordering::Acquire);
+        self.contribute_advance();
+        let deadline = Instant::now() + timeout;
+        let mut guard = self.inner.lock().unwrap();
+        loop {
+            if guard.shutdown {
+                return PhaserOutcome::Shutdown;
+            }
+            if self.phase.load(Ordering::Acquire) != pre_phase {
+                return PhaserOutcome::Advanced;
+            }
+            let now = Instant::now();
+            if now >= deadline {
+                return PhaserOutcome::Timeout;
+            }
+            let remaining = deadline - now;
+            let result = self.cv.wait_timeout(guard, remaining).unwrap();
+            guard = result.0;
+            if result.1.timed_out() {
+                // Loop once more to disambiguate "real timeout" vs
+                // "spurious wakeup just before the deadline".
+                if self.phase.load(Ordering::Acquire) != pre_phase {
+                    return PhaserOutcome::Advanced;
+                }
+                if guard.shutdown {
+                    return PhaserOutcome::Shutdown;
+                }
+                return PhaserOutcome::Timeout;
+            }
+        }
+    }
+
+    /// Wake every parked arriver and signal shutdown. After this, all
+    /// future and outstanding `arrive_and_wait_*` calls return
+    /// [`PhaserOutcome::Shutdown`].
+    pub fn shutdown(&self) {
+        let mut guard = self.inner.lock().unwrap();
+        guard.shutdown = true;
+        self.cv.notify_all();
+    }
+
+    /// Common path for both arrive-and-wait and skip: bump the
+    /// participant counter, and if we were the last one in, advance the
+    /// phase + broadcast.
+    fn contribute_advance(&self) {
+        let mut guard = self.inner.lock().unwrap();
+        guard.arrived_or_skipped += 1;
+        if guard.arrived_or_skipped >= self.party_count {
+            // Last one in. Reset the counter, bump the phase, broadcast.
+            guard.arrived_or_skipped = 0;
+            // `Release` on the phase store pairs with `Acquire` reads in
+            // arriving slots' wait-loop predicates.
+            self.phase.fetch_add(1, Ordering::Release);
+            self.cv.notify_all();
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::sync::Arc;
+    use std::sync::atomic::AtomicU32;
+    use std::thread;
+
+    /// All N participants arrive — phase advances, every arriver returns
+    /// `Advanced`.
+    #[test]
+    fn n_arrivers_all_advance() {
+        const N: u32 = 6;
+        let p = Arc::new(Phaser::new(N));
+        let mut handles = Vec::new();
+        for i in 0..N {
+            let p = p.clone();
+            handles.push(
+                thread::Builder::new()
+                    .name(format!("phaser-test-{i}"))
+                    .spawn(move || p.arrive_and_wait(i as u8))
+                    .unwrap(),
+            );
+        }
+        for h in handles {
+            assert_eq!(h.join().unwrap(), PhaserOutcome::Advanced);
+        }
+        assert_eq!(p.current_phase(), 1);
+    }
+
+    /// 5 arrive + 1 skip → phase advances; arrivers see `Advanced`.
+    #[test]
+    fn skip_counts_toward_advance() {
+        const N: u32 = 6;
+        let p = Arc::new(Phaser::new(N));
+        let mut handles = Vec::new();
+        for i in 0..(N - 1) {
+            let p = p.clone();
+            handles.push(
+                thread::Builder::new()
+                    .name(format!("phaser-arrive-{i}"))
+                    .spawn(move || p.arrive_and_wait(i as u8))
+                    .unwrap(),
+            );
+        }
+        // Brief pause to let arrivers park first (exercising the
+        // skip-unblocks-arrivers path).
+        thread::sleep(Duration::from_millis(20));
+        p.skip((N - 1) as u8);
+        for h in handles {
+            assert_eq!(h.join().unwrap(), PhaserOutcome::Advanced);
+        }
+        assert_eq!(p.current_phase(), 1);
+    }
+
+    /// Shutdown wakes parked arrivers; they return `Shutdown`.
+    #[test]
+    fn shutdown_wakes_arrivers() {
+        const N: u32 = 6;
+        let p = Arc::new(Phaser::new(N));
+        let mut handles = Vec::new();
+        // Only N-1 arrive — phase will not advance.
+        for i in 0..(N - 1) {
+            let p = p.clone();
+            handles.push(
+                thread::Builder::new()
+                    .name(format!("phaser-arrive-shutdown-{i}"))
+                    .spawn(move || p.arrive_and_wait(i as u8))
+                    .unwrap(),
+            );
+        }
+        thread::sleep(Duration::from_millis(20));
+        p.shutdown();
+        for h in handles {
+            assert_eq!(h.join().unwrap(), PhaserOutcome::Shutdown);
+        }
+    }
+
+    /// Defensive timeout: if some peers never arrive, others surface
+    /// `Timeout` rather than blocking forever.
+    #[test]
+    fn timeout_fires_when_peer_hangs() {
+        const N: u32 = 4;
+        let p = Arc::new(Phaser::new(N));
+        // Only 2 of 4 arrive — others "hang".
+        let p1 = p.clone();
+        let h1 = thread::spawn(move || {
+            p1.arrive_and_wait_timeout(Duration::from_millis(50))
+        });
+        let p2 = p.clone();
+        let h2 = thread::spawn(move || {
+            p2.arrive_and_wait_timeout(Duration::from_millis(50))
+        });
+        assert_eq!(h1.join().unwrap(), PhaserOutcome::Timeout);
+        assert_eq!(h2.join().unwrap(), PhaserOutcome::Timeout);
+    }
+
+    /// Multi-phase stress: all participants run a tight loop of
+    /// arrive_and_wait calls; after K phases they all observe the same
+    /// `current_phase()` value. Catches generation/counter resync bugs.
+    #[test]
+    fn multi_phase_progress() {
+        const N: u32 = 6;
+        const K: u32 = 1000;
+        let p = Arc::new(Phaser::new(N));
+        let counter = Arc::new(AtomicU32::new(0));
+        let mut handles = Vec::new();
+        for i in 0..N {
+            let p = p.clone();
+            let c = counter.clone();
+            handles.push(
+                thread::Builder::new()
+                    .name(format!("phaser-multi-{i}"))
+                    .spawn(move || {
+                        for _ in 0..K {
+                            assert_eq!(
+                                p.arrive_and_wait(i as u8),
+                                PhaserOutcome::Advanced
+                            );
+                        }
+                        c.fetch_add(1, Ordering::Relaxed);
+                    })
+                    .unwrap(),
+            );
+        }
+        for h in handles {
+            h.join().unwrap();
+        }
+        assert_eq!(counter.load(Ordering::Relaxed), N);
+        assert_eq!(p.current_phase(), K);
+    }
+
+    /// Mixed skip/arrive across phases — emulates the realistic scheduler
+    /// pattern where slots become idle for some quanta.
+    #[test]
+    fn mixed_skip_and_arrive_random() {
+        const N: u32 = 6;
+        const K: u32 = 200;
+        let p = Arc::new(Phaser::new(N));
+        let mut handles = Vec::new();
+        for i in 0..N {
+            let p = p.clone();
+            handles.push(
+                thread::Builder::new()
+                    .name(format!("phaser-mixed-{i}"))
+                    .spawn(move || {
+                        // Pseudo-random skip pattern based on slot+phase
+                        let mut state: u32 = 0x9E37_79B9u32.wrapping_add(i);
+                        for phase in 0..K {
+                            state = state.wrapping_mul(0x6C8E_9CF7).wrapping_add(phase);
+                            if state & 0xF == 0 {
+                                p.skip(i as u8);
+                            } else {
+                                let _ = p.arrive_and_wait(i as u8);
+                            }
+                        }
+                    })
+                    .unwrap(),
+            );
+        }
+        for h in handles {
+            h.join().unwrap();
+        }
+        // After K rounds with all-N participation each phase, the phase
+        // counter equals K. Each iteration contributes exactly N to the
+        // counter (split between arrive and skip).
+        assert_eq!(p.current_phase(), K);
+    }
+}
--- a/crates/xenia-cpu/src/reservation.rs
+++ b/crates/xenia-cpu/src/reservation.rs
@@ -0,0 +1,424 @@
+//! Inter-thread reservation table for `lwarx`/`stwcx.` and
+//! `ldarx`/`stdcx.`.
+//!
+//! On real Xenon, each core's `lwarx` places a reservation on a 128-byte
+//! cache line; any other CPU's store to the line invalidates the
+//! reservation. `stwcx.`'s success depends on the reservation still being
+//! valid. Under M3's per-HW-thread parallelism, we need an inter-thread
+//! mechanism for the same guarantee.
+//!
+//! M2 introduces the table behind a runtime `reservations_enabled` flag
+//! (default `false`). When the flag is `false`, the interpreter's
+//! existing per-`PpcContext` `reserved_line`/`has_reservation` fields are
+//! used as-is — no inter-thread tracking. M3 flips the flag on once the
+//! per-HW-thread host threads are spawning.
+//!
+//! ## Design
+//!
+//! - **Banked AtomicU64 array** of [`NUM_LINES`] entries (4096 × 8 B =
+//!   32 KiB total). Each entry packs `(line_address, generation,
+//!   hw_id)`. A zero value means "no reservation on this bank".
+//! - **Hash function**: `(line >> 7) & (NUM_LINES - 1)`. Different lines
+//!   that map to the same bank conservatively invalidate each other's
+//!   reservations — sound (real Xenon's L2 has finite associativity and
+//!   has the same property), at the cost of slightly more `stwcx.`
+//!   failures than a perfect-mapping table would produce.
+//! - **`active_reservers: AtomicU16`** — a fast-path counter
+//!   incremented by every `lwarx` and decremented when its reservation is
+//!   either committed or invalidated. `write_u32` checks this with a
+//!   single `Relaxed` load; when zero (the common case in code that
+//!   doesn't use atomics), the invalidation hook is a one-instruction
+//!   skip.
+//! - **Generation counter**: monotonic across all reservations,
+//!   incremented atomically. 24 bits of generation packed in the slot
+//!   means 16 M reuses per slot before wraparound; at multi-million
+//!   reservations/sec sustained that's still many seconds, and a
+//!   stale-gen `stwcx.` simply fails (sound, not livelocking).
+//!
+//! ## Invariants
+//!
+//! 1. A `stwcx.(addr)` succeeds only if the line slot still holds the
+//!    same `(line, gen, hw_id)` triple the reserver stamped at `lwarx`.
+//! 2. Any plain store to a reserved line invalidates it (slot CASed to
+//!    zero). Hash-collision side-effect: a store to a different line
+//!    that maps to the same bank also invalidates — guests that observe
+//!    a `stwcx.` failure simply retry, so this is correctness-preserving.
+//! 3. `stwcx.` from a different `hw_id` than the reserver fails even if
+//!    the line and gen would otherwise match — only the originating HW
+//!    thread can commit its own reservation.
+//!
+//! Memory ordering: all CAS / store operations on the line slot use
+//! `AcqRel`; readers use `Acquire`. The store inside `stwcx.`'s payload
+//! itself (the actual data write) is the caller's responsibility — see
+//! [`crate::interpreter`]'s `stwcx.` arm.
+
+use std::sync::atomic::{AtomicU16, AtomicU64, Ordering};
+
+/// Real Xenon L2 cache-line size — the granule a reservation covers.
+pub const LINE_BYTES: u32 = 0x80;
+/// Mask to align an address to a cache-line boundary.
+pub const LINE_MASK: u32 = !(LINE_BYTES - 1);
+/// Number of bank entries in the reservation table. Power of two so the
+/// hash is a single AND. 32 KiB total at 8 B per entry.
+pub const NUM_LINES: usize = 4096;
+const HASH_MASK: u32 = (NUM_LINES as u32) - 1;
+
+/// Pack `(line_addr, generation, hw_id)` into a single u64. The packed
+/// layout is:
+///   bits 63..32: line address (we only need the high bits since the
+///                low 7 are always zero — reserved range is line-aligned)
+///   bits 31..8:  24-bit generation
+///   bits 7..0:   8-bit `hw_id`
+///
+/// A packed value of `0` means "no reservation". Since we never reserve
+/// on guest virtual address `0` (the page is unmapped) and the
+/// generation increments from `1`, zero is a safe sentinel.
+#[inline]
+pub fn pack(line_addr: u32, generation: u32, hw_id: u8) -> u64 {
+    debug_assert!(line_addr & !LINE_MASK == 0, "line_addr must be line-aligned");
+    debug_assert!(generation < (1 << 24), "generation must fit in 24 bits");
+    ((line_addr as u64) << 32)
+        | ((generation as u64 & 0xFF_FFFF) << 8)
+        | (hw_id as u64)
+}
+
+/// Inverse of [`pack`]. Returns `None` if the value is the zero sentinel
+/// (no reservation).
+#[inline]
+pub fn unpack(raw: u64) -> Option<(u32, u32, u8)> {
+    if raw == 0 {
+        return None;
+    }
+    let line = (raw >> 32) as u32;
+    let generation = ((raw >> 8) & 0xFF_FFFF) as u32;
+    let hw_id = (raw & 0xFF) as u8;
+    Some((line, generation, hw_id))
+}
+
+#[inline]
+fn hash(line_addr: u32) -> usize {
+    ((line_addr >> 7) & HASH_MASK) as usize
+}
+
+#[inline]
+fn align_to_line(addr: u32) -> u32 {
+    addr & LINE_MASK
+}
+
+/// Banked reservation table shared across all emulated HW threads. Built
+/// once per emulation instance; lives behind an `Arc` so worker host
+/// threads (M3) can hold their own clones without lifetime gymnastics.
+pub struct ReservationTable {
+    lines: Vec<AtomicU64>,
+    active_reservers: AtomicU16,
+    next_gen: AtomicU64,
+    /// Runtime activation flag. Default `false`. M2.8's
+    /// `--reservations-table` flag (or M3 spawn) flips this to `true`,
+    /// at which point the interpreter's `lwarx`/`stwcx.` arms route
+    /// through the table; otherwise they use the legacy per-`PpcContext`
+    /// reservation fields.
+    enabled: std::sync::atomic::AtomicBool,
+}
+
+impl Default for ReservationTable {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl ReservationTable {
+    /// Construct a fresh table with all banks empty.
+    pub fn new() -> Self {
+        let mut lines = Vec::with_capacity(NUM_LINES);
+        for _ in 0..NUM_LINES {
+            lines.push(AtomicU64::new(0));
+        }
+        Self {
+            lines,
+            active_reservers: AtomicU16::new(0),
+            // Start at 1 so the very first reservation gets a non-zero
+            // gen and the packed slot value is non-zero (zero is the
+            // "no reservation" sentinel).
+            next_gen: AtomicU64::new(1),
+            enabled: std::sync::atomic::AtomicBool::new(false),
+        }
+    }
+
+    /// Activate the table. The interpreter's `lwarx`/`stwcx.` arms will
+    /// route through this table on subsequent dispatches. Idempotent.
+    pub fn enable(&self) {
+        self.enabled
+            .store(true, std::sync::atomic::Ordering::Release);
+    }
+
+    /// Deactivate the table. The interpreter falls back to per-`PpcContext`
+    /// reservation fields. Idempotent.
+    pub fn disable(&self) {
+        self.enabled
+            .store(false, std::sync::atomic::Ordering::Release);
+    }
+
+    /// Whether the table is currently active. The interpreter consults
+    /// this on every `lwarx`/`stwcx.` to decide which path runs.
+    pub fn is_enabled(&self) -> bool {
+        self.enabled.load(std::sync::atomic::Ordering::Acquire)
+    }
+
+    /// True when at least one reservation is currently outstanding.
+    /// Plain `write_u32` consults this to skip the invalidation hook
+    /// when no thread holds a reservation — the common case for
+    /// non-atomic code.
+    #[inline]
+    pub fn has_active_reservers(&self) -> bool {
+        self.active_reservers.load(Ordering::Relaxed) > 0
+    }
+
+    /// `lwarx(addr)` — claim a reservation on the line containing `addr`.
+    /// Returns the generation stamped into the slot; the interpreter
+    /// stores this alongside the per-`PpcContext` `has_reservation` bit
+    /// so a subsequent `stwcx.` can verify the same gen still holds.
+    ///
+    /// If a different reservation already occupied the bank, it's
+    /// silently overwritten — that thread's `stwcx.` will fail because
+    /// the slot no longer matches its stamped gen. Matches Xenon
+    /// behavior (a different core's lwarx on the same line displaces
+    /// any prior reservation).
+    pub fn reserve(&self, addr: u32, hw_id: u8) -> u32 {
+        let line = align_to_line(addr);
+        let generation = (self
+            .next_gen
+            .fetch_add(1, Ordering::Relaxed)
+            & 0xFF_FFFF) as u32;
+        let new_raw = pack(line, generation, hw_id);
+        // Release: prior reads of the reservation target should
+        // happen-before any thread that observes the new slot value.
+        let prev = self.lines[hash(line)].swap(new_raw, Ordering::AcqRel);
+        // If the previous slot was non-zero, the displaced reserver is
+        // implicitly invalidated — decrement the active counter for it.
+        // Else, increment for our new reservation. Net effect: the
+        // counter equals the number of *bank slots* with a non-zero
+        // value, which is an upper bound on actual reservers.
+        if prev == 0 {
+            self.active_reservers.fetch_add(1, Ordering::Relaxed);
+        }
+        generation
+    }
+
+    /// `stwcx.(addr)` — try to commit a reservation. Returns `true` if
+    /// the slot still holds `(line, my_gen, my_hw_id)` (in which case
+    /// it's CAS'd back to zero, releasing the bank), `false` otherwise.
+    /// The data store itself is the caller's responsibility — see
+    /// [`crate::interpreter`]'s `stwcx.` arm.
+    pub fn try_commit(&self, addr: u32, my_gen: u32, my_hw_id: u8) -> bool {
+        let line = align_to_line(addr);
+        let expected = pack(line, my_gen, my_hw_id);
+        match self.lines[hash(line)].compare_exchange(
+            expected,
+            0,
+            Ordering::AcqRel,
+            Ordering::Relaxed,
+        ) {
+            Ok(_) => {
+                // Successfully released the slot; decrement the active
+                // count.
+                self.active_reservers.fetch_sub(1, Ordering::Relaxed);
+                true
+            }
+            Err(_) => false,
+        }
+    }
+
+    /// Hook for plain (non-reserving) stores: invalidate any
+    /// reservation on the containing line. Cheap when the bank is
+    /// already empty (single Acquire load + branch).
+    pub fn invalidate_for_write(&self, addr: u32) {
+        let line = align_to_line(addr);
+        let bank = &self.lines[hash(line)];
+        let prev = bank.load(Ordering::Acquire);
+        if prev == 0 {
+            return;
+        }
+        // Verify the slot still holds a reservation on *this* line
+        // before clearing — hash collisions mean the bank may hold a
+        // reservation on an unrelated line that maps to the same slot.
+        // Real Xenon has the same property (limited L2 associativity);
+        // we mirror it here. A spurious bank match invalidates a
+        // different line's reservation; the affected `stwcx.` retries —
+        // sound, slightly less efficient.
+        if let Some((bank_line, _generation, _hw)) = unpack(prev) {
+            if bank_line != line {
+                // Different line in the same bank — leave it alone (we
+                // chose not to invalidate cross-line collisions to
+                // reduce false-fail noise; real-HW behavior is similar
+                // since L2 associativity sets cross-line constraints).
+                return;
+            }
+        }
+        // CAS-clear the bank if it still holds the value we observed.
+        // If a concurrent `stwcx.` or `reserve` raced with us, the CAS
+        // fails — that's fine; the line slot is now in a different
+        // state and the displaced reservation will be picked up there.
+        if bank
+            .compare_exchange(prev, 0, Ordering::AcqRel, Ordering::Relaxed)
+            .is_ok()
+        {
+            self.active_reservers.fetch_sub(1, Ordering::Relaxed);
+        }
+    }
+
+    /// Drop a per-`PpcContext` reservation without committing. Called
+    /// when the interpreter clears `has_reservation` due to a
+    /// non-`stwcx.` event (context switch, exception, etc.). Safe to
+    /// call when the table doesn't hold our reservation anymore (the
+    /// CAS simply fails).
+    pub fn release(&self, addr: u32, my_gen: u32, my_hw_id: u8) {
+        let _ = self.try_commit(addr, my_gen, my_hw_id);
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::sync::Arc;
+    use std::thread;
+
+    #[test]
+    fn pack_unpack_roundtrip() {
+        let raw = pack(0x1000_0000, 42, 5);
+        let (line, generation, hw) = unpack(raw).unwrap();
+        assert_eq!(line, 0x1000_0000);
+        assert_eq!(generation, 42);
+        assert_eq!(hw, 5);
+    }
+
+    #[test]
+    fn unpack_zero_is_none() {
+        assert!(unpack(0).is_none());
+    }
+
+    #[test]
+    fn reserve_then_commit_succeeds() {
+        let t = ReservationTable::new();
+        let gn = t.reserve(0x1234, 0);
+        assert!(t.try_commit(0x1234, gn, 0));
+        // Already released — second commit fails.
+        assert!(!t.try_commit(0x1234, gn, 0));
+    }
+
+    #[test]
+    fn other_hw_id_cannot_commit() {
+        let t = ReservationTable::new();
+        let gn = t.reserve(0x1234, 0);
+        assert!(
+            !t.try_commit(0x1234, gn, 1),
+            "stwcx. from a different hw_id must fail"
+        );
+        // Original owner can still commit.
+        assert!(t.try_commit(0x1234, gn, 0));
+    }
+
+    #[test]
+    fn lwarx_displaces_prior_reservation() {
+        let t = ReservationTable::new();
+        let g0 = t.reserve(0x1234, 0);
+        // Different HW thread's lwarx on the same line.
+        let g1 = t.reserve(0x1234, 1);
+        // Original reserver's stwcx. fails because the gen changed.
+        assert!(!t.try_commit(0x1234, g0, 0));
+        // New reserver's stwcx. succeeds.
+        assert!(t.try_commit(0x1234, g1, 1));
+    }
+
+    #[test]
+    fn invalidate_clears_matching_reservation() {
+        let t = ReservationTable::new();
+        let gn = t.reserve(0x1234, 0);
+        t.invalidate_for_write(0x1238); // same line as 0x1234
+        assert!(!t.try_commit(0x1234, gn, 0));
+        assert_eq!(t.active_reservers.load(Ordering::Relaxed), 0);
+    }
+
+    #[test]
+    fn invalidate_different_line_in_same_bank_is_noop() {
+        let t = ReservationTable::new();
+        // Force a hash collision: addr A and addr B with same hash but
+        // different line addresses.
+        let line_a = 0x0000_1000;
+        let line_b = line_a + ((NUM_LINES as u32) << 7); // +0x80000 → same hash
+        assert_eq!(hash(line_a), hash(line_b));
+        let gn = t.reserve(line_a, 0);
+        // Invalidating line_b must NOT clear line_a's reservation.
+        t.invalidate_for_write(line_b);
+        assert!(t.try_commit(line_a, gn, 0));
+    }
+
+    #[test]
+    fn has_active_reservers_tracks_count() {
+        let t = ReservationTable::new();
+        assert!(!t.has_active_reservers());
+        let g0 = t.reserve(0x1000, 0);
+        assert!(t.has_active_reservers());
+        let g1 = t.reserve(0x2000, 1);
+        assert!(t.has_active_reservers());
+        t.try_commit(0x1000, g0, 0);
+        assert!(t.has_active_reservers());
+        t.try_commit(0x2000, g1, 1);
+        assert!(!t.has_active_reservers());
+    }
+
+    /// Stress test: 8 host threads each loop reserve+stwcx on the same
+    /// line. Exactly one stwcx per round can win; the others fail and
+    /// retry. The total number of *successful* commits across N
+    /// outer iterations equals N (one winner per round).
+    ///
+    /// This proves the table's mutual-exclusion property: at most one
+    /// thread's stwcx. on a given line can succeed between two events
+    /// that would invalidate the line.
+    #[test]
+    fn concurrent_lwarx_stwcx_serializes() {
+        let t = Arc::new(ReservationTable::new());
+        const ROUNDS: u32 = 1000;
+        const THREADS: u8 = 8;
+        let total_successes = Arc::new(AtomicU64::new(0));
+
+        let mut handles = Vec::new();
+        for hw_id in 0..THREADS {
+            let t_clone = t.clone();
+            let s_clone = total_successes.clone();
+            handles.push(
+                thread::Builder::new()
+                    .name(format!("res-stress-{hw_id}"))
+                    .spawn(move || {
+                        let mut wins = 0u64;
+                        for _ in 0..ROUNDS {
+                            let gn = t_clone.reserve(0x1234_5678, hw_id);
+                            if t_clone.try_commit(0x1234_5678, gn, hw_id) {
+                                wins += 1;
+                            }
+                        }
+                        s_clone.fetch_add(wins, Ordering::Relaxed);
+                    })
+                    .expect("spawn"),
+            );
+        }
+        for h in handles {
+            h.join().expect("join");
+        }
+        let total = total_successes.load(Ordering::Relaxed);
+        // Lower bound: every round had at least one winner — but races
+        // can cause some rounds to have zero (all threads' reservations
+        // got displaced before any could commit). Assert progress: at
+        // least 10% of attempts succeed, and active_reservers is back
+        // to zero.
+        let attempts = ROUNDS as u64 * THREADS as u64;
+        assert!(
+            total > attempts / 10,
+            "expected at least 10% successful commits, got {total}/{attempts}"
+        );
+        assert_eq!(
+            t.active_reservers.load(Ordering::Relaxed),
+            0,
+            "all reservations should have been resolved"
+        );
+    }
+}
--- a/crates/xenia-cpu/src/scheduler.rs
+++ b/crates/xenia-cpu/src/scheduler.rs
--- a/crates/xenia-cpu/src/trap.rs
+++ b/crates/xenia-cpu/src/trap.rs
@@ -0,0 +1,95 @@
+//! TO-field evaluation for `tw`, `twi`, `td`, `tdi`.
+//!
+//! The TO field (5 bits) encodes which comparison outcomes trigger a trap:
+//!
+//! | bit | condition |
+//! |-----|-----------|
+//! | 0   | a <  b (signed)   |
+//! | 1   | a >  b (signed)   |
+//! | 2   | a == b            |
+//! | 3   | a <  b (unsigned) |
+//! | 4   | a >  b (unsigned) |
+//!
+//! The bit numbering matches PowerISA ("MSB is bit 0"): TO[0] corresponds to
+//! the high bit of the 5-bit field, i.e. (to >> 4) & 1.
+//!
+//! `tw` / `twi` compare the low 32 bits of the operands (sign-extended back to
+//! 64 for the signed comparison); `td` / `tdi` compare the full 64 bits.
+
+#[derive(Clone, Copy, Debug)]
+pub enum TrapWidth {
+    Word,        // tw, twi: 32-bit
+    Doubleword,  // td, tdi: 64-bit
+}
+
+const TO_SLT: u32 = 1 << 4; // a < b  signed
+const TO_SGT: u32 = 1 << 3; // a > b  signed
+const TO_EQ:  u32 = 1 << 2; // a == b
+const TO_ULT: u32 = 1 << 1; // a < b  unsigned
+const TO_UGT: u32 = 1 << 0; // a > b  unsigned
+
+/// Returns true when the trap should fire.
+pub fn evaluate(to: u32, a: u64, b: u64, width: TrapWidth) -> bool {
+    let (sa, sb, ua, ub): (i64, i64, u64, u64) = match width {
+        TrapWidth::Word => (
+            a as i32 as i64,
+            b as i32 as i64,
+            a as u32 as u64,
+            b as u32 as u64,
+        ),
+        TrapWidth::Doubleword => (a as i64, b as i64, a, b),
+    };
+
+    if (to & TO_SLT) != 0 && sa <  sb { return true; }
+    if (to & TO_SGT) != 0 && sa >  sb { return true; }
+    if (to & TO_EQ)  != 0 && ua == ub { return true; }
+    if (to & TO_ULT) != 0 && ua <  ub { return true; }
+    if (to & TO_UGT) != 0 && ua >  ub { return true; }
+    false
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn to_zero_never_traps() {
+        assert!(!evaluate(0, 0, 0, TrapWidth::Doubleword));
+        assert!(!evaluate(0, 5, 3, TrapWidth::Doubleword));
+        assert!(!evaluate(0, !0, 0, TrapWidth::Doubleword));
+    }
+
+    #[test]
+    fn to_31_always_traps_when_any_condition_holds() {
+        // 31 = 0b11111 = all conditions enabled
+        assert!(evaluate(31, 1, 2, TrapWidth::Doubleword)); // slt+ult
+        assert!(evaluate(31, 2, 1, TrapWidth::Doubleword)); // sgt+ugt
+        assert!(evaluate(31, 7, 7, TrapWidth::Doubleword)); // eq
+    }
+
+    #[test]
+    fn to_eq_only() {
+        // TO[2] = 0b00100 = 4
+        assert!(evaluate(4, 5, 5, TrapWidth::Doubleword));
+        assert!(!evaluate(4, 5, 6, TrapWidth::Doubleword));
+    }
+
+    #[test]
+    fn to_signed_vs_unsigned_on_negative() {
+        // a=-1 (as u64 = all-ones). TO[0]=slt enabled = 0b10000 = 16
+        // Signed: -1 < 0  → true
+        let neg1 = (-1i64) as u64;
+        assert!(evaluate(16, neg1, 0, TrapWidth::Doubleword));
+        // TO[3]=ult enabled = 0b00010 = 2 → unsigned: all-ones < 0 is false
+        assert!(!evaluate(2, neg1, 0, TrapWidth::Doubleword));
+    }
+
+    #[test]
+    fn word_width_ignores_high_32_bits() {
+        // a's low 32 = 1, high 32 = different; b = 1. With TO=eq, should trap.
+        let a = 0xDEAD_BEEF_0000_0001u64;
+        assert!(evaluate(4, a, 1, TrapWidth::Word));
+        // In doubleword, different.
+        assert!(!evaluate(4, a, 1, TrapWidth::Doubleword));
+    }
+}
--- a/crates/xenia-cpu/src/vmx.rs
+++ b/crates/xenia-cpu/src/vmx.rs
@@ -0,0 +1,918 @@
+//! VMX / AltiVec helper routines shared by the interpreter's 150+ vector
+//! opcode handlers.
+//!
+//! Big-endian lane indexing throughout: `Vec128::bytes[0]` is the most
+//! significant byte, which corresponds to PowerPC lane 0. Operations that
+//! care about "even" vs "odd" lanes follow the PPC convention (lane 0 = most
+//! significant = "even" for multiply-even/odd purposes).
+
+use xenia_memory::MemoryAccess;
+use xenia_types::Vec128;
+
+// ─── Lane accessors ────────────────────────────────────────────────────────
+
+#[inline] pub fn as_i8x16(v: Vec128) -> [i8; 16] {
+    let b = v.as_bytes();
+    let mut r = [0i8; 16];
+    for i in 0..16 { r[i] = b[i] as i8; }
+    r
+}
+
+#[inline] pub fn as_i16x8(v: Vec128) -> [i16; 8] {
+    let u = v.as_u16x8();
+    [u[0] as i16, u[1] as i16, u[2] as i16, u[3] as i16,
+     u[4] as i16, u[5] as i16, u[6] as i16, u[7] as i16]
+}
+
+#[inline] pub fn as_i32x4(v: Vec128) -> [i32; 4] {
+    let u = v.as_u32x4();
+    [u[0] as i32, u[1] as i32, u[2] as i32, u[3] as i32]
+}
+
+#[inline] pub fn from_i8x16(r: [i8; 16]) -> Vec128 {
+    let mut b = [0u8; 16];
+    for i in 0..16 { b[i] = r[i] as u8; }
+    Vec128::from_bytes(b)
+}
+
+#[inline] pub fn from_i16x8(r: [i16; 8]) -> Vec128 {
+    Vec128::from_u16x8_array([
+        r[0] as u16, r[1] as u16, r[2] as u16, r[3] as u16,
+        r[4] as u16, r[5] as u16, r[6] as u16, r[7] as u16,
+    ])
+}
+
+#[inline] pub fn from_i32x4(r: [i32; 4]) -> Vec128 {
+    Vec128::from_u32x4_array([r[0] as u32, r[1] as u32, r[2] as u32, r[3] as u32])
+}
+
+// ─── Saturation helpers ────────────────────────────────────────────────────
+// Each returns (clamped_value, saturated_flag). Handlers OR the flags together
+// and call `ctx.set_vscr_sat(true)` once per instruction.
+
+#[inline] pub fn sat_add_u8(a: u8, b: u8) -> (u8, bool) {
+    let s = a as u16 + b as u16;
+    if s > u8::MAX as u16 { (u8::MAX, true) } else { (s as u8, false) }
+}
+#[inline] pub fn sat_sub_u8(a: u8, b: u8) -> (u8, bool) {
+    if a >= b { (a - b, false) } else { (0, true) }
+}
+#[inline] pub fn sat_add_i8(a: i8, b: i8) -> (i8, bool) {
+    let s = a as i16 + b as i16;
+    if s > i8::MAX as i16 { (i8::MAX, true) }
+    else if s < i8::MIN as i16 { (i8::MIN, true) }
+    else { (s as i8, false) }
+}
+#[inline] pub fn sat_sub_i8(a: i8, b: i8) -> (i8, bool) {
+    let s = a as i16 - b as i16;
+    if s > i8::MAX as i16 { (i8::MAX, true) }
+    else if s < i8::MIN as i16 { (i8::MIN, true) }
+    else { (s as i8, false) }
+}
+
+#[inline] pub fn sat_add_u16(a: u16, b: u16) -> (u16, bool) {
+    let s = a as u32 + b as u32;
+    if s > u16::MAX as u32 { (u16::MAX, true) } else { (s as u16, false) }
+}
+#[inline] pub fn sat_sub_u16(a: u16, b: u16) -> (u16, bool) {
+    if a >= b { (a - b, false) } else { (0, true) }
+}
+#[inline] pub fn sat_add_i16(a: i16, b: i16) -> (i16, bool) {
+    let s = a as i32 + b as i32;
+    if s > i16::MAX as i32 { (i16::MAX, true) }
+    else if s < i16::MIN as i32 { (i16::MIN, true) }
+    else { (s as i16, false) }
+}
+#[inline] pub fn sat_sub_i16(a: i16, b: i16) -> (i16, bool) {
+    let s = a as i32 - b as i32;
+    if s > i16::MAX as i32 { (i16::MAX, true) }
+    else if s < i16::MIN as i32 { (i16::MIN, true) }
+    else { (s as i16, false) }
+}
+
+#[inline] pub fn sat_add_u32(a: u32, b: u32) -> (u32, bool) {
+    let s = a as u64 + b as u64;
+    if s > u32::MAX as u64 { (u32::MAX, true) } else { (s as u32, false) }
+}
+#[inline] pub fn sat_sub_u32(a: u32, b: u32) -> (u32, bool) {
+    if a >= b { (a - b, false) } else { (0, true) }
+}
+#[inline] pub fn sat_add_i32(a: i32, b: i32) -> (i32, bool) {
+    let s = a as i64 + b as i64;
+    if s > i32::MAX as i64 { (i32::MAX, true) }
+    else if s < i32::MIN as i64 { (i32::MIN, true) }
+    else { (s as i32, false) }
+}
+#[inline] pub fn sat_sub_i32(a: i32, b: i32) -> (i32, bool) {
+    let s = a as i64 - b as i64;
+    if s > i32::MAX as i64 { (i32::MAX, true) }
+    else if s < i32::MIN as i64 { (i32::MIN, true) }
+    else { (s as i32, false) }
+}
+
+// Pack-with-saturation helpers — clamp a wider integer to the narrower type.
+#[inline] pub fn sat_i16_to_i8(v: i16) -> (i8, bool) {
+    if v > i8::MAX as i16 { (i8::MAX, true) }
+    else if v < i8::MIN as i16 { (i8::MIN, true) }
+    else { (v as i8, false) }
+}
+#[inline] pub fn sat_i16_to_u8(v: i16) -> (u8, bool) {
+    if v < 0 { (0, true) }
+    else if v > u8::MAX as i16 { (u8::MAX, true) }
+    else { (v as u8, false) }
+}
+#[inline] pub fn sat_u16_to_u8(v: u16) -> (u8, bool) {
+    if v > u8::MAX as u16 { (u8::MAX, true) } else { (v as u8, false) }
+}
+#[inline] pub fn sat_i32_to_i16(v: i32) -> (i16, bool) {
+    if v > i16::MAX as i32 { (i16::MAX, true) }
+    else if v < i16::MIN as i32 { (i16::MIN, true) }
+    else { (v as i16, false) }
+}
+#[inline] pub fn sat_i32_to_u16(v: i32) -> (u16, bool) {
+    if v < 0 { (0, true) }
+    else if v > u16::MAX as i32 { (u16::MAX, true) }
+    else { (v as u16, false) }
+}
+#[inline] pub fn sat_u32_to_u16(v: u32) -> (u16, bool) {
+    if v > u16::MAX as u32 { (u16::MAX, true) } else { (v as u16, false) }
+}
+#[inline] pub fn sat_i64_to_i32(v: i64) -> (i32, bool) {
+    if v > i32::MAX as i64 { (i32::MAX, true) }
+    else if v < i32::MIN as i64 { (i32::MIN, true) }
+    else { (v as i32, false) }
+}
+#[inline] pub fn sat_i64_to_u32(v: i64) -> (u32, bool) {
+    if v < 0 { (0, true) }
+    else if v > u32::MAX as i64 { (u32::MAX, true) }
+    else { (v as u32, false) }
+}
+
+// ─── Averages ──────────────────────────────────────────────────────────────
+// PPC avg is rounded up: (a + b + 1) / 2.
+#[inline] pub fn avg_u8(a: u8, b: u8) -> u8 {
+    ((a as u16 + b as u16 + 1) >> 1) as u8
+}
+#[inline] pub fn avg_u16(a: u16, b: u16) -> u16 {
+    ((a as u32 + b as u32 + 1) >> 1) as u16
+}
+#[inline] pub fn avg_u32(a: u32, b: u32) -> u32 {
+    ((a as u64 + b as u64 + 1) >> 1) as u32
+}
+#[inline] pub fn avg_i8(a: i8, b: i8) -> i8 {
+    ((a as i32 + b as i32 + 1) >> 1) as i8
+}
+#[inline] pub fn avg_i16(a: i16, b: i16) -> i16 {
+    ((a as i32 + b as i32 + 1) >> 1) as i16
+}
+#[inline] pub fn avg_i32(a: i32, b: i32) -> i32 {
+    ((a as i64 + b as i64 + 1) >> 1) as i32
+}
+
+// ─── NaN-aware f32 min/max for vmaxfp / vminfp ────────────────────────────
+//
+// Altivec PEM: "If either element of vA or vB is a NaN, the corresponding
+// element of vD is set to the quiet NaN form of that NaN". Rust's `>` / `<`
+// comparison with NaN always returns false, so `if a > b { a } else { b }`
+// would silently pick `b` whenever `a` is NaN — losing NaN propagation.
+
+#[inline]
+pub fn max_nan(a: f32, b: f32) -> f32 {
+    if a.is_nan() { quiet_nan(a) }
+    else if b.is_nan() { quiet_nan(b) }
+    else if a > b { a } else { b }
+}
+
+#[inline]
+pub fn min_nan(a: f32, b: f32) -> f32 {
+    if a.is_nan() { quiet_nan(a) }
+    else if b.is_nan() { quiet_nan(b) }
+    else if a < b { a } else { b }
+}
+
+/// Convert an SNaN to QNaN by setting the high mantissa bit. A QNaN is
+/// returned unchanged.
+#[inline]
+pub fn quiet_nan(x: f32) -> f32 {
+    if !x.is_nan() { return x; }
+    f32::from_bits(x.to_bits() | 0x0040_0000)
+}
+
+/// Flush a subnormal f32 to ±0 (preserving the sign). Used by vmaddfp family,
+/// vctsxs / vctuxs, and any instruction whose AltiVec definition specifies
+/// input-side denormal flushing regardless of VSCR[NJ].
+#[inline]
+pub fn flush_denorm(x: f32) -> f32 {
+    if x.is_subnormal() {
+        if x.is_sign_negative() { -0.0 } else { 0.0 }
+    } else {
+        x
+    }
+}
+
+// ─── Float ⇄ fixed-point conversions (scaled by 2^scale_bits) ─────────────
+//
+// vctsxs / vctuxs flush denormal inputs to 0 before scaling, per Altivec.
+#[inline] pub fn cvt_f32_to_i32_sat(x: f32, scale_bits: u32) -> (i32, bool) {
+    if x.is_nan() { return (0, true); }
+    let x = flush_denorm(x);
+    let scaled = (x as f64) * ((1u64 << scale_bits) as f64);
+    if scaled >= i32::MAX as f64 { return (i32::MAX, true); }
+    if scaled <= i32::MIN as f64 { return (i32::MIN, true); }
+    (scaled.trunc() as i32, false)
+}
+#[inline] pub fn cvt_f32_to_u32_sat(x: f32, scale_bits: u32) -> (u32, bool) {
+    if x.is_nan() { return (0, true); }
+    let x = flush_denorm(x);
+    let scaled = (x as f64) * ((1u64 << scale_bits) as f64);
+    if scaled < 0.0 { return (0, true); }
+    if scaled > u32::MAX as f64 { return (u32::MAX, true); }
+    (scaled.trunc() as u32, false)
+}
+#[inline] pub fn cvt_i32_to_f32(v: i32, scale_bits: u32) -> f32 {
+    (v as f64 / (1u64 << scale_bits) as f64) as f32
+}
+#[inline] pub fn cvt_u32_to_f32(v: u32, scale_bits: u32) -> f32 {
+    (v as f64 / (1u64 << scale_bits) as f64) as f32
+}
+
+// ─── Unaligned vector load/store ──────────────────────────────────────────
+//
+// lvlx/lvrx and stvlx/stvrx combine to perform any unaligned 16-byte access:
+//   lvlx(EA) | lvrx(EA + 16)   loads 16 bytes starting at unaligned EA.
+//   stvlx(EA); stvrx(EA + 16)   stores 16 bytes starting at unaligned EA.
+//
+// Semantics per the AltiVec manual (and xenia-canary ppc_emit_memory.cc):
+//   lvlx: shift = EA & 0xF, n = 16 - shift. Loads mem[EA..EA+n] into
+//         lanes VR[0..n], zeros VR[n..16].
+//   lvrx: shift = EA & 0xF. If shift == 0, VR = 0. Otherwise loads
+//         mem[EA-shift..EA] into lanes VR[16-shift..16], zeros VR[0..16-shift].
+//   stvlx / stvrx are the symmetric stores.
+//
+// `Vec128::bytes[0]` is the most significant byte (PPC lane 0 in BE view).
+
+pub fn load_vector_left(mem: &dyn MemoryAccess, ea: u32) -> Vec128 {
+    let shift = (ea & 0xF) as usize;
+    let n = 16 - shift;
+    let mut bytes = [0u8; 16];
+    for i in 0..n {
+        bytes[i] = mem.read_u8(ea.wrapping_add(i as u32));
+    }
+    Vec128::from_bytes(bytes)
+}
+
+pub fn load_vector_right(mem: &dyn MemoryAccess, ea: u32) -> Vec128 {
+    let shift = (ea & 0xF) as usize;
+    if shift == 0 { return Vec128::ZERO; }
+    let base = ea & !0xFu32;
+    let mut bytes = [0u8; 16];
+    for i in 0..shift {
+        bytes[16 - shift + i] = mem.read_u8(base.wrapping_add(i as u32));
+    }
+    Vec128::from_bytes(bytes)
+}
+
+pub fn store_vector_left(mem: &dyn MemoryAccess, ea: u32, v: Vec128) {
+    let shift = (ea & 0xF) as usize;
+    let n = 16 - shift;
+    let b = v.as_bytes();
+    for i in 0..n {
+        mem.write_u8(ea.wrapping_add(i as u32), b[i]);
+    }
+}
+
+pub fn store_vector_right(mem: &dyn MemoryAccess, ea: u32, v: Vec128) {
+    let shift = (ea & 0xF) as usize;
+    if shift == 0 { return; }
+    let base = ea & !0xFu32;
+    let b = v.as_bytes();
+    for i in 0..shift {
+        mem.write_u8(base.wrapping_add(i as u32), b[16 - shift + i]);
+    }
+}
+
+// ─── 5-6-5 pixel pack (vpkpx / vupkhpx / vupklpx) ─────────────────────────
+// PPC vpkpx takes a 32-bit RGB lane and packs it into a 16-bit 1-5-5-5 pixel.
+// vupkhpx / vupklpx reverse the operation.
+//
+// Format: input 32-bit word holds
+//     bits 0-6: unused (0)
+//     bit 7:    alpha-select (→ bit 15 of output)
+//     bits 8-15:  R (top 5 bits kept)
+//     bits 16-23: G (top 5 bits kept)
+//     bits 24-31: B (top 5 bits kept)
+// Output 16-bit word:
+//     bit 15:   A (from input bit 7)
+//     bits 10-14: R
+//     bits 5-9:   G
+//     bits 0-4:   B
+
+#[inline] pub fn pack_pixel_555(input: u32) -> u16 {
+    let a = (input >> 7) & 0x1;
+    let r = (input >> 8) & 0xFF;
+    let g = (input >> 16) & 0xFF;
+    let b = (input >> 24) & 0xFF;
+    ((a << 15) | ((r & 0xF8) << 7) | ((g & 0xF8) << 2) | ((b & 0xF8) >> 3)) as u16
+}
+
+#[inline] pub fn unpack_pixel_555(input: u16) -> u32 {
+    let input = input as u32;
+    let a = (input >> 15) & 0x1;
+    let r = (input >> 10) & 0x1F;
+    let g = (input >> 5) & 0x1F;
+    let b = input & 0x1F;
+    // Sign-extend A and replicate 5-bit RGB into the top of each byte.
+    let a8 = if a != 0 { 0xFFu32 } else { 0 };
+    let r8 = (r << 3) | (r >> 2);
+    let g8 = (g << 3) | (g >> 2);
+    let b8 = (b << 3) | (b >> 2);
+    (a8 << 24) | (r8 << 16) | (g8 << 8) | b8
+}
+
+// ─── VMX128 D3D pack/unpack dispatch ──────────────────────────────────────
+// `vpkd3d128` / `vupkd3d128` encode a small enum in the instruction word
+// (VX128_4 immediate field). The exact enum lives in canary's
+// ppc_emit_altivec.cc under PACK_TYPE_*; titles usually touch D3DCOLOR
+// (type 0) and a handful of texture-coordinate variants.
+//
+// Rather than risk getting a rarely-used sub-case wrong, we implement the
+// common types and fall back to a warning + pass-through for unknown types.
+// Returning the VB register value unchanged is always preferable to emitting
+// StepResult::Unimplemented because it keeps the interpreter running.
+
+/// Pack-type encoding of `vpkd3d128` / `vupkd3d128`.
+///
+/// The immediate field lives at PPC bits 16-22 (VX128_3/4 IMM, 7 bits).
+/// Canary decodes `type = IMM >> 2` (top 5 bits) and `pack = IMM & 0x3`
+/// (low 2 bits, used only by `vpkd3d128` to select output-slot layout).
+/// Valid `type` values are 0..=6 per `ppc_emit_altivec.cc:2095-2118`:
+///
+/// | id | canary name       | format                                |
+/// |----|-------------------|---------------------------------------|
+/// |  0 | VPACK_D3DCOLOR    | 4 f32 [0,1] ↔ ARGB8                   |
+/// |  1 | VPACK_NORMSHORT2  | 2 f32 [-1,1] ↔ 2× signed-normalized i16 |
+/// |  2 | VPACK_NORMPACKED32| 4 f32 [-1,1] ↔ UINT_2101010 (w:2,z:10,y:10,x:10) |
+/// |  3 | VPACK_FLOAT16_2   | 2 f32 ↔ 2× fp16                       |
+/// |  4 | VPACK_NORMSHORT4  | 4 f32 [-1,1] ↔ 4× signed-normalized i16 |
+/// |  5 | VPACK_FLOAT16_4   | 4 f32 ↔ 4× fp16                       |
+/// |  6 | VPACK_NORMPACKED64| 4 f32 [-1,1] ↔ ULONG_4202020 (w:4,z:20,y:20,x:20) |
+///
+/// Prior (M3-pre) this enum listed made-up "Normal16"/"Normal8"/"UByteN4"
+/// variants that didn't match canary; the immediate extraction was also
+/// wrong (LSB-numbered `>>6 & 0x7` instead of MSB-numbered `>>11 & 0x1F`
+/// against a 7-bit IMM field). M3 fixes both.
+#[derive(Debug, Clone, Copy)]
+pub enum D3dPackType {
+    D3dColor,
+    NormShort2,
+    NormPacked32,
+    Float16_2,
+    NormShort4,
+    Float16_4,
+    NormPacked64,
+    Other(u32),
+}
+
+impl D3dPackType {
+    /// Decode the `type` bits extracted from the VX128_3/4 IMM field via
+    /// canary's `IMM >> 2` convention (i.e. the caller has already divided
+    /// out the 2-bit `pack` subfield).
+    pub fn from_immediate(type_bits: u32) -> Self {
+        match type_bits {
+            0 => Self::D3dColor,
+            1 => Self::NormShort2,
+            2 => Self::NormPacked32,
+            3 => Self::Float16_2,
+            4 => Self::NormShort4,
+            5 => Self::Float16_4,
+            6 => Self::NormPacked64,
+            other => Self::Other(other),
+        }
+    }
+}
+
+/// Pack an f32x4 vector of [R, G, B, A] in [0.0, 1.0] into a single D3DCOLOR
+/// value in lane 3 of the output.
+pub fn pack_d3dcolor(v: Vec128) -> Vec128 {
+    let f = v.as_f32x4();
+    let to_byte = |x: f32| -> u32 {
+        let c = x.clamp(0.0, 1.0) * 255.0;
+        (c + 0.5) as u32 & 0xFF
+    };
+    // D3DCOLOR is A,R,G,B in that byte order inside a u32.
+    let word = (to_byte(f[3]) << 24) | (to_byte(f[0]) << 16) | (to_byte(f[1]) << 8) | to_byte(f[2]);
+    Vec128::from_u32x4(0, 0, 0, word)
+}
+
+/// Unpack a D3DCOLOR value (in lane 3 of the input) into an f32x4 [R, G, B, A].
+pub fn unpack_d3dcolor(v: Vec128) -> Vec128 {
+    let word = v.u32x4(3);
+    let a = ((word >> 24) & 0xFF) as f32 / 255.0;
+    let r = ((word >> 16) & 0xFF) as f32 / 255.0;
+    let g = ((word >> 8) & 0xFF) as f32 / 255.0;
+    let b = (word & 0xFF) as f32 / 255.0;
+    Vec128::from_f32x4(r, g, b, a)
+}
+
+// ───────────────────────────────────────────────────────────────────────
+// First-Pixels M3 — pack/unpack for the remaining canary pack types.
+//
+// Conventions shared across all helpers:
+//  * Input-to-`unpack_*` (packed data) lives in the *source* lane position
+//    canary's HIR assumes: canonically the 32-bit word is in lane 3 and
+//    the 64-bit value straddles lanes 2-3. We match that so the existing
+//    D3DCOLOR helpers' 3-lane convention is preserved across the whole
+//    pack-type family.
+//  * Output-from-`pack_*` sits in the same lane(s). The caller usually
+//    follows with a permute to move it elsewhere (the VX128_4 `pack`
+//    subfield controls that in `vpkd3d128`).
+//  * Range semantics match canary: normalized types use `max` = (1<<N-1)-1
+//    for signed, clamp before rounding.
+// ───────────────────────────────────────────────────────────────────────
+
+#[inline]
+fn norm_to_i16(x: f32) -> i16 {
+    let c = x.clamp(-1.0, 1.0) * 32767.0;
+    // Round half away from zero, matching canary's `vcfsx` semantics.
+    let r = if c >= 0.0 { (c + 0.5) as i32 } else { (c - 0.5) as i32 };
+    r.clamp(-32768, 32767) as i16
+}
+
+#[inline]
+fn i16_to_norm(s: i16) -> f32 {
+    (s as f32) / 32767.0
+}
+
+/// **NORMSHORT2** — 2 f32s in [-1, 1] → two 16-bit signed-normalized
+/// shorts packed as `(x << 16) | y` in lane 3 (high 32 bits of the word
+/// hold X; low 16 hold Y). Output lanes 0..=2 are zero-filled.
+pub fn pack_normshort2(v: Vec128) -> Vec128 {
+    let f = v.as_f32x4();
+    let x = norm_to_i16(f[0]) as u16 as u32;
+    let y = norm_to_i16(f[1]) as u16 as u32;
+    Vec128::from_u32x4(0, 0, 0, (x << 16) | y)
+}
+
+pub fn unpack_normshort2(v: Vec128) -> Vec128 {
+    let word = v.u32x4(3);
+    let x = i16_to_norm((word >> 16) as i16);
+    let y = i16_to_norm(word as i16);
+    Vec128::from_f32x4(x, y, 0.0, 1.0)
+}
+
+/// **NORMSHORT4** — 4 f32s in [-1, 1] → four 16-bit signed-normalized
+/// shorts packed across lanes 2-3 (big-endian dword order: X in the
+/// high word of lane 2, Y low of lane 2, Z high of lane 3, W low of lane
+/// 3).
+pub fn pack_normshort4(v: Vec128) -> Vec128 {
+    let f = v.as_f32x4();
+    let x = norm_to_i16(f[0]) as u16 as u32;
+    let y = norm_to_i16(f[1]) as u16 as u32;
+    let z = norm_to_i16(f[2]) as u16 as u32;
+    let w = norm_to_i16(f[3]) as u16 as u32;
+    Vec128::from_u32x4(0, 0, (x << 16) | y, (z << 16) | w)
+}
+
+pub fn unpack_normshort4(v: Vec128) -> Vec128 {
+    let hi = v.u32x4(2);
+    let lo = v.u32x4(3);
+    let x = i16_to_norm((hi >> 16) as i16);
+    let y = i16_to_norm(hi as i16);
+    let z = i16_to_norm((lo >> 16) as i16);
+    let w = i16_to_norm(lo as i16);
+    Vec128::from_f32x4(x, y, z, w)
+}
+
+/// **NORMPACKED32** — UINT_2101010 layout, 4 f32s in [-1, 1] packed into
+/// 32 bits in lane 3. Per canary's comment `2_10_10_10 w_z_y_x`: the
+/// high 2 bits hold W (signed 2-bit, -2..=1), then Z/Y/X each use 10
+/// signed-normalized bits.
+pub fn pack_normpacked32(v: Vec128) -> Vec128 {
+    let f = v.as_f32x4();
+    #[inline]
+    fn n10(x: f32) -> u32 {
+        let c = x.clamp(-1.0, 1.0) * 511.0;
+        let r = if c >= 0.0 { (c + 0.5) as i32 } else { (c - 0.5) as i32 };
+        (r.clamp(-512, 511) as i32 as u32) & 0x3FF
+    }
+    #[inline]
+    fn n2(x: f32) -> u32 {
+        let c = x.clamp(-1.0, 1.0) * 1.0;
+        let r = if c >= 0.0 { (c + 0.5) as i32 } else { (c - 0.5) as i32 };
+        (r.clamp(-2, 1) as i32 as u32) & 0x3
+    }
+    let x = n10(f[0]);
+    let y = n10(f[1]);
+    let z = n10(f[2]);
+    let w = n2(f[3]);
+    let word = (w << 30) | (z << 20) | (y << 10) | x;
+    Vec128::from_u32x4(0, 0, 0, word)
+}
+
+pub fn unpack_normpacked32(v: Vec128) -> Vec128 {
+    let word = v.u32x4(3);
+    #[inline]
+    fn u10_to_norm(bits: u32) -> f32 {
+        // Sign-extend the 10-bit field then normalize.
+        let s = ((bits & 0x3FF) as i32) << 22 >> 22;
+        (s as f32) / 511.0
+    }
+    #[inline]
+    fn u2_to_norm(bits: u32) -> f32 {
+        let s = ((bits & 0x3) as i32) << 30 >> 30;
+        (s as f32).clamp(-1.0, 1.0)
+    }
+    let x = u10_to_norm(word);
+    let y = u10_to_norm(word >> 10);
+    let z = u10_to_norm(word >> 20);
+    let w = u2_to_norm(word >> 30);
+    Vec128::from_f32x4(x, y, z, w)
+}
+
+/// **NORMPACKED64** — ULONG_4202020, 4 f32s in [-1, 1] packed into 64
+/// bits across lanes 2-3. Per canary's comment `4_20_20_20 w_z_y_x`:
+/// the high 4 bits of the dword hold W (signed 4-bit); the remaining 60
+/// bits hold 3× 20-bit signed-normalized Z/Y/X. Rare outside very few
+/// titles (canary notes 54540829).
+pub fn pack_normpacked64(v: Vec128) -> Vec128 {
+    let f = v.as_f32x4();
+    #[inline]
+    fn n20(x: f32) -> u64 {
+        let c = x.clamp(-1.0, 1.0) * 524287.0; // 2^19 - 1
+        let r = if c >= 0.0 { (c + 0.5) as i64 } else { (c - 0.5) as i64 };
+        (r.clamp(-524288, 524287) as i64 as u64) & 0xF_FFFF
+    }
+    #[inline]
+    fn n4(x: f32) -> u64 {
+        let c = x.clamp(-1.0, 1.0) * 7.0;
+        let r = if c >= 0.0 { (c + 0.5) as i64 } else { (c - 0.5) as i64 };
+        (r.clamp(-8, 7) as i64 as u64) & 0xF
+    }
+    let x = n20(f[0]);
+    let y = n20(f[1]);
+    let z = n20(f[2]);
+    let w = n4(f[3]);
+    let dw: u64 = (w << 60) | (z << 40) | (y << 20) | x;
+    Vec128::from_u32x4(0, 0, (dw >> 32) as u32, dw as u32)
+}
+
+pub fn unpack_normpacked64(v: Vec128) -> Vec128 {
+    let hi = v.u32x4(2) as u64;
+    let lo = v.u32x4(3) as u64;
+    let dw = (hi << 32) | lo;
+    #[inline]
+    fn u20_to_norm(bits: u64) -> f32 {
+        let s = ((bits & 0xF_FFFF) as i64) << 44 >> 44;
+        (s as f32) / 524287.0
+    }
+    #[inline]
+    fn u4_to_norm(bits: u64) -> f32 {
+        let s = ((bits & 0xF) as i64) << 60 >> 60;
+        (s as f32) / 7.0
+    }
+    let x = u20_to_norm(dw);
+    let y = u20_to_norm(dw >> 20);
+    let z = u20_to_norm(dw >> 40);
+    let w = u4_to_norm(dw >> 60);
+    Vec128::from_f32x4(x, y, z, w)
+}
+
+/// IEEE 754 half-precision float pack/unpack — used by both FLOAT16_2
+/// and FLOAT16_4. No FMA quirks involved; we go via `f32::to_bits` and
+/// manual bit-twiddling (the stable-Rust `f16` type isn't available
+/// yet).
+#[inline]
+fn f32_to_f16_bits(f: f32) -> u16 {
+    let bits = f.to_bits();
+    let sign = ((bits >> 31) & 0x1) as u16;
+    let exp = ((bits >> 23) & 0xFF) as i32;
+    let mant = bits & 0x7FFFFF;
+    // Handle the easy cases first.
+    if exp == 0xFF {
+        // NaN or infinity.
+        let half_exp = 0x1F;
+        let half_mant = if mant != 0 { 0x200 } else { 0 }; // quiet NaN / zero mantissa for Inf
+        return (sign << 15) | (half_exp << 10) | half_mant;
+    }
+    let unbiased_exp = exp - 127;
+    if unbiased_exp >= 16 {
+        // Overflow → infinity.
+        return (sign << 15) | (0x1F << 10);
+    }
+    if unbiased_exp <= -15 {
+        // Denormal or zero. Compute the shift and subnormal mantissa;
+        // anything too small flushes to signed zero.
+        if unbiased_exp < -24 {
+            return sign << 15;
+        }
+        let shift = -14 - unbiased_exp as i32; // amount to shift the implicit-1'd mantissa
+        let full_mant = 0x800000 | mant; // 24 bits with implicit leading 1
+        let half_mant = (full_mant >> (shift + 13)) as u16;
+        return (sign << 15) | half_mant;
+    }
+    let half_exp = ((unbiased_exp + 15) as u16) & 0x1F;
+    let half_mant = (mant >> 13) as u16;
+    (sign << 15) | (half_exp << 10) | half_mant
+}
+
+#[inline]
+fn f16_bits_to_f32(h: u16) -> f32 {
+    let sign = ((h >> 15) & 0x1) as u32;
+    let exp = ((h >> 10) & 0x1F) as i32;
+    let mant = (h & 0x3FF) as u32;
+    let bits = if exp == 0x1F {
+        // NaN or infinity.
+        let f32_exp = 0xFFu32;
+        let f32_mant = if mant != 0 { 0x400000 } else { 0 };
+        (sign << 31) | (f32_exp << 23) | f32_mant
+    } else if exp == 0 && mant == 0 {
+        // Signed zero.
+        sign << 31
+    } else if exp == 0 {
+        // Subnormal — renormalize.
+        let mut e = -14i32;
+        let mut m = mant;
+        while (m & 0x400) == 0 {
+            m <<= 1;
+            e -= 1;
+        }
+        let f32_exp = ((e + 127) as u32) & 0xFF;
+        let f32_mant = (m & 0x3FF) << 13;
+        (sign << 31) | (f32_exp << 23) | f32_mant
+    } else {
+        let f32_exp = ((exp - 15 + 127) as u32) & 0xFF;
+        let f32_mant = mant << 13;
+        (sign << 31) | (f32_exp << 23) | f32_mant
+    };
+    f32::from_bits(bits)
+}
+
+/// **FLOAT16_2** — two 32-bit floats → two half-floats packed into one
+/// 32-bit word (X in high 16 bits of lane 3, Y in low 16).
+pub fn pack_float16_2(v: Vec128) -> Vec128 {
+    let f = v.as_f32x4();
+    let x = f32_to_f16_bits(f[0]) as u32;
+    let y = f32_to_f16_bits(f[1]) as u32;
+    Vec128::from_u32x4(0, 0, 0, (x << 16) | y)
+}
+
+pub fn unpack_float16_2(v: Vec128) -> Vec128 {
+    let word = v.u32x4(3);
+    let x = f16_bits_to_f32((word >> 16) as u16);
+    let y = f16_bits_to_f32(word as u16);
+    Vec128::from_f32x4(x, y, 0.0, 1.0)
+}
+
+/// **FLOAT16_4** — four 32-bit floats → four half-floats packed across
+/// 64 bits (lanes 2-3).
+pub fn pack_float16_4(v: Vec128) -> Vec128 {
+    let f = v.as_f32x4();
+    let x = f32_to_f16_bits(f[0]) as u32;
+    let y = f32_to_f16_bits(f[1]) as u32;
+    let z = f32_to_f16_bits(f[2]) as u32;
+    let w = f32_to_f16_bits(f[3]) as u32;
+    Vec128::from_u32x4(0, 0, (x << 16) | y, (z << 16) | w)
+}
+
+pub fn unpack_float16_4(v: Vec128) -> Vec128 {
+    let hi = v.u32x4(2);
+    let lo = v.u32x4(3);
+    let x = f16_bits_to_f32((hi >> 16) as u16);
+    let y = f16_bits_to_f32(hi as u16);
+    let z = f16_bits_to_f32((lo >> 16) as u16);
+    let w = f16_bits_to_f32(lo as u16);
+    Vec128::from_f32x4(x, y, z, w)
+}
+
+// ─── CR6 helpers used by integer compares ─────────────────────────────────
+// vcmp*. (record-form) updates CR6 in a compressed form:
+//   CR6 = {all-true, 0, all-false, 0}
+// where each bit reflects the per-lane mask across the whole register.
+
+#[inline] pub fn cr6_flags_from_mask(mask: Vec128) -> (bool, bool) {
+    let b = mask.as_bytes();
+    let mut any_set = false;
+    let mut any_clear = false;
+    for &byte in b.iter() {
+        if byte != 0 { any_set = true; }
+        if byte != 0xFF { any_clear = true; }
+    }
+    let all_true = !any_clear;
+    let all_false = !any_set;
+    (all_true, all_false)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    use std::cell::Cell;
+
+    struct TestMem { data: Box<[Cell<u8>]> }
+    impl TestMem {
+        fn new(size: usize) -> Self {
+            Self { data: (0..size).map(|_| Cell::new(0)).collect() }
+        }
+    }
+    impl MemoryAccess for TestMem {
+        fn read_u8(&self, a: u32) -> u8 { self.data[a as usize].get() }
+        fn read_u16(&self, a: u32) -> u16 {
+            u16::from_be_bytes([self.data[a as usize].get(), self.data[a as usize + 1].get()])
+        }
+        fn read_u32(&self, a: u32) -> u32 {
+            let a = a as usize;
+            u32::from_be_bytes([
+                self.data[a].get(), self.data[a+1].get(),
+                self.data[a+2].get(), self.data[a+3].get(),
+            ])
+        }
+        fn read_u64(&self, a: u32) -> u64 {
+            let a = a as usize;
+            u64::from_be_bytes([
+                self.data[a].get(), self.data[a+1].get(),
+                self.data[a+2].get(), self.data[a+3].get(),
+                self.data[a+4].get(), self.data[a+5].get(),
+                self.data[a+6].get(), self.data[a+7].get(),
+            ])
+        }
+        fn write_u8(&self, a: u32, v: u8) { self.data[a as usize].set(v); }
+        fn write_u16(&self, a: u32, v: u16) {
+            let b = v.to_be_bytes();
+            self.data[a as usize].set(b[0]);
+            self.data[a as usize + 1].set(b[1]);
+        }
+        fn write_u32(&self, a: u32, v: u32) {
+            let b = v.to_be_bytes(); let a = a as usize;
+            for (i, byte) in b.iter().enumerate() { self.data[a+i].set(*byte); }
+        }
+        fn write_u64(&self, a: u32, v: u64) {
+            let b = v.to_be_bytes(); let a = a as usize;
+            for (i, byte) in b.iter().enumerate() { self.data[a+i].set(*byte); }
+        }
+        fn translate(&self, _a: u32) -> Option<*const u8> { None }
+        fn translate_mut(&self, _a: u32) -> Option<*mut u8> { None }
+    }
+
+    #[test]
+    fn lvlx_lvrx_round_trip() {
+        let m = TestMem::new(0x40);
+        for i in 0..0x30 { m.data[i].set((i as u8).wrapping_add(0x10)); }
+        // Unaligned load from 0x13 should combine lvlx(0x13) | lvrx(0x23).
+        let lo = load_vector_left(&m, 0x13);
+        let hi = load_vector_right(&m, 0x23);
+        let mut combined = [0u8; 16];
+        let lob = lo.as_bytes();
+        let hib = hi.as_bytes();
+        for i in 0..16 { combined[i] = lob[i] | hib[i]; }
+        for i in 0..16 {
+            assert_eq!(combined[i], m.data[0x13 + i].get(), "lane {}", i);
+        }
+    }
+
+    #[test]
+    fn lvlx_aligned_is_full_load() {
+        let m = TestMem::new(0x20);
+        for i in 0..0x20 { m.data[i].set(i as u8); }
+        let v = load_vector_left(&m, 0x10);
+        let b = v.as_bytes();
+        for i in 0..16 { assert_eq!(b[i], 0x10 + i as u8); }
+    }
+
+    #[test]
+    fn lvrx_aligned_is_zero() {
+        let m = TestMem::new(0x20);
+        let v = load_vector_right(&m, 0x10);
+        assert_eq!(v.as_bytes(), [0u8; 16]);
+    }
+
+    #[test]
+    fn sat_add_signed_overflow() {
+        assert_eq!(sat_add_i8(120, 10), (127, true));
+        assert_eq!(sat_add_i8(-120, -10), (-128, true));
+        assert_eq!(sat_add_i8(1, 2), (3, false));
+    }
+
+    #[test]
+    fn sat_sub_unsigned_underflow() {
+        assert_eq!(sat_sub_u8(5, 10), (0, true));
+        assert_eq!(sat_sub_u8(10, 5), (5, false));
+    }
+
+    #[test]
+    fn pack_unpack_pixel_555() {
+        let encoded = pack_pixel_555(0x80_F8_F8_F8);
+        assert_eq!(encoded & 0x8000, 0x8000);
+        let w = unpack_pixel_555(0x8000 | (0x1F << 10) | (0x1F << 5) | 0x1F);
+        assert_eq!(w & 0xFF000000, 0xFF000000);
+    }
+
+    // ─── First-Pixels M3 pack/unpack roundtrip tests ───
+
+    /// Quantization error tolerance for N-bit signed normalized values.
+    /// `1.0 / ((1 << (bits - 1)) - 1)` is the step size.
+    fn tol_normalized(bits: u32) -> f32 {
+        1.0 / ((1u32 << (bits - 1)) - 1) as f32
+    }
+
+    #[test]
+    fn normshort2_roundtrip() {
+        let v = Vec128::from_f32x4(0.5, -0.75, 0.0, 0.0);
+        let packed = pack_normshort2(v);
+        let back = unpack_normshort2(packed).as_f32x4();
+        let tol = tol_normalized(16);
+        assert!((back[0] - 0.5).abs() < tol,  "x got {}", back[0]);
+        assert!((back[1] - -0.75).abs() < tol, "y got {}", back[1]);
+        assert_eq!(back[2], 0.0);
+        assert_eq!(back[3], 1.0);
+    }
+
+    #[test]
+    fn normshort4_roundtrip_extremes() {
+        let v = Vec128::from_f32x4(1.0, -1.0, 0.0, 0.25);
+        let packed = pack_normshort4(v);
+        let back = unpack_normshort4(packed).as_f32x4();
+        let tol = tol_normalized(16);
+        assert!((back[0] - 1.0).abs() < tol);
+        assert!((back[1] - -1.0).abs() < tol);
+        assert!((back[2] - 0.0).abs() < tol);
+        assert!((back[3] - 0.25).abs() < tol);
+    }
+
+    #[test]
+    fn normpacked32_roundtrip() {
+        let v = Vec128::from_f32x4(0.5, -0.5, 0.9, -1.0);
+        let packed = pack_normpacked32(v);
+        let back = unpack_normpacked32(packed).as_f32x4();
+        let tol10 = tol_normalized(10);
+        let tol2 = tol_normalized(2);
+        assert!((back[0] - 0.5).abs() < tol10,  "x got {}", back[0]);
+        assert!((back[1] - -0.5).abs() < tol10, "y got {}", back[1]);
+        assert!((back[2] - 0.9).abs() < tol10,  "z got {}", back[2]);
+        // 2-bit signed quantizes to {-1, -0.5-ish, 0, 0.5-ish}; tolerance
+        // is the full step.
+        assert!((back[3] - -1.0).abs() < 2.0 * tol2, "w got {}", back[3]);
+    }
+
+    #[test]
+    fn normpacked64_roundtrip() {
+        let v = Vec128::from_f32x4(0.5, -0.25, 0.75, 0.5);
+        let packed = pack_normpacked64(v);
+        let back = unpack_normpacked64(packed).as_f32x4();
+        let tol20 = tol_normalized(20);
+        let tol4 = tol_normalized(4);
+        assert!((back[0] - 0.5).abs() < tol20,   "x got {}", back[0]);
+        assert!((back[1] - -0.25).abs() < tol20, "y got {}", back[1]);
+        assert!((back[2] - 0.75).abs() < tol20,  "z got {}", back[2]);
+        assert!((back[3] - 0.5).abs() < tol4,    "w got {}", back[3]);
+    }
+
+    #[test]
+    fn float16_2_roundtrip_normals() {
+        // Half has ~3 decimal digits of precision. Pick values that
+        // survive conversion cleanly: powers of 2 + simple fractions.
+        let v = Vec128::from_f32x4(1.0, -2.5, 0.0, 0.0);
+        let packed = pack_float16_2(v);
+        let back = unpack_float16_2(packed).as_f32x4();
+        assert_eq!(back[0], 1.0);
+        assert_eq!(back[1], -2.5);
+        assert_eq!(back[2], 0.0);
+        assert_eq!(back[3], 1.0);
+    }
+
+    #[test]
+    fn float16_4_roundtrip_normals() {
+        let v = Vec128::from_f32x4(0.5, -3.0, 16.0, -0.125);
+        let packed = pack_float16_4(v);
+        let back = unpack_float16_4(packed).as_f32x4();
+        assert_eq!(back[0], 0.5);
+        assert_eq!(back[1], -3.0);
+        assert_eq!(back[2], 16.0);
+        assert_eq!(back[3], -0.125);
+    }
+
+    #[test]
+    fn float16_handles_zero_and_infinity() {
+        // Zero should survive.
+        assert_eq!(f16_bits_to_f32(f32_to_f16_bits(0.0)), 0.0);
+        assert_eq!(f16_bits_to_f32(f32_to_f16_bits(-0.0)).to_bits(), (-0.0f32).to_bits());
+        // +inf.
+        let inf_back = f16_bits_to_f32(f32_to_f16_bits(f32::INFINITY));
+        assert!(inf_back.is_infinite() && inf_back > 0.0);
+        // Overflow → +inf.
+        let overflow_back = f16_bits_to_f32(f32_to_f16_bits(65536.0));
+        assert!(overflow_back.is_infinite());
+    }
+
+    #[test]
+    fn pack_type_enum_maps_canary_values() {
+        use D3dPackType::*;
+        assert!(matches!(D3dPackType::from_immediate(0), D3dColor));
+        assert!(matches!(D3dPackType::from_immediate(1), NormShort2));
+        assert!(matches!(D3dPackType::from_immediate(2), NormPacked32));
+        assert!(matches!(D3dPackType::from_immediate(3), Float16_2));
+        assert!(matches!(D3dPackType::from_immediate(4), NormShort4));
+        assert!(matches!(D3dPackType::from_immediate(5), Float16_4));
+        assert!(matches!(D3dPackType::from_immediate(6), NormPacked64));
+        assert!(matches!(D3dPackType::from_immediate(7), Other(7)));
+    }
+}