From c36cca14f9317374795e3ccff9ca3042888acbef Mon Sep 17 00:00:00 2001
From: MechaCat02 <fabian@diekaulbachs.de>
Date: Fri, 1 May 2026 16:27:43 +0200
Subject: [PATCH] xenia-cpu: VMX128, FPSCR, decoder split, scheduler,
 decode/block caches

Split the monolithic interpreter into cohesive modules: dedicated
decoder (decoder.rs) producing 8-byte DecodedInstr; opcode tables
(opcode.rs); explicit traps (trap.rs); FPSCR helpers (fpscr.rs);
overflow/carry helpers (overflow.rs); a 4 KiB-page-versioned decode
cache and basic-block cache (block_cache.rs); and a full VMX/VMX128
implementation (vmx.rs) covering AltiVec + Xenon's 128-bit extensions.

Add the parallel-execution substrate behind --parallel: a 7-party
phaser (phaser.rs) for round-based barrier sync, ReservationTable
(reservation.rs) for guest LL/SC, and the per-HW-thread scheduler
core (scheduler.rs) that owns ThreadRefs, runqueues, and pending IRQs.

Disassembler is now the single source of truth: disasm.rs gains the
full base + extended + VMX128 mnemonic set, with golden JSON fixtures
and a disasm_goldens test suite. Add a criterion-style interpreter
bench. context.rs grows the per-thread state the new modules need
(reservation slot, FPSCR, vector regs).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 crates/xenia-cpu/Cargo.toml                   |    8 +
 crates/xenia-cpu/benches/interpreter.rs       |  194 +
 crates/xenia-cpu/src/block_cache.rs           |  423 ++
 crates/xenia-cpu/src/context.rs               |   99 +-
 crates/xenia-cpu/src/decoder.rs               |  288 ++
 crates/xenia-cpu/src/disasm.rs                | 2071 ++++++++--
 crates/xenia-cpu/src/fpscr.rs                 |  384 ++
 crates/xenia-cpu/src/interpreter.rs           | 3441 ++++++++++++++++-
 crates/xenia-cpu/src/lib.rs                   |   16 +
 crates/xenia-cpu/src/opcode.rs                |   84 +
 crates/xenia-cpu/src/overflow.rs              |  173 +
 crates/xenia-cpu/src/phaser.rs                |  345 ++
 crates/xenia-cpu/src/reservation.rs           |  424 ++
 crates/xenia-cpu/src/scheduler.rs             | 1919 +++++++++
 crates/xenia-cpu/src/trap.rs                  |   95 +
 crates/xenia-cpu/src/vmx.rs                   |  918 +++++
 crates/xenia-cpu/tests/disasm_goldens.rs      |  531 +++
 .../tests/golden/base_mnemonics.json          |  571 +++
 .../tests/golden/extended_mnemonics.json      |  621 +++
 .../tests/golden/vmx128_registers.json        |  137 +
 20 files changed, 12284 insertions(+), 458 deletions(-)
 create mode 100644 crates/xenia-cpu/benches/interpreter.rs
 create mode 100644 crates/xenia-cpu/src/block_cache.rs
 create mode 100644 crates/xenia-cpu/src/fpscr.rs
 create mode 100644 crates/xenia-cpu/src/overflow.rs
 create mode 100644 crates/xenia-cpu/src/phaser.rs
 create mode 100644 crates/xenia-cpu/src/reservation.rs
 create mode 100644 crates/xenia-cpu/src/scheduler.rs
 create mode 100644 crates/xenia-cpu/src/trap.rs
 create mode 100644 crates/xenia-cpu/src/vmx.rs
 create mode 100644 crates/xenia-cpu/tests/disasm_goldens.rs
 create mode 100644 crates/xenia-cpu/tests/golden/base_mnemonics.json
 create mode 100644 crates/xenia-cpu/tests/golden/extended_mnemonics.json
 create mode 100644 crates/xenia-cpu/tests/golden/vmx128_registers.json

diff --git a/crates/xenia-cpu/Cargo.toml b/crates/xenia-cpu/Cargo.toml
index 3ca488b..63b9166 100644
--- a/crates/xenia-cpu/Cargo.toml
+++ b/crates/xenia-cpu/Cargo.toml
@@ -10,3 +10,11 @@ xenia-memory = { workspace = true }
 tracing = { workspace = true }
 bitflags = { workspace = true }
 thiserror = { workspace = true }
+
+[dev-dependencies]
+serde = { workspace = true }
+serde_json = { workspace = true }
+
+[[bench]]
+name = "interpreter"
+harness = false
diff --git a/crates/xenia-cpu/benches/interpreter.rs b/crates/xenia-cpu/benches/interpreter.rs
new file mode 100644
index 0000000..7efd251
--- /dev/null
+++ b/crates/xenia-cpu/benches/interpreter.rs
@@ -0,0 +1,194 @@
+//! Interpreter throughput micro-benchmarks.
+//!
+//! Custom `harness = false` main — no extra dev-deps. Run via
+//! `cargo bench -p xenia-cpu` (or `cargo run --release --bench interpreter`).
+//!
+//! Three workloads, each measuring `step_cached` throughput in MIPS:
+//!
+//!   - `tight_alu_loop`  — pure dispatch + ALU + decode-cache hit.
+//!   - `loadstore_loop`  — alternating `lwz`/`stw` against main RAM. Stresses
+//!                         every load/store path and `find_mmio` dispatch.
+//!   - `mmio_storm`      — same shape as `loadstore_loop` but the address is
+//!                         in a registered MMIO aperture. Sanity-checks that
+//!                         MMIO writes still dispatch correctly.
+//!
+//! These are not statistically rigorous — no warmup, no variance — they're
+//! just enough to detect 2x-class wins or regressions on the perf-track
+//! changes (MMIO fast-reject, threaded dispatch, block cache). Numbers go
+//! into commit messages; there is no automated baseline file.
+
+use std::sync::atomic::{AtomicU32, AtomicU64, Ordering};
+use std::sync::Arc;
+use std::time::Instant;
+
+use xenia_cpu::context::PpcContext;
+use xenia_cpu::decoder::DecodeCache;
+use xenia_cpu::interpreter::{step_cached, StepResult};
+use xenia_memory::{GuestMemory, MemoryAccess, MmioRegion};
+use xenia_memory::page_table::MemoryProtect;
+
+// PPC instruction encoders — minimal subset needed by the benches.
+
+#[inline]
+fn enc_addi(rd: u32, ra: u32, simm: i16) -> u32 {
+    (14 << 26) | (rd << 21) | (ra << 16) | (simm as u16 as u32)
+}
+
+#[inline]
+fn enc_lwz(rd: u32, ra: u32, d: i16) -> u32 {
+    (32 << 26) | (rd << 21) | (ra << 16) | (d as u16 as u32)
+}
+
+#[inline]
+fn enc_stw(rs: u32, ra: u32, d: i16) -> u32 {
+    (36 << 26) | (rs << 21) | (ra << 16) | (d as u16 as u32)
+}
+
+/// Set up a `GuestMemory` with one writable region for code+data.
+fn make_mem(code_base: u32, code_size: u32) -> GuestMemory {
+    let mut mem = GuestMemory::new().expect("reserve 4GB");
+    mem.alloc(code_base, code_size, MemoryProtect::READ | MemoryProtect::WRITE)
+        .expect("alloc bench region");
+    mem
+}
+
+/// Write a sequence of raw PPC instructions starting at `base`.
+fn write_program(mem: &GuestMemory, base: u32, instrs: &[u32]) {
+    for (i, &raw) in instrs.iter().enumerate() {
+        mem.write_u32(base + (i as u32 * 4), raw);
+    }
+}
+
+/// Run `total_instrs` interpreter steps over a program of length `n`,
+/// wrapping PC back to `base` whenever it falls off the end. Returns the
+/// elapsed wall time.
+fn run_loop(
+    ctx: &mut PpcContext,
+    mem: &GuestMemory,
+    cache: &mut DecodeCache,
+    base: u32,
+    n: u32,
+    total_instrs: u64,
+) -> std::time::Duration {
+    let end = base + n * 4;
+    ctx.pc = base;
+    let t0 = Instant::now();
+    for _ in 0..total_instrs {
+        let pv = mem.page_version(ctx.pc);
+        let r = step_cached(ctx, mem, cache, pv);
+        debug_assert!(matches!(r, StepResult::Continue));
+        if ctx.pc >= end {
+            ctx.pc = base;
+        }
+    }
+    t0.elapsed()
+}
+
+fn report(label: &str, total_instrs: u64, elapsed: std::time::Duration) {
+    let secs = elapsed.as_secs_f64();
+    let mips = (total_instrs as f64) / secs / 1.0e6;
+    println!(
+        "{:<24} {:>12} instrs in {:>7.3}s = {:>7.2} MIPS",
+        label, total_instrs, secs, mips
+    );
+}
+
+fn bench_tight_alu_loop() {
+    const BASE: u32 = 0x1000;
+    const N: u32 = 256;
+    const TOTAL: u64 = 50_000_000;
+
+    let mut mem = make_mem(BASE, 0x1000);
+    // 256 × `addi r3, r3, 1` — pure register-register, no memory touch
+    // beyond instruction fetch.
+    let prog: Vec<u32> = (0..N).map(|_| enc_addi(3, 3, 1)).collect();
+    write_program(&mut mem, BASE, &prog);
+
+    let mut ctx = PpcContext::new();
+    let mut cache = DecodeCache::new();
+
+    let elapsed = run_loop(&mut ctx, &mut mem, &mut cache, BASE, N, TOTAL);
+    report("tight_alu_loop", TOTAL, elapsed);
+}
+
+fn bench_loadstore_loop() {
+    const CODE_BASE: u32 = 0x1000;
+    const DATA_BASE: u32 = 0x2000;
+    const N: u32 = 256;
+    const TOTAL: u64 = 30_000_000;
+
+    let mut mem = make_mem(CODE_BASE, 0x2000);
+    // 128 pairs of `stw r3, 0(r4); lwz r5, 0(r4)` — exercises every
+    // load/store path through `read_u32`/`write_u32` (incl. `find_mmio`).
+    let mut prog = Vec::with_capacity(N as usize);
+    for _ in 0..(N / 2) {
+        prog.push(enc_stw(3, 4, 0));
+        prog.push(enc_lwz(5, 4, 0));
+    }
+    write_program(&mut mem, CODE_BASE, &prog);
+
+    let mut ctx = PpcContext::new();
+    ctx.gpr[3] = 0xDEAD_BEEF;
+    ctx.gpr[4] = DATA_BASE as u64;
+    let mut cache = DecodeCache::new();
+
+    let elapsed = run_loop(&mut ctx, &mut mem, &mut cache, CODE_BASE, N, TOTAL);
+    report("loadstore_loop", TOTAL, elapsed);
+}
+
+fn bench_mmio_storm() {
+    const CODE_BASE: u32 = 0x1000;
+    const MMIO_BASE: u32 = 0xEA00_0000;
+    const N: u32 = 64;
+    // MMIO is slower per access — keep total smaller so the bench stays
+    // under a few seconds.
+    const TOTAL: u64 = 2_000_000;
+
+    let mut mem = make_mem(CODE_BASE, 0x1000);
+
+    let writes = Arc::new(AtomicU64::new(0));
+    let reads = Arc::new(AtomicU32::new(0));
+    let writes_clone = writes.clone();
+    let reads_clone = reads.clone();
+    mem.add_mmio_region(MmioRegion {
+        base_address: MMIO_BASE,
+        mask: 0xFFFF_0000,
+        size: 0x0001_0000,
+        read_callback: Box::new(move |_a| {
+            reads_clone.fetch_add(1, Ordering::Relaxed);
+            0
+        }),
+        write_callback: Box::new(move |_a, _v| {
+            writes_clone.fetch_add(1, Ordering::Relaxed);
+        }),
+    });
+
+    let mut prog = Vec::with_capacity(N as usize);
+    for _ in 0..(N / 2) {
+        prog.push(enc_stw(3, 4, 0));
+        prog.push(enc_lwz(5, 4, 0));
+    }
+    write_program(&mut mem, CODE_BASE, &prog);
+
+    let mut ctx = PpcContext::new();
+    ctx.gpr[3] = 0x1234_5678;
+    ctx.gpr[4] = MMIO_BASE as u64;
+    let mut cache = DecodeCache::new();
+
+    let elapsed = run_loop(&mut ctx, &mut mem, &mut cache, CODE_BASE, N, TOTAL);
+    report("mmio_storm", TOTAL, elapsed);
+
+    // Sanity assertions — silently catch a refactor that breaks MMIO dispatch.
+    let w = writes.load(Ordering::Relaxed);
+    let r = reads.load(Ordering::Relaxed);
+    assert_eq!(w, TOTAL / 2, "expected MMIO writes to be dispatched");
+    assert_eq!(r as u64, TOTAL / 2, "expected MMIO reads to be dispatched");
+}
+
+fn main() {
+    println!("xenia-cpu interpreter bench");
+    println!("  build: {}", if cfg!(debug_assertions) { "debug" } else { "release" });
+    bench_tight_alu_loop();
+    bench_loadstore_loop();
+    bench_mmio_storm();
+}
diff --git a/crates/xenia-cpu/src/block_cache.rs b/crates/xenia-cpu/src/block_cache.rs
new file mode 100644
index 0000000..c4dd5a5
--- /dev/null
+++ b/crates/xenia-cpu/src/block_cache.rs
@@ -0,0 +1,423 @@
+//! Tier-4 perf — basic-block cache for the PPC interpreter.
+//!
+//! `DecodeCache` (in [`crate::decoder`]) caches one decoded instruction
+//! per slot, indexed by PC. The hot loop still pays the per-instruction
+//! cost of fetching the raw word, hashing the PC into a slot, and
+//! comparing tags. For straight-line code — common in the asset/inflate
+//! loops where Sylpheed boot is currently CPU-bound — the savings of
+//! batching N decoded instructions per slot lookup are linear in block
+//! length.
+//!
+//! ## Shape
+//!
+//! A `DecodedBlock` is a contiguous run of decoded instructions starting
+//! at `start_pc`, ending at the first *block terminator* (any branch,
+//! `sc`, trap, or `Invalid`) or at one of two safety limits:
+//!
+//!  - [`MAX_BLOCK_INSTRS`] caps memory growth and re-build cost.
+//!  - 4 KiB page boundary stop. A block is fully contained inside a
+//!    single 4 KiB guest page; that means `mem.page_version(start_pc)`
+//!    is sufficient to detect any code-page rewrite that should
+//!    invalidate the block. Without this rule the cache would have to
+//!    walk every spanned page on every hit, which would erase the win.
+//!
+//! ## Invalidation
+//!
+//! Each block stamps the page version at build time. On lookup, if
+//! `mem.page_version(start_pc)` differs from `block.page_version`, the
+//! slot is rebuilt. Same mechanism `DecodeCache` uses, just at
+//! block granularity.
+//!
+//! ## Debugger semantics
+//!
+//! Block dispatch is **opt-in** by the caller. The hot loop in
+//! `xenia-app/src/main.rs` selects the per-instruction path whenever
+//! `Debugger::wants_hooks()` is true or any `--trace-*` flag is set.
+//! That's how single-step, breakpoints, in-memory trace, instruction
+//! trace, and branch trace continue to observe every PC: the block
+//! cache simply never runs in those modes.
+
+use crate::decoder::{decode, DecodedInstr};
+use xenia_memory::MemoryAccess;
+
+/// Direct-mapped block-cache slot count. Same shape as
+/// [`crate::decoder::DECODE_CACHE_SIZE`] — 64 K slots indexed by the
+/// low 16 bits of `start_pc >> 2`. With Sylpheed-class workloads the
+/// slot collision rate is negligible.
+const BLOCK_CACHE_SIZE: usize = 1 << 16;
+const BLOCK_CACHE_MASK: u32 = (BLOCK_CACHE_SIZE - 1) as u32;
+
+/// Hard cap on instructions per block. Keeps the worst-case memory
+/// footprint bounded and limits the rebuild cost when a code page
+/// gets bumped. 32 instructions is generous for most basic blocks
+/// (real-world average across Sylpheed boot is ~6 between branches).
+pub const MAX_BLOCK_INSTRS: usize = 32;
+
+/// Guest page size — duplicated here to avoid pulling
+/// `xenia-memory::heap` internals into `xenia-cpu`. Must stay in sync
+/// with the memory crate. Both refer to the architectural PowerPC 4 KiB
+/// page granule, so this constant is locked.
+const GUEST_PAGE_SIZE: u32 = 4096;
+const GUEST_PAGE_MASK: u32 = !(GUEST_PAGE_SIZE - 1);
+
+/// One cached basic block. Owned by [`BlockCache`]; a `&DecodedBlock`
+/// is handed to the interpreter via [`BlockCache::lookup_or_build`] and
+/// stays valid until the next `lookup_or_build` on the same slot.
+#[derive(Debug)]
+pub struct DecodedBlock {
+    /// Guest PC at which this block starts. Used as the slot tag.
+    pub start_pc: u32,
+    /// Guest PC immediately after the last instruction in `instrs`.
+    /// Equal to `instrs.last().addr + 4` whether or not the block
+    /// ended on a terminator. Useful for tracing / disassembly.
+    pub end_pc: u32,
+    /// `mem.page_version(start_pc)` at build time. Mismatch on lookup
+    /// invalidates the block. Single value because every block is
+    /// page-bounded by construction.
+    pub page_version: u64,
+    /// Decoded instructions in execution order. Always non-empty after
+    /// a successful build (`MAX_BLOCK_INSTRS >= 1` and the build walk
+    /// pushes the first decoded word unconditionally).
+    pub instrs: Vec<DecodedInstr>,
+}
+
+/// Per-slot status from a `lookup_or_build` probe. Internal only.
+enum CacheStatus {
+    /// Block at this slot matches `pc` and the page version at build
+    /// time matches `mem.page_version(pc)` — return as-is.
+    Hit,
+    /// Block at this slot matched `pc` but the page version has
+    /// advanced — rebuild and bump `invalidations`.
+    Stale,
+    /// Slot is empty or holds a block keyed at a different `start_pc`.
+    /// Build a fresh block and bump `misses`.
+    Miss,
+}
+
+/// Direct-mapped block cache. One instance shared across all HW slots
+/// (block contents are PC-only and read-only after fill). Not
+/// thread-safe — owner is the single scheduler thread, same as
+/// `DecodeCache`.
+pub struct BlockCache {
+    slots: Box<[Option<Box<DecodedBlock>>]>,
+    hits: u64,
+    misses: u64,
+    invalidations: u64,
+}
+
+impl Default for BlockCache {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl BlockCache {
+    pub fn new() -> Self {
+        // `Option<Box<T>>` is a niche-optimized 8-byte slot; 64 K of
+        // them cost ~512 KiB of cold storage. Live blocks beyond that
+        // sit on the heap.
+        let mut v: Vec<Option<Box<DecodedBlock>>> = Vec::with_capacity(BLOCK_CACHE_SIZE);
+        v.resize_with(BLOCK_CACHE_SIZE, || None);
+        Self {
+            slots: v.into_boxed_slice(),
+            hits: 0,
+            misses: 0,
+            invalidations: 0,
+        }
+    }
+
+    pub fn hits(&self) -> u64 {
+        self.hits
+    }
+    pub fn misses(&self) -> u64 {
+        self.misses
+    }
+    pub fn invalidations(&self) -> u64 {
+        self.invalidations
+    }
+
+    /// Return the cached block starting at `pc`, building it if absent
+    /// or stale. The returned reference is borrowed from the cache and
+    /// stays valid until the next `lookup_or_build` call.
+    pub fn lookup_or_build(&mut self, pc: u32, mem: &dyn MemoryAccess) -> &DecodedBlock {
+        let idx = ((pc >> 2) & BLOCK_CACHE_MASK) as usize;
+        let cur_pv = mem.page_version(pc);
+
+        // Phase 1: classify the slot. Borrow ends before fill so the
+        // mutable update below doesn't conflict.
+        let status = match &self.slots[idx] {
+            Some(b) if b.start_pc == pc && b.page_version == cur_pv => CacheStatus::Hit,
+            Some(b) if b.start_pc == pc => CacheStatus::Stale,
+            _ => CacheStatus::Miss,
+        };
+
+        // Phase 2: fill on miss/stale, account.
+        match status {
+            CacheStatus::Hit => {
+                self.hits += 1;
+            }
+            CacheStatus::Stale => {
+                self.invalidations += 1;
+                self.misses += 1;
+                let block = build_block(pc, mem, cur_pv);
+                self.slots[idx] = Some(Box::new(block));
+            }
+            CacheStatus::Miss => {
+                self.misses += 1;
+                let block = build_block(pc, mem, cur_pv);
+                self.slots[idx] = Some(Box::new(block));
+            }
+        }
+
+        // Slot is guaranteed populated at this point — Hit returned a
+        // pre-existing block, Miss/Stale just wrote a new one.
+        self.slots[idx]
+            .as_deref()
+            .expect("block freshly built or hit")
+    }
+}
+
+/// Walk forward from `pc`, decoding instructions and collecting them
+/// into a `DecodedBlock`. The walk stops on the first of:
+///   - a [`PpcOpcode::terminates_block`] true (the terminator IS
+///     included as the last instruction),
+///   - reaching [`MAX_BLOCK_INSTRS`],
+///   - the next PC would cross a 4 KiB guest page boundary.
+fn build_block(start_pc: u32, mem: &dyn MemoryAccess, page_version: u64) -> DecodedBlock {
+    let mut instrs: Vec<DecodedInstr> = Vec::with_capacity(8);
+    let page_base = start_pc & GUEST_PAGE_MASK;
+    let mut cur = start_pc;
+
+    loop {
+        let raw = mem.read_u32(cur);
+        let decoded = decode(raw, cur);
+        let terminates = decoded.opcode.terminates_block();
+        instrs.push(decoded);
+
+        if terminates {
+            break;
+        }
+        if instrs.len() >= MAX_BLOCK_INSTRS {
+            break;
+        }
+        let next = cur.wrapping_add(4);
+        if (next & GUEST_PAGE_MASK) != page_base {
+            break;
+        }
+        cur = next;
+    }
+
+    let last = instrs.last().expect("build pushes at least one instruction");
+    let end_pc = last.addr.wrapping_add(4);
+
+    DecodedBlock {
+        start_pc,
+        end_pc,
+        page_version,
+        instrs,
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::opcode::PpcOpcode;
+
+    use std::cell::Cell;
+
+    /// 64 KiB byte-array memory, big-endian word reads.
+    /// Mirrors `interpreter::tests::TestMem` but lives here so block_cache
+    /// tests don't depend on interpreter internals.
+    struct BlockTestMem {
+        data: Box<[Cell<u8>]>,
+        version_a: u64,
+        version_b: u64,
+        // Address of the page whose version is `version_b` instead of
+        // `version_a`. Used to model an out-of-band page-version bump in
+        // the invalidation test without going through write_*.
+        bumped_page: Cell<Option<u32>>,
+    }
+
+    impl BlockTestMem {
+        fn new() -> Self {
+            Self {
+                data: (0..0x10000u32).map(|_| Cell::new(0)).collect(),
+                version_a: 1,
+                version_b: 2,
+                bumped_page: Cell::new(None),
+            }
+        }
+        fn put(&self, addr: u32, raw: u32) {
+            let a = addr as usize;
+            for (i, byte) in raw.to_be_bytes().iter().enumerate() {
+                self.data[a + i].set(*byte);
+            }
+        }
+    }
+
+    impl MemoryAccess for BlockTestMem {
+        fn read_u8(&self, a: u32) -> u8 { self.data[a as usize].get() }
+        fn read_u16(&self, a: u32) -> u16 {
+            let i = a as usize;
+            u16::from_be_bytes([self.data[i].get(), self.data[i + 1].get()])
+        }
+        fn read_u32(&self, a: u32) -> u32 {
+            let i = a as usize;
+            u32::from_be_bytes([
+                self.data[i].get(), self.data[i + 1].get(),
+                self.data[i + 2].get(), self.data[i + 3].get(),
+            ])
+        }
+        fn read_u64(&self, a: u32) -> u64 {
+            let i = a as usize;
+            u64::from_be_bytes([
+                self.data[i].get(), self.data[i + 1].get(),
+                self.data[i + 2].get(), self.data[i + 3].get(),
+                self.data[i + 4].get(), self.data[i + 5].get(),
+                self.data[i + 6].get(), self.data[i + 7].get(),
+            ])
+        }
+        fn write_u8(&self, a: u32, v: u8) { self.data[a as usize].set(v); }
+        fn write_u16(&self, a: u32, v: u16) {
+            let i = a as usize;
+            let b = v.to_be_bytes();
+            self.data[i].set(b[0]);
+            self.data[i + 1].set(b[1]);
+        }
+        fn write_u32(&self, a: u32, v: u32) {
+            let i = a as usize;
+            for (k, byte) in v.to_be_bytes().iter().enumerate() {
+                self.data[i + k].set(*byte);
+            }
+        }
+        fn write_u64(&self, a: u32, v: u64) {
+            let i = a as usize;
+            for (k, byte) in v.to_be_bytes().iter().enumerate() {
+                self.data[i + k].set(*byte);
+            }
+        }
+        fn translate(&self, _: u32) -> Option<*const u8> { None }
+        fn translate_mut(&self, _: u32) -> Option<*mut u8> { None }
+        fn page_version(&self, addr: u32) -> u64 {
+            if Some(addr & GUEST_PAGE_MASK) == self.bumped_page.get() {
+                self.version_b
+            } else {
+                self.version_a
+            }
+        }
+    }
+
+    // PPC encodings — minimal subset for these tests.
+    fn enc_addi(rd: u32, ra: u32, simm: i16) -> u32 {
+        (14 << 26) | (rd << 21) | (ra << 16) | (simm as u16 as u32)
+    }
+    fn enc_b_self() -> u32 {
+        // b 0 — branch to self (LI=0). Opcode=18, AA=0, LK=0.
+        18 << 26
+    }
+    fn enc_unimplemented() -> u32 {
+        // Use opcode 0 raw = 0; decoder maps to Invalid.
+        0
+    }
+
+    #[test]
+    fn block_built_to_terminator() {
+        let mem = BlockTestMem::new();
+        mem.put(0x100, enc_addi(3, 3, 1));
+        mem.put(0x104, enc_addi(3, 3, 1));
+        mem.put(0x108, enc_addi(3, 3, 1));
+        mem.put(0x10C, enc_b_self()); // terminator
+        let mut bc = BlockCache::new();
+        let b = bc.lookup_or_build(0x100, &mem);
+        assert_eq!(b.start_pc, 0x100);
+        assert_eq!(b.instrs.len(), 4);
+        assert_eq!(b.instrs.last().unwrap().opcode, PpcOpcode::bx);
+        assert_eq!(b.end_pc, 0x110);
+    }
+
+    #[test]
+    fn block_stops_at_page_boundary() {
+        // Build from 0x1FFC. The next PC (0x2000) is in a different
+        // 4 KiB page — block must contain only the one instruction.
+        let mem = BlockTestMem::new();
+        mem.put(0x1FFC, enc_addi(3, 3, 1));
+        mem.put(0x2000, enc_addi(3, 3, 1));
+        let mut bc = BlockCache::new();
+        let b = bc.lookup_or_build(0x1FFC, &mem);
+        assert_eq!(b.instrs.len(), 1);
+        assert_eq!(b.end_pc, 0x2000);
+    }
+
+    #[test]
+    fn block_stops_at_max_len() {
+        // 64 consecutive non-terminator instructions on one page —
+        // block must clamp at MAX_BLOCK_INSTRS.
+        let mem = BlockTestMem::new();
+        for i in 0..64u32 {
+            mem.put(0x100 + i * 4, enc_addi(3, 3, 1));
+        }
+        let mut bc = BlockCache::new();
+        let b = bc.lookup_or_build(0x100, &mem);
+        assert_eq!(b.instrs.len(), MAX_BLOCK_INSTRS);
+        assert_eq!(b.end_pc, 0x100 + (MAX_BLOCK_INSTRS as u32) * 4);
+    }
+
+    #[test]
+    fn block_stops_at_invalid_opcode() {
+        // Decoder mapping `Invalid` is treated as a block terminator
+        // so the per-instruction Unimplemented path is preserved.
+        let mem = BlockTestMem::new();
+        mem.put(0x100, enc_addi(3, 3, 1));
+        mem.put(0x104, enc_unimplemented());
+        mem.put(0x108, enc_addi(3, 3, 1));
+        let mut bc = BlockCache::new();
+        let b = bc.lookup_or_build(0x100, &mem);
+        assert_eq!(b.instrs.len(), 2);
+        assert_eq!(b.instrs.last().unwrap().opcode, PpcOpcode::Invalid);
+    }
+
+    #[test]
+    fn block_invalidates_on_page_version_bump() {
+        let mem = BlockTestMem::new();
+        mem.put(0x100, enc_addi(3, 3, 1));
+        mem.put(0x104, enc_b_self());
+        let mut bc = BlockCache::new();
+        let _ = bc.lookup_or_build(0x100, &mem);
+        assert_eq!(bc.misses(), 1);
+        assert_eq!(bc.hits(), 0);
+
+        // Same call → hit.
+        let _ = bc.lookup_or_build(0x100, &mem);
+        assert_eq!(bc.hits(), 1);
+        assert_eq!(bc.invalidations(), 0);
+
+        // Bump the page version on the page containing 0x100. Next
+        // lookup must invalidate and rebuild.
+        mem.bumped_page.set(Some(0x100 & GUEST_PAGE_MASK));
+        let _ = bc.lookup_or_build(0x100, &mem);
+        assert_eq!(bc.invalidations(), 1);
+        assert_eq!(bc.misses(), 2);
+    }
+
+    #[test]
+    fn block_hit_returns_same_contents() {
+        // Sanity: cache hit returns a block whose contents reflect the
+        // ORIGINAL instruction stream, even after a non-version-bumping
+        // poke to the underlying bytes. (No real workload would do
+        // this, but it confirms we're returning cached data, not
+        // re-reading.)
+        let mem = BlockTestMem::new();
+        mem.put(0x100, enc_addi(3, 3, 7));
+        mem.put(0x104, enc_b_self());
+        let mut bc = BlockCache::new();
+        let first_simm = bc.lookup_or_build(0x100, &mem).instrs[0].simm16();
+        // Rewrite without bumping version (test-only path).
+        let bytes = enc_addi(3, 3, 99).to_be_bytes();
+        for (i, b) in bytes.iter().enumerate() {
+            mem.data[0x100 + i].set(*b);
+        }
+        let cached_simm = bc.lookup_or_build(0x100, &mem).instrs[0].simm16();
+        assert_eq!(first_simm, 7);
+        assert_eq!(cached_simm, 7, "cache must serve original decoded form");
+    }
+}
diff --git a/crates/xenia-cpu/src/context.rs b/crates/xenia-cpu/src/context.rs
index b500c50..26b9733 100644
--- a/crates/xenia-cpu/src/context.rs
+++ b/crates/xenia-cpu/src/context.rs
@@ -29,16 +29,37 @@ pub mod spr {
     pub const XER: u32 = 1;
     pub const LR: u32 = 8;
     pub const CTR: u32 = 9;
-    pub const TBL: u32 = 268;
-    pub const TBU: u32 = 269;
+    pub const DSISR: u32 = 18;
+    pub const DAR: u32 = 19;
+    /// Decrementer (hypervisor-visible, 32-bit down-counter).
+    pub const DEC: u32 = 22;
+    pub const TBL: u32 = 268;  // Read (user)
+    pub const TBU: u32 = 269;  // Read (user)
+    /// Time-base write (supervisor). Separate SPR number from TBL (268) for
+    /// access-control reasons.
+    pub const TBL_WRITE: u32 = 284;
+    pub const TBU_WRITE: u32 = 285;
     pub const SPRG0: u32 = 272;
     pub const SPRG1: u32 = 273;
     pub const SPRG2: u32 = 274;
     pub const SPRG3: u32 = 275;
+    pub const VRSAVE: u32 = 256;
     pub const PVR: u32 = 287;
+    pub const HID0: u32 = 1008;
+    pub const HID1: u32 = 1009;
     pub const PIR: u32 = 1023;
 }
 
+/// LR halt sentinel. When `bclr` returns to this address, the interpreter
+/// loop halts cleanly (matches the "entry returned" convention).
+pub const LR_HALT_SENTINEL: u64 = 0xBCBC_BCBC;
+
+/// VSCR NJ (Non-Java mode) bit. Stored in word 3 at bit 16 (mask 0x0001_0000).
+/// Set at startup; when clear, denormals are flushed to zero following IEEE-754.
+pub const VSCR_NJ_MASK: u32 = 0x0001_0000;
+/// VSCR SAT (saturation sticky) bit. Stored in word 3 at bit 31 (mask 0x0000_0001).
+pub const VSCR_SAT_MASK: u32 = 0x0000_0001;
+
 /// PowerPC processor context. Holds all register state for one guest thread.
 /// Mirrors PPCContext from ppc_context.h, minus JIT-specific fields.
 #[repr(C, align(64))]
@@ -64,15 +85,39 @@ pub struct PpcContext {
     pub xer_ca: u8,
     pub xer_ov: u8,
     pub xer_so: u8,
-    // Altivec VSCR saturation bit
-    pub vscr_sat: u8,
+    // Altivec VSCR. Only bits 16 (NJ) and 31 (SAT) of word 3 are meaningful.
+    pub vscr: Vec128,
+    // VRSAVE (SPR 256). Bitmask of which VRs need saving across context switches.
+    pub vrsave: u32,
 
     // Program counter
     pub pc: u32,
-    // Reservation address/value for lwarx/stwcx
-    pub reserved_addr: u32,
+    // Reservation for lwarx/ldarx/stwcx/stdcx. Xenon's reservation granule is
+    // one L2 cache line (128 bytes) — `reserved_line` is stored as the base
+    // address of that line (`ea & !0x7F`). `has_reservation` gates the
+    // validity; stwcx./stdcx. check that both match before committing.
+    // `reserved_val` is retained for possible future use by a coherency
+    // observer; the store-conditional logic itself does not compare it.
+    pub reserved_line: u32,
     pub reserved_val: u64,
     pub has_reservation: bool,
+    /// M3.7 — generation stamp returned by [`crate::ReservationTable::reserve`]
+    /// at the most recent `lwarx`/`ldarx`. Paired with `reserved_line`;
+    /// `stwcx.`/`stdcx.` pass this back to `try_commit`. Meaningful only
+    /// when `reservation_table` is `Some` and the table is enabled.
+    pub reserved_generation: u32,
+    /// M3.7 — optional handle to the inter-thread reservation table.
+    /// When `Some(table)` *and* `table.is_enabled()`, the interpreter's
+    /// `lwarx`/`stwcx.`/`ldarx`/`stdcx.` arms route through the table;
+    /// otherwise they use the legacy per-`PpcContext` fields above. The
+    /// scheduler populates this when it spawns a thread under a kernel
+    /// that has `reservations` set.
+    pub reservation_table: Option<std::sync::Arc<crate::ReservationTable>>,
+    /// M3.7 — emulated HW slot ID this thread is bound to. Used as the
+    /// reservation table's `hw_id` discriminator so two threads on
+    /// different slots can't accidentally commit each other's
+    /// reservations. Populated by the scheduler at spawn / migration.
+    pub hw_id: u8,
 
     // Thread ID (for kernel use)
     pub thread_id: u32,
@@ -82,6 +127,12 @@ pub struct PpcContext {
 
     // Time base (incremented each instruction for debugging)
     pub timebase: u64,
+
+    // Decrementer (SPR 22): 32-bit down-counter that fires an external
+    // interrupt at underflow on real hw. Xenia-rs doesn't dispatch DEC
+    // interrupts to the guest; this value is maintained so that mfspr DEC
+    // returns something coherent.
+    pub dec: u32,
 }
 
 impl PpcContext {
@@ -89,7 +140,9 @@ impl PpcContext {
         Self {
             gpr: [0; 32],
             ctr: 0,
-            lr: 0,
+            // Canary sets LR to the halt sentinel at thread start so `blr`
+            // from the top-level entry falls out of the interpreter loop.
+            lr: LR_HALT_SENTINEL,
             msr: 0,
             fpr: [0.0; 32],
             vr: [Vec128::ZERO; 128],
@@ -98,14 +151,21 @@ impl PpcContext {
             xer_ca: 0,
             xer_ov: 0,
             xer_so: 0,
-            vscr_sat: 0,
+            // VSCR starts with NJ bit set (denormals flushed) — matches canary
+            // thread_state.cc initialization.
+            vscr: Vec128::from_u32x4(0, 0, 0, VSCR_NJ_MASK),
+            vrsave: 0xFFFF_FFFF,
             pc: 0,
-            reserved_addr: 0,
+            reserved_line: 0,
             reserved_val: 0,
             has_reservation: false,
+            reserved_generation: 0,
+            reservation_table: None,
+            hw_id: 0,
             thread_id: 0,
             cycle_count: 0,
             timebase: 0,
+            dec: 0,
         }
     }
 
@@ -182,6 +242,27 @@ impl PpcContext {
         self.xer_ov = ((val >> 30) & 1) as u8;
         self.xer_ca = ((val >> 29) & 1) as u8;
     }
+
+    /// Read the VSCR SAT (sticky saturation) bit.
+    pub fn vscr_sat(&self) -> bool {
+        (self.vscr.u32x4(3) & VSCR_SAT_MASK) != 0
+    }
+
+    /// Set or clear VSCR SAT. Preserves the NJ bit (and any other word-3 bits).
+    pub fn set_vscr_sat(&mut self, v: bool) {
+        let mut w = self.vscr.u32x4(3);
+        if v {
+            w |= VSCR_SAT_MASK;
+        } else {
+            w &= !VSCR_SAT_MASK;
+        }
+        self.vscr.set_u32x4(3, w);
+    }
+
+    /// Read the VSCR NJ (non-Java mode / flush-denormals) bit.
+    pub fn vscr_nj(&self) -> bool {
+        (self.vscr.u32x4(3) & VSCR_NJ_MASK) != 0
+    }
 }
 
 impl Default for PpcContext {
diff --git a/crates/xenia-cpu/src/decoder.rs b/crates/xenia-cpu/src/decoder.rs
index c84ddca..136d3e3 100644
--- a/crates/xenia-cpu/src/decoder.rs
+++ b/crates/xenia-cpu/src/decoder.rs
@@ -77,6 +77,9 @@ impl DecodedInstr {
     /// OE bit (bit 21) - overflow enable
     #[inline] pub fn oe(&self) -> bool { extract_bits(self.raw, 21, 21) != 0 }
 
+    /// TO field (bits 6-10) for tw/twi/td/tdi trap instructions.
+    #[inline] pub fn to(&self) -> u32 { extract_bits(self.raw, 6, 10) }
+
     /// MB, ME fields for rotate instructions
     #[inline] pub fn mb(&self) -> u32 { extract_bits(self.raw, 21, 25) }
     #[inline] pub fn me(&self) -> u32 { extract_bits(self.raw, 26, 30) }
@@ -142,6 +145,24 @@ impl DecodedInstr {
     #[inline] pub fn nb(&self) -> u32 { extract_bits(self.raw, 16, 20) }
 }
 
+/// Extract the 5-bit `UIMM` (`VX128_3`) / `IMM` (`VX128_4`) field. Canary
+/// packs both formats with LSB-bits 16-20 holding the field, which is
+/// MSB bits 11-15 in our `extract_bits` convention. For `vpkd3d128` /
+/// `vupkd3d128` the decoded selector is `type = UIMM >> 2` (3 bits; valid
+/// values 0-6 per [`crate::vmx::D3dPackType`], 7 is undocumented /
+/// undefined in canary) and `pack = UIMM & 0x3` (output-slot layout for
+/// `vpkd3d128` only, `vupkd3d128` ignores it).
+///
+/// First-Pixels M3: the interpreter previously used a hand-rolled
+/// `(instr.raw >> 6) & 0x7` that was **LSB-numbered** and extracted
+/// bits from a completely different part of the word (the
+/// secondary-opcode region). Centralizing the extractor here matches
+/// canary's `FormatVX128_{3,4}::{UIMM,IMM}` field semantics exactly.
+#[inline]
+pub fn extract_vx128_uimm5(raw: u32) -> u32 {
+    extract_bits(raw, 11, 15)
+}
+
 /// Decode a 32-bit PPC instruction into its opcode.
 /// Direct translation of the C++ LookupOpcode from ppc_opcode_lookup_gen.cc.
 pub fn decode(raw: u32, addr: u32) -> DecodedInstr {
@@ -149,6 +170,123 @@ pub fn decode(raw: u32, addr: u32) -> DecodedInstr {
     DecodedInstr { opcode, raw, addr }
 }
 
+// Perf tier-2 — direct-mapped PC-keyed decode cache.
+//
+// The interpreter hot path spends ~15-25% of its time in `decode()`
+// parsing the raw u32 and walking the primary+secondary opcode tables.
+// For non-self-modifying guest code — the common case past the XEX
+// loader — `decode(raw, pc)` is purely a function of `(raw, pc)` and
+// the output is `Copy + 16B`. A direct-mapped cache indexed by
+// `(pc >> 2) & MASK` gives the interpreter a 1-comparison fast path,
+// at the cost of one branch and a 1.5 MiB region of memory.
+//
+// Invalidation piggybacks on `xenia_memory::GuestMemory::page_version`
+// (P5 texture-cache invalidation): every cache entry carries the page
+// version that was active at decode time; on lookup we compare against
+// the current version of the containing 4 KiB page. Any write to the
+// page bumps the counter, so the next decode on that PC is a miss that
+// refills.
+
+/// Number of direct-mapped entries. 2^16 = 65,536 slots, one PPC
+/// instruction address per slot — enough for every hot code path in a
+/// typical Xbox 360 title to stay resident without collision.
+const DECODE_CACHE_SIZE: usize = 1 << 16;
+const DECODE_CACHE_MASK: u32 = (DECODE_CACHE_SIZE - 1) as u32;
+
+#[derive(Clone, Copy)]
+struct DecodeCacheEntry {
+    /// Guest PC this entry was decoded at. Used as the tag on lookup; a
+    /// mismatch means the slot was last populated by a different PC that
+    /// shares the same low-16 index.
+    pc: u32,
+    /// Page version at decode time (from `GuestMemory::page_version(pc)`).
+    /// Zero means "unused slot" since real page versions start at 1.
+    page_version: u64,
+    decoded: DecodedInstr,
+}
+
+impl DecodeCacheEntry {
+    const fn empty() -> Self {
+        // `Invalid` is the decoder's "unrecognized opcode" sentinel; we
+        // use it here as the empty-slot marker. Real misses compare `pc`,
+        // not the opcode, so the sentinel choice is cosmetic.
+        Self {
+            pc: 0,
+            page_version: 0,
+            decoded: DecodedInstr {
+                opcode: PpcOpcode::Invalid,
+                raw: 0,
+                addr: 0,
+            },
+        }
+    }
+}
+
+/// Direct-mapped PC-keyed decode cache. One instance shared across all
+/// HW threads (PC is thread-independent; entries are read-only once
+/// filled). Not thread-safe — the single scheduler thread owns it.
+pub struct DecodeCache {
+    slots: Box<[DecodeCacheEntry]>,
+    hits: u64,
+    misses: u64,
+    invalidations: u64,
+}
+
+impl Default for DecodeCache {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl DecodeCache {
+    pub fn new() -> Self {
+        Self {
+            slots: vec![DecodeCacheEntry::empty(); DECODE_CACHE_SIZE].into_boxed_slice(),
+            hits: 0,
+            misses: 0,
+            invalidations: 0,
+        }
+    }
+
+    /// Look up (or fill) the decoded form of the instruction at `pc`.
+    /// `raw` is the fetched instruction word; `current_page_version` is
+    /// `mem.page_version(pc)` — the caller has it cheaper than we do,
+    /// since they're already touching `mem` to fetch `raw`.
+    #[inline]
+    pub fn lookup(&mut self, pc: u32, raw: u32, current_page_version: u64) -> DecodedInstr {
+        let idx = ((pc >> 2) & DECODE_CACHE_MASK) as usize;
+        // Safety: `idx` is masked into `[0, DECODE_CACHE_SIZE)` so the
+        // slice access is always in-bounds. Opt-out of the bounds check
+        // for the hot path.
+        let entry = unsafe { self.slots.get_unchecked_mut(idx) };
+        if entry.pc == pc && entry.page_version == current_page_version {
+            self.hits += 1;
+            return entry.decoded;
+        }
+        if entry.pc == pc && entry.page_version != current_page_version {
+            self.invalidations += 1;
+        }
+        self.misses += 1;
+        let decoded = decode(raw, pc);
+        *entry = DecodeCacheEntry {
+            pc,
+            page_version: current_page_version,
+            decoded,
+        };
+        decoded
+    }
+
+    pub fn hits(&self) -> u64 {
+        self.hits
+    }
+    pub fn misses(&self) -> u64 {
+        self.misses
+    }
+    pub fn invalidations(&self) -> u64 {
+        self.invalidations
+    }
+}
+
 fn lookup_opcode(code: u32) -> PpcOpcode {
     match extract_bits(code, 0, 5) {
         2 => PpcOpcode::tdi,
@@ -781,6 +919,57 @@ mod tests {
         assert_eq!(instr.d(), 0x20);
     }
 
+    #[test]
+    fn decode_cache_miss_fills_then_hit() {
+        let mut cache = DecodeCache::new();
+        let raw: u32 = (14 << 26) | (3 << 21) | (1 << 16) | 0x10;
+        let pc = 0x8200_0000u32;
+        let first = cache.lookup(pc, raw, 1);
+        assert_eq!(first.opcode, PpcOpcode::addi);
+        assert_eq!(cache.hits(), 0);
+        assert_eq!(cache.misses(), 1);
+        // Same pc, same version → cache hit, no new decode.
+        let second = cache.lookup(pc, raw, 1);
+        assert_eq!(second.opcode, PpcOpcode::addi);
+        assert_eq!(cache.hits(), 1);
+        assert_eq!(cache.misses(), 1);
+    }
+
+    #[test]
+    fn decode_cache_stale_version_refills() {
+        let mut cache = DecodeCache::new();
+        // First fill with an `addi`.
+        let raw_addi: u32 = (14 << 26) | (3 << 21) | (1 << 16) | 0x10;
+        let pc = 0x8200_0000u32;
+        cache.lookup(pc, raw_addi, 1);
+        // Guest rewrote the page: same pc, different raw + bumped version.
+        // Cache must refill — not return the stale `addi`.
+        let raw_lwz: u32 = (32 << 26) | (5 << 21) | (1 << 16) | 0x20;
+        let refreshed = cache.lookup(pc, raw_lwz, 2);
+        assert_eq!(refreshed.opcode, PpcOpcode::lwz);
+        assert_eq!(cache.invalidations(), 1);
+        assert_eq!(cache.misses(), 2);
+    }
+
+    #[test]
+    fn decode_cache_pc_collision_refills() {
+        // Two PCs that hash to the same slot (pc >> 2 low 16 bits equal)
+        // must not alias. Slot index = ((pc >> 2) & 0xFFFF) — pick two
+        // PCs 4 * 2^16 bytes apart.
+        let mut cache = DecodeCache::new();
+        let pc_a = 0x8200_0000u32;
+        let pc_b = pc_a.wrapping_add(0x0004_0000u32); // (>> 2) differs by 2^16
+        let raw_addi: u32 = (14 << 26) | (3 << 21) | (1 << 16) | 0x10;
+        let raw_lwz: u32 = (32 << 26) | (5 << 21) | (1 << 16) | 0x20;
+        cache.lookup(pc_a, raw_addi, 1);
+        // Different pc but same slot → miss + refill.
+        cache.lookup(pc_b, raw_lwz, 1);
+        // First pc comes back → miss + refill (slot was taken by pc_b).
+        let back = cache.lookup(pc_a, raw_addi, 1);
+        assert_eq!(back.opcode, PpcOpcode::addi);
+        assert_eq!(cache.misses(), 3);
+    }
+
     #[test]
     fn test_decode_branch() {
         // b +0x100 => opcode 18, LI=0x40 (shifted left 2 = 0x100), AA=0, LK=0
@@ -816,4 +1005,103 @@ mod tests {
         assert_eq!(extract_bits(0x8000_0000, 0, 0), 1);
         assert_eq!(extract_bits(0x0000_0001, 31, 31), 1);
     }
+
+    // VMX128 register-name extraction. Locks the canonical bit positions
+    // (decoder.rs is the single source of truth — the analysis crate's
+    // old `ppc.rs` had different positions, which produced wrong printed
+    // register names; the bug was silent because the interpreter never
+    // used those extractors). Each test poke-bits exactly the slots the
+    // accessor reads and asserts the assembled register number.
+
+    fn vmx128_test_word(va21: u32, vd6_10: u32, vd21: u32, vd22: u32,
+                       vb16_20: u32, vb28: u32, vb30: u32) -> u32 {
+        // PPC bit i -> LSB position 31-i.
+        (vd6_10 << (31 - 10))
+            | (va21  << (31 - 21))   // va128 high bit at PPC 29 in some forms — kept 0 here
+            | (vd21  << (31 - 21))
+            | (vd22  << (31 - 22))
+            | (vb16_20 << (31 - 20))
+            | (vb28  << (31 - 28))
+            | (vb30  << (31 - 30))
+    }
+
+    #[test]
+    fn vmx128_vd128_low_5_bits_only() {
+        // vd_lo = 0..31, vd_b21 = 0, vd_b22 = 0 → vd128 = vd_lo
+        for r in 0..32u32 {
+            let raw = (r as u32) << (31 - 10);
+            let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 };
+            assert_eq!(d.vd128(), r as usize, "vd_lo={r}");
+        }
+    }
+
+    #[test]
+    fn vmx128_vd128_bit21_adds_32() {
+        // vd_lo = 0, vd_b21 = 1, vd_b22 = 0 → vd128 = 32
+        let raw = (0u32 << (31 - 10)) | (1u32 << (31 - 21));
+        let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 };
+        assert_eq!(d.vd128(), 32);
+    }
+
+    #[test]
+    fn vmx128_vd128_bit22_adds_64() {
+        // vd_lo = 0, vd_b21 = 0, vd_b22 = 1 → vd128 = 64
+        let raw = (0u32 << (31 - 10)) | (1u32 << (31 - 22));
+        let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 };
+        assert_eq!(d.vd128(), 64);
+    }
+
+    #[test]
+    fn vmx128_vd128_full_127() {
+        // vd_lo = 31, vd_b21 = 1, vd_b22 = 1 → vd128 = 127
+        let raw = (31u32 << (31 - 10))
+            | (1u32 << (31 - 21))
+            | (1u32 << (31 - 22));
+        let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 };
+        assert_eq!(d.vd128(), 127);
+    }
+
+    #[test]
+    fn vmx128_va128_uses_bit29() {
+        // va128 = bits 6-10 + bit 29. va_lo = 7, bit 29 = 1 → va128 = 7 | 32 = 39.
+        let raw = (7u32 << (31 - 10)) | (1u32 << (31 - 29));
+        let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 };
+        assert_eq!(d.va128(), 39);
+    }
+
+    #[test]
+    fn vmx128_vb128_uses_bits28_and_30() {
+        // vb128 = bits 16-20 + bit 28 + bit 30. Low 5 = 5, bit 28 = 1 → +32, bit 30 = 1 → +64.
+        let raw = (5u32 << (31 - 20))
+            | (1u32 << (31 - 28))
+            | (1u32 << (31 - 30));
+        let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 };
+        assert_eq!(d.vb128(), 5 | 32 | 64);
+    }
+
+    #[test]
+    fn vmx128_vs128_aliases_vd128() {
+        // vs128 must always equal vd128.
+        for r in [0u32, 31, 32, 64, 96, 127] {
+            let lo = r & 0x1F;
+            let b21 = (r >> 5) & 1;
+            let b22 = (r >> 6) & 1;
+            let raw = (lo << (31 - 10))
+                | (b21 << (31 - 21))
+                | (b22 << (31 - 22));
+            let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 };
+            assert_eq!(d.vd128(), r as usize, "vd128 mismatch for r={r}");
+            assert_eq!(d.vs128(), r as usize, "vs128 mismatch for r={r}");
+            assert_eq!(d.vd128(), d.vs128());
+        }
+    }
+
+    #[test]
+    #[allow(dead_code)]
+    fn _vmx128_test_word_helper_compiles() {
+        // Keep the helper validated against the real accessor.
+        let raw = vmx128_test_word(0, 5, 1, 1, 0, 0, 0);
+        let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 };
+        assert_eq!(d.vd128(), 5 | 32 | 64);
+    }
 }
diff --git a/crates/xenia-cpu/src/disasm.rs b/crates/xenia-cpu/src/disasm.rs
index e4ee2ca..6edbce1 100644
--- a/crates/xenia-cpu/src/disasm.rs
+++ b/crates/xenia-cpu/src/disasm.rs
@@ -1,233 +1,627 @@
-use crate::decoder::DecodedInstr;
+//! PowerPC (Xbox 360 Xenon) text disassembler.
+//!
+//! Single source of truth for assembly text formatting. Sits on top of the
+//! canonical decoder in [`crate::decoder`] and consumes [`DecodedInstr`]
+//! (8-byte `Copy`, no allocations) so the interpreter's decode cache stays
+//! lean — formatting allocates, but only when a sink calls [`format`].
+//!
+//! [`format`] returns a [`DisasmText`] carrying both base and extended
+//! (simplified) mnemonic forms. Callers (text printer, JSON sink, DuckDB
+//! row writer) consume the fields directly instead of re-parsing.
+
+use crate::decoder::{DecodedInstr, extract_vx128_uimm5};
 use crate::opcode::PpcOpcode;
-use std::fmt::Write;
+
+/// Formatted disassembly of a single instruction.
+///
+/// Owns its strings. `mnemonic`/`operands` are the structured base form
+/// (e.g. `"addi"`, `"r3, r1, 16"`); `disasm` is the legacy padded display
+/// form (e.g. `"addi    r3, r1, 16"`). The `ext_*` triple is `Some` when
+/// a simplified/extended mnemonic applies (e.g. `addi r3,0,imm` →
+/// `li r3, imm`). `branch_target` is the resolved absolute target for
+/// direct branches (`b`/`bl`/`bc`/`bcl`); `None` for indirect branches
+/// and non-branches.
+#[derive(Debug, Clone)]
+pub struct DisasmText {
+    pub mnemonic: String,
+    pub operands: String,
+    pub disasm: String,
+    pub ext_mnemonic: Option<String>,
+    pub ext_operands: Option<String>,
+    pub ext_disasm: Option<String>,
+    pub branch_target: Option<u32>,
+}
+
+impl DisasmText {
+    /// Preferred display form: extended if present, else base.
+    #[inline]
+    pub fn display(&self) -> &str {
+        self.ext_disasm.as_deref().unwrap_or(&self.disasm)
+    }
+}
+
+// ── Internal builders ───────────────────────────────────────────────────────
+
+#[inline]
+fn pad_into(mnem: &str, operands: &str, width: usize) -> String {
+    if width <= mnem.len() + 1 {
+        // No padding fits — fall back to single-space join.
+        if operands.is_empty() { mnem.to_string() }
+        else { format!("{mnem} {operands}") }
+    } else {
+        format!("{:<width$}{}", mnem, operands, width = width)
+    }
+}
+
+fn base(mnem: &str, operands: String, pad: usize) -> DisasmText {
+    let disasm = pad_into(mnem, &operands, pad);
+    DisasmText {
+        mnemonic: mnem.to_string(),
+        operands,
+        disasm,
+        ext_mnemonic: None,
+        ext_operands: None,
+        ext_disasm: None,
+        branch_target: None,
+    }
+}
+
+fn with_ext(
+    base_mnem: &str, base_ops: String, base_pad: usize,
+    ext_mnem: &str, ext_ops: String, ext_pad: usize,
+) -> DisasmText {
+    let disasm = pad_into(base_mnem, &base_ops, base_pad);
+    let ext_disasm = pad_into(ext_mnem, &ext_ops, ext_pad);
+    DisasmText {
+        mnemonic: base_mnem.to_string(),
+        operands: base_ops,
+        disasm,
+        ext_mnemonic: Some(ext_mnem.to_string()),
+        ext_operands: Some(ext_ops),
+        ext_disasm: Some(ext_disasm),
+        branch_target: None,
+    }
+}
+
+fn with_target(mut t: DisasmText, target: u32) -> DisasmText {
+    t.branch_target = Some(target);
+    t
+}
+
+fn long_word(raw: u32) -> DisasmText {
+    let operands = format!("0x{raw:08X}");
+    base(".long", operands, 8)
+}
+
+// ── Helpers (register names, sign extension, condition decoding) ────────────
+
+#[inline] fn gpr(r: usize) -> String { format!("r{r}") }
+#[inline] fn fpr(r: usize) -> String { format!("f{r}") }
+#[inline] fn vr(r: usize) -> String { format!("v{r}") }
+
+fn crb(b: u32) -> String {
+    let cr = b / 4;
+    let bit = b % 4;
+    let bit_name = ["lt", "gt", "eq", "so"][bit as usize];
+    if cr == 0 { bit_name.to_string() } else { format!("4*cr{cr}+{bit_name}") }
+}
+
+fn spr_name(spr: u32) -> String {
+    match spr {
+        1 => "XER".into(),
+        8 => "LR".into(),
+        9 => "CTR".into(),
+        _ => format!("spr{spr}"),
+    }
+}
+
+#[inline] fn sign_ext(val: u32, bits: u32) -> i32 {
+    let shift = 32 - bits;
+    ((val << shift) as i32) >> shift
+}
+
+/// Map trap TO field to condition suffix (e.g. 16 → "lt", 4 → "eq").
+/// Unsigned variants (`lgt`/`llt`/`lge`/`lle`) cover bits 1-3 of the TO
+/// encoding which `tw`/`td` use for logical-compare conditions.
+fn trap_cond(to: u32) -> Option<&'static str> {
+    match to {
+        1  => Some("lgt"),
+        2  => Some("llt"),
+        4  => Some("eq"),
+        5  => Some("lge"),
+        6  => Some("lle"),
+        8  => Some("gt"),
+        12 => Some("ge"),
+        16 => Some("lt"),
+        20 => Some("le"),
+        24 => Some("ne"),
+        31 => Some(""),  // unconditional
+        _  => None,
+    }
+}
+
+/// For non-decrementing conditional branches: returns Some((cond_name, cr_prefix))
+/// where cr_prefix is e.g. "" or "cr2, ".
+fn cond_branch_ext(bo: u32, bi: u32) -> Option<(&'static str, String)> {
+    let cond_true = bo & 0x08 != 0;
+    let no_cond = bo & 0x10 != 0;
+    let decr = bo & 0x04 == 0;
+    if no_cond || decr { return None; }
+
+    let cr_field = bi / 4;
+    let cr_bit = bi % 4;
+    let cond_name = match (cr_bit, cond_true) {
+        (0, true) => "lt", (0, false) => "ge",
+        (1, true) => "gt", (1, false) => "le",
+        (2, true) => "eq", (2, false) => "ne",
+        (3, true) => "so", (3, false) => "ns",
+        _ => return None,
+    };
+    let cr = if cr_field == 0 { String::new() } else { format!("cr{cr_field}, ") };
+    Some((cond_name, cr))
+}
+
+#[inline] fn rc_dot(instr: &DecodedInstr) -> &'static str {
+    if instr.rc_bit() { "." } else { "" }
+}
+
+// ── Public entrypoints ──────────────────────────────────────────────────────
+
+/// Format a decoded instruction into structured disassembly text.
+pub fn format(instr: &DecodedInstr) -> DisasmText {
+    match instr.opcode {
+        // ── Branch ──────────────────────────────────────────────────────────
+        PpcOpcode::bx     => fmt_b(instr),
+        PpcOpcode::bcx    => fmt_bc(instr),
+        PpcOpcode::bclrx  => fmt_bclr(instr),
+        PpcOpcode::bcctrx => fmt_bcctr(instr),
+        PpcOpcode::sc     => base("sc", String::new(), 0),
+
+        // ── Trap ────────────────────────────────────────────────────────────
+        PpcOpcode::tdi => fmt_trap_imm(instr, "tdi", "td"),
+        PpcOpcode::twi => fmt_trap_imm(instr, "twi", "tw"),
+        PpcOpcode::td  => fmt_trap_reg(instr, "td"),
+        PpcOpcode::tw  => fmt_trap_reg(instr, "tw"),
+
+        // ── D-form ALU/logical ──────────────────────────────────────────────
+        PpcOpcode::addi    => fmt_addi(instr),
+        PpcOpcode::addis   => fmt_addis(instr),
+        PpcOpcode::addic   => fmt_d_add(instr, "addic"),
+        PpcOpcode::addicx  => fmt_d_add(instr, "addic."),
+        PpcOpcode::subficx => fmt_d_imm_simple(instr, "subfic"),
+        PpcOpcode::mulli   => fmt_d_imm_simple(instr, "mulli"),
+        PpcOpcode::cmpi    => fmt_cmp_imm(instr, "cmpi", true),
+        PpcOpcode::cmpli   => fmt_cmp_imm(instr, "cmpli", false),
+        PpcOpcode::ori     => fmt_ori(instr),
+        PpcOpcode::oris    => fmt_d_logic(instr, "oris"),
+        PpcOpcode::xori    => fmt_d_logic(instr, "xori"),
+        PpcOpcode::xoris   => fmt_d_logic(instr, "xoris"),
+        PpcOpcode::andix   => fmt_d_logic(instr, "andi."),
+        PpcOpcode::andisx  => fmt_d_logic(instr, "andis."),
+
+        // ── D-form load/store ───────────────────────────────────────────────
+        PpcOpcode::lwz   => fmt_ld(instr, "lwz",   false),
+        PpcOpcode::lwzu  => fmt_ld(instr, "lwzu",  false),
+        PpcOpcode::lbz   => fmt_ld(instr, "lbz",   false),
+        PpcOpcode::lbzu  => fmt_ld(instr, "lbzu",  false),
+        PpcOpcode::lhz   => fmt_ld(instr, "lhz",   false),
+        PpcOpcode::lhzu  => fmt_ld(instr, "lhzu",  false),
+        PpcOpcode::lha   => fmt_ld(instr, "lha",   false),
+        PpcOpcode::lhau  => fmt_ld(instr, "lhau",  false),
+        PpcOpcode::lmw   => fmt_ld(instr, "lmw",   false),
+        PpcOpcode::lfs   => fmt_ld(instr, "lfs",   true),
+        PpcOpcode::lfsu  => fmt_ld(instr, "lfsu",  true),
+        PpcOpcode::lfd   => fmt_ld(instr, "lfd",   true),
+        PpcOpcode::lfdu  => fmt_ld(instr, "lfdu",  true),
+        PpcOpcode::stw   => fmt_st(instr, "stw",   false),
+        PpcOpcode::stwu  => fmt_st(instr, "stwu",  false),
+        PpcOpcode::stb   => fmt_st(instr, "stb",   false),
+        PpcOpcode::stbu  => fmt_st(instr, "stbu",  false),
+        PpcOpcode::sth   => fmt_st(instr, "sth",   false),
+        PpcOpcode::sthu  => fmt_st(instr, "sthu",  false),
+        PpcOpcode::stmw  => fmt_st(instr, "stmw",  false),
+        PpcOpcode::stfs  => fmt_st(instr, "stfs",  true),
+        PpcOpcode::stfsu => fmt_st(instr, "stfsu", true),
+        PpcOpcode::stfd  => fmt_st(instr, "stfd",  true),
+        PpcOpcode::stfdu => fmt_st(instr, "stfdu", true),
+
+        // ── DS-form load/store ──────────────────────────────────────────────
+        PpcOpcode::ld   => fmt_ds(instr, "ld"),
+        PpcOpcode::ldu  => fmt_ds(instr, "ldu"),
+        PpcOpcode::lwa  => fmt_ds(instr, "lwa"),
+        PpcOpcode::std  => fmt_ds(instr, "std"),
+        PpcOpcode::stdu => fmt_ds(instr, "stdu"),
+
+        // ── Rotate ─────────────────────────────────────────────────────────
+        PpcOpcode::rlwimix => fmt_rlwimi(instr),
+        PpcOpcode::rlwinmx => fmt_rlwinm(instr),
+        PpcOpcode::rlwnmx  => fmt_rlwnm(instr),
+        PpcOpcode::rldiclx => fmt_rldicl(instr),
+        PpcOpcode::rldicrx => fmt_rldicr(instr),
+        PpcOpcode::rldicx  => fmt_rldic(instr),
+        PpcOpcode::rldimix => fmt_rldimi(instr),
+        PpcOpcode::rldclx  => fmt_rldcl(instr),
+        PpcOpcode::rldcrx  => fmt_rldcr(instr),
+
+        // ── Compare (X-form) ───────────────────────────────────────────────
+        PpcOpcode::cmp  => fmt_cmp_reg(instr, "cmp"),
+        PpcOpcode::cmpl => fmt_cmp_reg(instr, "cmpl"),
+
+        // ── X-form ALU (3-register) with OE/Rc ─────────────────────────────
+        PpcOpcode::addx    => fmt_xo_3op(instr, "add"),
+        PpcOpcode::addcx   => fmt_xo_3op(instr, "addc"),
+        PpcOpcode::addex   => fmt_xo_3op(instr, "adde"),
+        PpcOpcode::addmex  => fmt_xo_2op(instr, "addme"),
+        PpcOpcode::addzex  => fmt_xo_2op(instr, "addze"),
+        PpcOpcode::subfx   => fmt_subf(instr, "subf", "sub"),
+        PpcOpcode::subfcx  => fmt_subf(instr, "subfc", "subc"),
+        PpcOpcode::subfex  => fmt_xo_3op(instr, "subfe"),
+        PpcOpcode::subfmex => fmt_xo_2op(instr, "subfme"),
+        PpcOpcode::subfzex => fmt_xo_2op(instr, "subfze"),
+        PpcOpcode::negx    => fmt_xo_2op(instr, "neg"),
+        PpcOpcode::mullwx  => fmt_xo_3op(instr, "mullw"),
+        PpcOpcode::mulhwx  => fmt_xo_3op_no_oe(instr, "mulhw"),
+        PpcOpcode::mulhwux => fmt_xo_3op_rc_only(instr, "mulhwu"),
+        PpcOpcode::divwx   => fmt_xo_3op(instr, "divw"),
+        PpcOpcode::divwux  => fmt_xo_3op(instr, "divwu"),
+        PpcOpcode::mulldx  => fmt_xo_3op(instr, "mulld"),
+        PpcOpcode::mulhdx  => fmt_xo_3op_rc_only(instr, "mulhd"),
+        PpcOpcode::mulhdux => fmt_xo_3op_rc_only(instr, "mulhdu"),
+        PpcOpcode::divdx   => fmt_xo_3op(instr, "divd"),
+        PpcOpcode::divdux  => fmt_xo_3op(instr, "divdu"),
+
+        // ── X-form logical (Rc) ────────────────────────────────────────────
+        PpcOpcode::andx   => fmt_logic_and(instr),
+        PpcOpcode::andcx  => fmt_x_logic(instr, "andc"),
+        PpcOpcode::orx    => fmt_logic_or(instr),
+        PpcOpcode::orcx   => fmt_x_logic(instr, "orc"),
+        PpcOpcode::xorx   => fmt_x_logic(instr, "xor"),
+        PpcOpcode::norx   => fmt_logic_nor(instr),
+        PpcOpcode::nandx  => fmt_x_logic(instr, "nand"),
+        PpcOpcode::eqvx   => fmt_x_logic(instr, "eqv"),
+        PpcOpcode::extsbx => fmt_x_unary_rc(instr, "extsb"),
+        PpcOpcode::extshx => fmt_x_unary_rc(instr, "extsh"),
+        PpcOpcode::extswx => fmt_x_unary_rc(instr, "extsw"),
+        PpcOpcode::cntlzwx => fmt_x_unary_rc(instr, "cntlzw"),
+        PpcOpcode::cntlzdx => fmt_x_unary_rc(instr, "cntlzd"),
+
+        // ── Shift (32 / 64) ─────────────────────────────────────────────────
+        PpcOpcode::slwx    => fmt_x_logic(instr, "slw"),
+        PpcOpcode::srwx    => fmt_x_logic(instr, "srw"),
+        PpcOpcode::srawx   => fmt_x_logic(instr, "sraw"),
+        PpcOpcode::sldx    => fmt_x_logic(instr, "sld"),
+        PpcOpcode::srdx    => fmt_x_logic(instr, "srd"),
+        PpcOpcode::sradx   => fmt_x_logic(instr, "srad"),
+        PpcOpcode::srawix  => fmt_srawi(instr),
+        PpcOpcode::sradix  => fmt_sradi(instr),
+
+        // ── Special register moves ─────────────────────────────────────────
+        PpcOpcode::mfspr => fmt_mfspr(instr),
+        PpcOpcode::mtspr => fmt_mtspr(instr),
+        PpcOpcode::mfcr  => base("mfcr", gpr(instr.rd()), 8),
+        PpcOpcode::mtcrf => fmt_mtcrf(instr),
+        PpcOpcode::mfmsr => base("mfmsr", gpr(instr.rd()), 8),
+        PpcOpcode::mtmsr => base("mtmsr", gpr(instr.rs()), 8),
+        PpcOpcode::mtmsrd => base("mtmsrd", gpr(instr.rs()), 8),
+        PpcOpcode::mftb  => fmt_mftb(instr),
+        PpcOpcode::mcrxr => base("mcrxr", format!("cr{}", instr.crfd()), 8),
+        PpcOpcode::mcrf  => base("mcrf", format!("cr{}, cr{}", instr.crfd(), instr.crfs()), 8),
+
+        // ── X-form indexed load/store ──────────────────────────────────────
+        PpcOpcode::lwzx   => fmt_x_load(instr, "lwzx",   false),
+        PpcOpcode::lwzux  => fmt_x_load(instr, "lwzux",  false),
+        PpcOpcode::lbzx   => fmt_x_load(instr, "lbzx",   false),
+        PpcOpcode::lbzux  => fmt_x_load(instr, "lbzux",  false),
+        PpcOpcode::lhzx   => fmt_x_load(instr, "lhzx",   false),
+        PpcOpcode::lhzux  => fmt_x_load(instr, "lhzux",  false),
+        PpcOpcode::lhax   => fmt_x_load(instr, "lhax",   false),
+        PpcOpcode::lhaux  => fmt_x_load(instr, "lhaux",  false),
+        PpcOpcode::lwax   => fmt_x_load(instr, "lwax",   false),
+        PpcOpcode::lwaux  => fmt_x_load(instr, "lwaux",  false),
+        PpcOpcode::ldx    => fmt_x_load(instr, "ldx",    false),
+        PpcOpcode::ldux   => fmt_x_load(instr, "ldux",   false),
+        PpcOpcode::lwbrx  => fmt_x_load(instr, "lwbrx",  false),
+        PpcOpcode::lhbrx  => fmt_x_load(instr, "lhbrx",  false),
+        PpcOpcode::ldbrx  => fmt_x_load(instr, "ldbrx",  false),
+        PpcOpcode::lwarx  => fmt_x_load(instr, "lwarx",  false),
+        PpcOpcode::ldarx  => fmt_x_load(instr, "ldarx",  false),
+        PpcOpcode::lswx   => fmt_x_load(instr, "lswx",   false),
+        PpcOpcode::lswi   => fmt_lswi_stswi(instr, "lswi"),
+        PpcOpcode::lfsx   => fmt_x_load(instr, "lfsx",   true),
+        PpcOpcode::lfsux  => fmt_x_load(instr, "lfsux",  true),
+        PpcOpcode::lfdx   => fmt_x_load(instr, "lfdx",   true),
+        PpcOpcode::lfdux  => fmt_x_load(instr, "lfdux",  true),
+        PpcOpcode::stwx   => fmt_x_store(instr, "stwx",   false),
+        PpcOpcode::stwux  => fmt_x_store(instr, "stwux",  false),
+        PpcOpcode::stbx   => fmt_x_store(instr, "stbx",   false),
+        PpcOpcode::stbux  => fmt_x_store(instr, "stbux",  false),
+        PpcOpcode::sthx   => fmt_x_store(instr, "sthx",   false),
+        PpcOpcode::sthux  => fmt_x_store(instr, "sthux",  false),
+        PpcOpcode::stdx   => fmt_x_store(instr, "stdx",   false),
+        PpcOpcode::stdux  => fmt_x_store(instr, "stdux",  false),
+        PpcOpcode::stwbrx => fmt_x_store(instr, "stwbrx", false),
+        PpcOpcode::sthbrx => fmt_x_store(instr, "sthbrx", false),
+        PpcOpcode::stdbrx => fmt_x_store(instr, "stdbrx", false),
+        PpcOpcode::stwcx  => fmt_x_store(instr, "stwcx.", false),
+        PpcOpcode::stdcx  => fmt_x_store(instr, "stdcx.", false),
+        PpcOpcode::stswx  => fmt_x_store(instr, "stswx",  false),
+        PpcOpcode::stswi  => fmt_lswi_stswi(instr, "stswi"),
+        PpcOpcode::stfsx  => fmt_x_store(instr, "stfsx",  true),
+        PpcOpcode::stfsux => fmt_x_store(instr, "stfsux", true),
+        PpcOpcode::stfdx  => fmt_x_store(instr, "stfdx",  true),
+        PpcOpcode::stfdux => fmt_x_store(instr, "stfdux", true),
+        PpcOpcode::stfiwx => fmt_x_store(instr, "stfiwx", true),
+
+        // ── Cache / sync ────────────────────────────────────────────────────
+        PpcOpcode::dcbf  => fmt_cache(instr, "dcbf"),
+        PpcOpcode::dcbi  => fmt_cache(instr, "dcbi"),
+        PpcOpcode::dcbst => fmt_cache(instr, "dcbst"),
+        PpcOpcode::dcbt  => fmt_cache(instr, "dcbt"),
+        PpcOpcode::dcbtst => fmt_cache(instr, "dcbtst"),
+        PpcOpcode::dcbz  => fmt_cache(instr, "dcbz"),
+        PpcOpcode::dcbz128 => fmt_cache(instr, "dcbz128"),
+        PpcOpcode::icbi  => fmt_cache(instr, "icbi"),
+        PpcOpcode::sync  => base("sync", String::new(), 0),
+        PpcOpcode::eieio => base("eieio", String::new(), 0),
+        PpcOpcode::isync => base("isync", String::new(), 0),
+
+        // ── CR logical ──────────────────────────────────────────────────────
+        PpcOpcode::crand   => fmt_cr_logic(instr, "crand"),
+        PpcOpcode::crandc  => fmt_cr_logic(instr, "crandc"),
+        PpcOpcode::creqv   => fmt_creqv(instr),
+        PpcOpcode::crnand  => fmt_cr_logic(instr, "crnand"),
+        PpcOpcode::crnor   => fmt_crnor(instr),
+        PpcOpcode::cror    => fmt_cror(instr),
+        PpcOpcode::crorc   => fmt_cr_logic(instr, "crorc"),
+        PpcOpcode::crxor   => fmt_crxor(instr),
+
+        // ── FPU (op59 / op63) ──────────────────────────────────────────────
+        PpcOpcode::fdivsx   => fmt_a_3op(instr, "fdivs",  false),
+        PpcOpcode::fsubsx   => fmt_a_3op(instr, "fsubs",  false),
+        PpcOpcode::faddsx   => fmt_a_3op(instr, "fadds",  false),
+        PpcOpcode::fsqrtsx  => fmt_a_unary(instr, "fsqrts"),
+        PpcOpcode::fresx    => fmt_a_unary(instr, "fres"),
+        PpcOpcode::fmulsx   => fmt_a_3op(instr, "fmuls",  true),
+        PpcOpcode::fmsubsx  => fmt_a_4op(instr, "fmsubs"),
+        PpcOpcode::fmaddsx  => fmt_a_4op(instr, "fmadds"),
+        PpcOpcode::fnmsubsx => fmt_a_4op(instr, "fnmsubs"),
+        PpcOpcode::fnmaddsx => fmt_a_4op(instr, "fnmadds"),
+
+        PpcOpcode::fdivx    => fmt_a_3op(instr, "fdiv",   false),
+        PpcOpcode::fsubx    => fmt_a_3op(instr, "fsub",   false),
+        PpcOpcode::faddx    => fmt_a_3op(instr, "fadd",   false),
+        PpcOpcode::fsqrtx   => fmt_a_unary(instr, "fsqrt"),
+        PpcOpcode::fselx    => fmt_a_4op(instr, "fsel"),
+        PpcOpcode::fmulx    => fmt_a_3op(instr, "fmul",   true),
+        PpcOpcode::frsqrtex => fmt_a_unary(instr, "frsqrte"),
+        PpcOpcode::fmsubx   => fmt_a_4op(instr, "fmsub"),
+        PpcOpcode::fmaddx   => fmt_a_4op(instr, "fmadd"),
+        PpcOpcode::fnmsubx  => fmt_a_4op(instr, "fnmsub"),
+        PpcOpcode::fnmaddx  => fmt_a_4op(instr, "fnmadd"),
+
+        PpcOpcode::fcmpu    => fmt_fcmp(instr, "fcmpu"),
+        PpcOpcode::fcmpo    => fmt_fcmp(instr, "fcmpo"),
+        PpcOpcode::frspx    => fmt_x_fpu_unary(instr, "frsp"),
+        PpcOpcode::fctiwx   => fmt_x_fpu_unary(instr, "fctiw"),
+        PpcOpcode::fctiwzx  => fmt_x_fpu_unary(instr, "fctiwz"),
+        PpcOpcode::fnegx    => fmt_x_fpu_unary(instr, "fneg"),
+        PpcOpcode::fmrx     => fmt_x_fpu_unary(instr, "fmr"),
+        PpcOpcode::fnabsx   => fmt_x_fpu_unary(instr, "fnabs"),
+        PpcOpcode::fabsx    => fmt_x_fpu_unary(instr, "fabs"),
+        PpcOpcode::fctidx   => fmt_x_fpu_unary(instr, "fctid"),
+        PpcOpcode::fctidzx  => fmt_x_fpu_unary(instr, "fctidz"),
+        PpcOpcode::fcfidx   => fmt_x_fpu_unary(instr, "fcfid"),
+        PpcOpcode::mffsx    => {
+            let rc = rc_dot(instr);
+            base(&format!("mffs{rc}"), fpr(instr.rd()), 8)
+        }
+        PpcOpcode::mtfsfx   => {
+            let rc = rc_dot(instr);
+            let fxm = (instr.raw >> 17) & 0xFF;
+            let frb = (instr.raw >> 11) & 0x1F;
+            base(&format!("mtfsf{rc}"), format!("0x{fxm:02X}, {}", fpr(frb as usize)), 8)
+        }
+        PpcOpcode::mtfsb1x  => fmt_mtfsb(instr, "mtfsb1"),
+        PpcOpcode::mtfsb0x  => fmt_mtfsb(instr, "mtfsb0"),
+        PpcOpcode::mtfsfix  => {
+            let rc = rc_dot(instr);
+            let bf = instr.crfd();
+            let imm = (instr.raw >> 12) & 0xF;
+            base(&format!("mtfsfi{rc}"), format!("cr{bf}, {imm}"), 8)
+        }
+        PpcOpcode::mcrfs    => base("mcrfs", format!("cr{}, cr{}", instr.crfd(), instr.crfs()), 8),
+
+        // ── Standard VMX (5-bit registers) ────────────────────────────────
+        // 3-operand VD, VA, VB
+        PpcOpcode::vaddubm | PpcOpcode::vmaxub  | PpcOpcode::vrlb     | PpcOpcode::vmuloub  |
+        PpcOpcode::vaddfp  | PpcOpcode::vmrghb  | PpcOpcode::vpkuhum  |
+        PpcOpcode::vadduhm | PpcOpcode::vmaxuh  | PpcOpcode::vrlh     | PpcOpcode::vmulouh  |
+        PpcOpcode::vsubfp  | PpcOpcode::vmrghh  | PpcOpcode::vpkuwum  |
+        PpcOpcode::vadduwm | PpcOpcode::vmaxuw  | PpcOpcode::vrlw     | PpcOpcode::vmrghw   |
+        PpcOpcode::vpkuhus | PpcOpcode::vpkuwus |
+        PpcOpcode::vmaxsb  | PpcOpcode::vslb    | PpcOpcode::vmulosb  | PpcOpcode::vmrglb   |
+        PpcOpcode::vpkshus | PpcOpcode::vmaxsh  | PpcOpcode::vslh     | PpcOpcode::vmulosh  |
+        PpcOpcode::vmrglh  | PpcOpcode::vpkswus | PpcOpcode::vaddcuw  | PpcOpcode::vmaxsw   |
+        PpcOpcode::vslw    | PpcOpcode::vmrglw  | PpcOpcode::vpkshss  | PpcOpcode::vsl      |
+        PpcOpcode::vpkswss | PpcOpcode::vaddubs | PpcOpcode::vminub   | PpcOpcode::vsrb     |
+        PpcOpcode::vmuleub | PpcOpcode::vadduhs | PpcOpcode::vminuh   | PpcOpcode::vsrh     |
+        PpcOpcode::vmuleuh | PpcOpcode::vadduws | PpcOpcode::vminuw   | PpcOpcode::vsrw     |
+        PpcOpcode::vsr     | PpcOpcode::vaddsbs | PpcOpcode::vminsb   | PpcOpcode::vsrab    |
+        PpcOpcode::vmulesb | PpcOpcode::vpkpx   | PpcOpcode::vaddshs  | PpcOpcode::vminsh   |
+        PpcOpcode::vsrah   | PpcOpcode::vmulesh | PpcOpcode::vaddsws  | PpcOpcode::vminsw   |
+        PpcOpcode::vsraw   | PpcOpcode::vsububm | PpcOpcode::vavgub   | PpcOpcode::vand     |
+        PpcOpcode::vmaxfp  | PpcOpcode::vslo    | PpcOpcode::vsubuhm  | PpcOpcode::vavguh   |
+        PpcOpcode::vandc   | PpcOpcode::vminfp  | PpcOpcode::vsro     | PpcOpcode::vsubuwm  |
+        PpcOpcode::vavguw  | PpcOpcode::vor     | PpcOpcode::vxor     | PpcOpcode::vavgsb   |
+        PpcOpcode::vnor    | PpcOpcode::vavgsh  | PpcOpcode::vsubcuw  | PpcOpcode::vavgsw   |
+        PpcOpcode::vsububs | PpcOpcode::vsum4ubs| PpcOpcode::vsubuhs  | PpcOpcode::vsum4shs |
+        PpcOpcode::vsubuws | PpcOpcode::vsum2sws| PpcOpcode::vsubsbs  | PpcOpcode::vsum4sbs |
+        PpcOpcode::vsubshs | PpcOpcode::vsubsws | PpcOpcode::vsumsws  => {
+            fmt_vmx_3op(instr, opcode_name(instr.opcode))
+        }
+
+        // VMX unary VD, VB
+        PpcOpcode::vrefp   | PpcOpcode::vrsqrtefp | PpcOpcode::vexptefp |
+        PpcOpcode::vlogefp | PpcOpcode::vrfin     | PpcOpcode::vrfiz    |
+        PpcOpcode::vrfip   | PpcOpcode::vrfim     | PpcOpcode::vupkhsb  |
+        PpcOpcode::vupkhsh | PpcOpcode::vupklsb   | PpcOpcode::vupklsh  |
+        PpcOpcode::vupkhpx | PpcOpcode::vupklpx => {
+            fmt_vmx_unary(instr, opcode_name(instr.opcode))
+        }
+
+        // VMX VD, VB, UIMM (VA = uimm field)
+        PpcOpcode::vspltb  | PpcOpcode::vsplth | PpcOpcode::vspltw |
+        PpcOpcode::vcfux   | PpcOpcode::vcfsx  |
+        PpcOpcode::vctuxs  | PpcOpcode::vctsxs => {
+            fmt_vmx_uimm(instr, opcode_name(instr.opcode))
+        }
+
+        // VMX VD, SIMM (VA field as 5-bit signed immediate)
+        PpcOpcode::vspltisb => fmt_vmx_simm(instr, "vspltisb"),
+        PpcOpcode::vspltish => fmt_vmx_simm(instr, "vspltish"),
+        PpcOpcode::vspltisw => fmt_vmx_simm(instr, "vspltisw"),
+
+        PpcOpcode::mfvscr => base("mfvscr", vr(instr.rd()), 8),
+        PpcOpcode::mtvscr => base("mtvscr", vr(instr.rb()), 8),
+
+        // VMX compare (Rc bit at bit 21)
+        PpcOpcode::vcmpequb | PpcOpcode::vcmpequh | PpcOpcode::vcmpequw |
+        PpcOpcode::vcmpeqfp | PpcOpcode::vcmpgefp | PpcOpcode::vcmpgtub |
+        PpcOpcode::vcmpgtuh | PpcOpcode::vcmpgtuw | PpcOpcode::vcmpgtfp |
+        PpcOpcode::vcmpgtsb | PpcOpcode::vcmpgtsh | PpcOpcode::vcmpgtsw |
+        PpcOpcode::vcmpbfp => fmt_vmx_cmp(instr, opcode_name(instr.opcode)),
+
+        // VMX 4-operand VD, VA, VB, VC
+        PpcOpcode::vmhaddshs | PpcOpcode::vmhraddshs | PpcOpcode::vmladduhm |
+        PpcOpcode::vmsumubm  | PpcOpcode::vmsummbm   | PpcOpcode::vmsumuhm  |
+        PpcOpcode::vmsumuhs  | PpcOpcode::vmsumshm   | PpcOpcode::vmsumshs  |
+        PpcOpcode::vsel      | PpcOpcode::vperm => {
+            fmt_vmx_4op(instr, opcode_name(instr.opcode))
+        }
+
+        PpcOpcode::vsldoi   => fmt_vsldoi(instr),
+        PpcOpcode::vmaddfp  => fmt_vmx_4op_swap(instr, "vmaddfp"),
+        PpcOpcode::vnmsubfp => fmt_vmx_4op_swap(instr, "vnmsubfp"),
+
+        // ── VMX128 load/store (uses GPR addressing + vd128 dest) ───────────
+        PpcOpcode::lvsl128   => fmt_vmx128_ls(instr, "lvsl128"),
+        PpcOpcode::lvsr128   => fmt_vmx128_ls(instr, "lvsr128"),
+        PpcOpcode::lvewx128  => fmt_vmx128_ls(instr, "lvewx128"),
+        PpcOpcode::lvx128    => fmt_vmx128_ls(instr, "lvx128"),
+        PpcOpcode::lvxl128   => fmt_vmx128_ls(instr, "lvxl128"),
+        PpcOpcode::lvlx128   => fmt_vmx128_ls(instr, "lvlx128"),
+        PpcOpcode::lvrx128   => fmt_vmx128_ls(instr, "lvrx128"),
+        PpcOpcode::lvlxl128  => fmt_vmx128_ls(instr, "lvlxl128"),
+        PpcOpcode::lvrxl128  => fmt_vmx128_ls(instr, "lvrxl128"),
+        PpcOpcode::stvewx128 => fmt_vmx128_ls(instr, "stvewx128"),
+        PpcOpcode::stvx128   => fmt_vmx128_ls(instr, "stvx128"),
+        PpcOpcode::stvxl128  => fmt_vmx128_ls(instr, "stvxl128"),
+        PpcOpcode::stvlx128  => fmt_vmx128_ls(instr, "stvlx128"),
+        PpcOpcode::stvrx128  => fmt_vmx128_ls(instr, "stvrx128"),
+        PpcOpcode::stvlxl128 => fmt_vmx128_ls(instr, "stvlxl128"),
+        PpcOpcode::stvrxl128 => fmt_vmx128_ls(instr, "stvrxl128"),
+
+        // Standard AltiVec load/store indexed (5-bit vr0-vr31)
+        PpcOpcode::lvsl    => fmt_vmx_ls(instr, "lvsl"),
+        PpcOpcode::lvsr    => fmt_vmx_ls(instr, "lvsr"),
+        PpcOpcode::lvebx   => fmt_vmx_ls(instr, "lvebx"),
+        PpcOpcode::lvehx   => fmt_vmx_ls(instr, "lvehx"),
+        PpcOpcode::lvewx   => fmt_vmx_ls(instr, "lvewx"),
+        PpcOpcode::lvx     => fmt_vmx_ls(instr, "lvx"),
+        PpcOpcode::lvxl    => fmt_vmx_ls(instr, "lvxl"),
+        PpcOpcode::lvlx    => fmt_vmx_ls(instr, "lvlx"),
+        PpcOpcode::lvrx    => fmt_vmx_ls(instr, "lvrx"),
+        PpcOpcode::lvlxl   => fmt_vmx_ls(instr, "lvlxl"),
+        PpcOpcode::lvrxl   => fmt_vmx_ls(instr, "lvrxl"),
+        PpcOpcode::stvebx  => fmt_vmx_ls(instr, "stvebx"),
+        PpcOpcode::stvehx  => fmt_vmx_ls(instr, "stvehx"),
+        PpcOpcode::stvewx  => fmt_vmx_ls(instr, "stvewx"),
+        PpcOpcode::stvx    => fmt_vmx_ls(instr, "stvx"),
+        PpcOpcode::stvxl   => fmt_vmx_ls(instr, "stvxl"),
+        PpcOpcode::stvlx   => fmt_vmx_ls(instr, "stvlx"),
+        PpcOpcode::stvrx   => fmt_vmx_ls(instr, "stvrx"),
+        PpcOpcode::stvlxl  => fmt_vmx_ls(instr, "stvlxl"),
+        PpcOpcode::stvrxl  => fmt_vmx_ls(instr, "stvrxl"),
+
+        // ── VMX128 op5 (3-op and 4-op fp/pack/logic) ───────────────────────
+        PpcOpcode::vaddfp128   => fmt_vmx128_3op(instr, "vaddfp128"),
+        PpcOpcode::vsubfp128   => fmt_vmx128_3op(instr, "vsubfp128"),
+        PpcOpcode::vmulfp128   => fmt_vmx128_3op(instr, "vmulfp128"),
+        PpcOpcode::vmsum3fp128 => fmt_vmx128_3op(instr, "vmsum3fp128"),
+        PpcOpcode::vmsum4fp128 => fmt_vmx128_3op(instr, "vmsum4fp128"),
+        PpcOpcode::vpkshss128  => fmt_vmx128_3op(instr, "vpkshss128"),
+        PpcOpcode::vpkshus128  => fmt_vmx128_3op(instr, "vpkshus128"),
+        PpcOpcode::vpkswss128  => fmt_vmx128_3op(instr, "vpkswss128"),
+        PpcOpcode::vpkswus128  => fmt_vmx128_3op(instr, "vpkswus128"),
+        PpcOpcode::vpkuhum128  => fmt_vmx128_3op(instr, "vpkuhum128"),
+        PpcOpcode::vpkuhus128  => fmt_vmx128_3op(instr, "vpkuhus128"),
+        PpcOpcode::vpkuwum128  => fmt_vmx128_3op(instr, "vpkuwum128"),
+        PpcOpcode::vpkuwus128  => fmt_vmx128_3op(instr, "vpkuwus128"),
+        PpcOpcode::vand128     => fmt_vmx128_3op(instr, "vand128"),
+        PpcOpcode::vandc128    => fmt_vmx128_3op(instr, "vandc128"),
+        PpcOpcode::vnor128     => fmt_vmx128_3op(instr, "vnor128"),
+        PpcOpcode::vor128      => fmt_vmx128_3op(instr, "vor128"),
+        PpcOpcode::vxor128     => fmt_vmx128_3op(instr, "vxor128"),
+        PpcOpcode::vsel128     => fmt_vmx128_3op(instr, "vsel128"),
+        PpcOpcode::vslo128     => fmt_vmx128_3op(instr, "vslo128"),
+        PpcOpcode::vsro128     => fmt_vmx128_3op(instr, "vsro128"),
+
+        PpcOpcode::vmaddfp128   => fmt_vmaddfp128(instr),
+        PpcOpcode::vmaddcfp128  => fmt_vmx128_madd_vd_vb(instr, "vmaddcfp128"),
+        PpcOpcode::vnmsubfp128  => fmt_vmx128_madd_vd_vb(instr, "vnmsubfp128"),
+
+        PpcOpcode::vperm128    => fmt_vperm128(instr),
+        PpcOpcode::vsldoi128   => fmt_vsldoi128(instr),
+        PpcOpcode::vpermwi128  => fmt_vpermwi128(instr),
+
+        // ── VMX128 op6 special ─────────────────────────────────────────────
+        PpcOpcode::vpkd3d128   => fmt_vmx128_pack_d3d(instr, "vpkd3d128"),
+        PpcOpcode::vrlimi128   => fmt_vmx128_pack_d3d(instr, "vrlimi128"),
+        PpcOpcode::vrfim128    => fmt_vmx128_unary(instr, "vrfim128"),
+        PpcOpcode::vrfin128    => fmt_vmx128_unary(instr, "vrfin128"),
+        PpcOpcode::vrfip128    => fmt_vmx128_unary(instr, "vrfip128"),
+        PpcOpcode::vrfiz128    => fmt_vmx128_unary(instr, "vrfiz128"),
+        PpcOpcode::vrefp128    => fmt_vmx128_unary(instr, "vrefp128"),
+        PpcOpcode::vrsqrtefp128 => fmt_vmx128_unary(instr, "vrsqrtefp128"),
+        PpcOpcode::vexptefp128 => fmt_vmx128_unary(instr, "vexptefp128"),
+        PpcOpcode::vlogefp128  => fmt_vmx128_unary(instr, "vlogefp128"),
+        PpcOpcode::vcfpsxws128 => fmt_vmx128_uimm(instr, "vcfpsxws128"),
+        PpcOpcode::vcfpuxws128 => fmt_vmx128_uimm(instr, "vcfpuxws128"),
+        PpcOpcode::vcsxwfp128  => fmt_vmx128_uimm(instr, "vcsxwfp128"),
+        PpcOpcode::vcuxwfp128  => fmt_vmx128_uimm(instr, "vcuxwfp128"),
+        PpcOpcode::vspltw128   => fmt_vmx128_uimm(instr, "vspltw128"),
+        PpcOpcode::vupkd3d128  => fmt_vmx128_uimm(instr, "vupkd3d128"),
+        PpcOpcode::vspltisw128 => {
+            let vd = instr.vd128();
+            let simm = sign_ext(extract_vx128_uimm5(instr.raw), 5);
+            base("vspltisw128", format!("{}, {simm}", vr(vd)), 14)
+        }
+        PpcOpcode::vcmpeqfp128 => fmt_vmx128_cmp(instr, "vcmpeqfp128"),
+        PpcOpcode::vcmpgefp128 => fmt_vmx128_cmp(instr, "vcmpgefp128"),
+        PpcOpcode::vcmpgtfp128 => fmt_vmx128_cmp(instr, "vcmpgtfp128"),
+        PpcOpcode::vcmpbfp128  => fmt_vmx128_cmp(instr, "vcmpbfp128"),
+        PpcOpcode::vcmpequw128 => fmt_vmx128_cmp(instr, "vcmpequw128"),
+        PpcOpcode::vrlw128     => fmt_vmx128_3op(instr, "vrlw128"),
+        PpcOpcode::vslw128     => fmt_vmx128_3op(instr, "vslw128"),
+        PpcOpcode::vsraw128    => fmt_vmx128_3op(instr, "vsraw128"),
+        PpcOpcode::vsrw128     => fmt_vmx128_3op(instr, "vsrw128"),
+        PpcOpcode::vmaxfp128   => fmt_vmx128_3op(instr, "vmaxfp128"),
+        PpcOpcode::vminfp128   => fmt_vmx128_3op(instr, "vminfp128"),
+        PpcOpcode::vmrghw128   => fmt_vmx128_3op(instr, "vmrghw128"),
+        PpcOpcode::vmrglw128   => fmt_vmx128_3op(instr, "vmrglw128"),
+        PpcOpcode::vupkhsb128  => fmt_vmx128_3op(instr, "vupkhsb128"),
+        PpcOpcode::vupklsb128  => fmt_vmx128_3op(instr, "vupklsb128"),
+
+        PpcOpcode::Invalid => long_word(instr.raw),
+    }
+}
 
 /// Disassemble a decoded instruction into PPC assembly text.
+///
+/// Back-compat entry point: returns the same single-string the legacy
+/// formatter produced, preferring the extended form when present.
 pub fn disassemble(instr: &DecodedInstr) -> String {
-    let mut out = String::new();
-    match instr.opcode {
-        // Branch instructions
-        PpcOpcode::bx => {
-            let target = if instr.aa() {
-                instr.li() as u32
-            } else {
-                instr.addr.wrapping_add(instr.li() as u32)
-            };
-            let mnemonic = if instr.lk() { "bl" } else { "b" };
-            write!(out, "{} 0x{:08X}", mnemonic, target).unwrap();
-        }
-        PpcOpcode::bcx => {
-            let bo = instr.bo();
-            let bi = instr.bi();
-            let target = if instr.aa() {
-                instr.bd() as u32
-            } else {
-                instr.addr.wrapping_add(instr.bd() as u32)
-            };
-            let mnemonic = if instr.lk() { "bcl" } else { "bc" };
-            write!(out, "{} {},{},0x{:08X}", mnemonic, bo, bi, target).unwrap();
-        }
-        PpcOpcode::bclrx => {
-            let mnemonic = if instr.lk() { "bclrl" } else { "bclr" };
-            write!(out, "{} {},{}", mnemonic, instr.bo(), instr.bi()).unwrap();
-        }
-        PpcOpcode::bcctrx => {
-            let mnemonic = if instr.lk() { "bcctrl" } else { "bcctr" };
-            write!(out, "{} {},{}", mnemonic, instr.bo(), instr.bi()).unwrap();
-        }
-
-        // System call
-        PpcOpcode::sc => {
-            write!(out, "sc").unwrap();
-        }
-
-        // D-form load/store
-        PpcOpcode::lwz | PpcOpcode::lwzu | PpcOpcode::lbz | PpcOpcode::lbzu |
-        PpcOpcode::lhz | PpcOpcode::lhzu | PpcOpcode::lha | PpcOpcode::lhau |
-        PpcOpcode::lfs | PpcOpcode::lfsu | PpcOpcode::lfd | PpcOpcode::lfdu => {
-            write!(out, "{:?} r{},{}(r{})", instr.opcode, instr.rd(), instr.d(), instr.ra()).unwrap();
-        }
-        PpcOpcode::stw | PpcOpcode::stwu | PpcOpcode::stb | PpcOpcode::stbu |
-        PpcOpcode::sth | PpcOpcode::sthu |
-        PpcOpcode::stfs | PpcOpcode::stfsu | PpcOpcode::stfd | PpcOpcode::stfdu => {
-            write!(out, "{:?} r{},{}(r{})", instr.opcode, instr.rs(), instr.d(), instr.ra()).unwrap();
-        }
-
-        // D-form immediate ALU
-        PpcOpcode::addi | PpcOpcode::addis | PpcOpcode::addic | PpcOpcode::addicx |
-        PpcOpcode::subficx | PpcOpcode::mulli => {
-            write!(out, "{:?} r{},r{},{}", instr.opcode, instr.rd(), instr.ra(), instr.simm16()).unwrap();
-        }
-
-        // D-form immediate logical
-        PpcOpcode::ori | PpcOpcode::oris | PpcOpcode::xori | PpcOpcode::xoris |
-        PpcOpcode::andix | PpcOpcode::andisx => {
-            write!(out, "{:?} r{},r{},0x{:04X}", instr.opcode, instr.ra(), instr.rs(), instr.uimm16()).unwrap();
-        }
-
-        // Compare
-        PpcOpcode::cmpi => {
-            write!(out, "cmp{}i cr{},r{},{}", if instr.l() { "d" } else { "w" },
-                instr.crfd(), instr.ra(), instr.simm16()).unwrap();
-        }
-        PpcOpcode::cmpli => {
-            write!(out, "cmpl{}i cr{},r{},0x{:04X}", if instr.l() { "d" } else { "w" },
-                instr.crfd(), instr.ra(), instr.uimm16()).unwrap();
-        }
-        PpcOpcode::cmp => {
-            write!(out, "cmp{} cr{},r{},r{}", if instr.l() { "d" } else { "w" },
-                instr.crfd(), instr.ra(), instr.rb()).unwrap();
-        }
-        PpcOpcode::cmpl => {
-            write!(out, "cmpl{} cr{},r{},r{}", if instr.l() { "d" } else { "w" },
-                instr.crfd(), instr.ra(), instr.rb()).unwrap();
-        }
-
-        // X-form ALU (3-register)
-        PpcOpcode::addx | PpcOpcode::addcx | PpcOpcode::addex | PpcOpcode::addzex |
-        PpcOpcode::addmex | PpcOpcode::subfx | PpcOpcode::subfcx | PpcOpcode::subfex |
-        PpcOpcode::subfzex | PpcOpcode::subfmex | PpcOpcode::negx |
-        PpcOpcode::mullwx | PpcOpcode::mulhwx | PpcOpcode::mulhwux |
-        PpcOpcode::divwx | PpcOpcode::divwux |
-        PpcOpcode::mulldx | PpcOpcode::mulhdx | PpcOpcode::mulhdux |
-        PpcOpcode::divdx | PpcOpcode::divdux => {
-            write!(out, "{:?} r{},r{},r{}", instr.opcode, instr.rd(), instr.ra(), instr.rb()).unwrap();
-        }
-
-        // X-form logical
-        PpcOpcode::andx | PpcOpcode::andcx | PpcOpcode::orx | PpcOpcode::orcx |
-        PpcOpcode::xorx | PpcOpcode::norx | PpcOpcode::nandx | PpcOpcode::eqvx => {
-            write!(out, "{:?} r{},r{},r{}", instr.opcode, instr.ra(), instr.rs(), instr.rb()).unwrap();
-        }
-
-        // Shift/rotate
-        PpcOpcode::slwx | PpcOpcode::srwx | PpcOpcode::srawx | PpcOpcode::sldx |
-        PpcOpcode::srdx | PpcOpcode::sradx => {
-            write!(out, "{:?} r{},r{},r{}", instr.opcode, instr.ra(), instr.rs(), instr.rb()).unwrap();
-        }
-        PpcOpcode::srawix => {
-            write!(out, "srawi r{},r{},{}", instr.ra(), instr.rs(), instr.sh()).unwrap();
-        }
-        PpcOpcode::sradix => {
-            write!(out, "sradi r{},r{},{}", instr.ra(), instr.rs(), instr.sh64()).unwrap();
-        }
-
-        // Rotate
-        PpcOpcode::rlwinmx => {
-            write!(out, "rlwinm r{},r{},{},{},{}", instr.ra(), instr.rs(), instr.sh(), instr.mb(), instr.me()).unwrap();
-        }
-        PpcOpcode::rlwimix => {
-            write!(out, "rlwimi r{},r{},{},{},{}", instr.ra(), instr.rs(), instr.sh(), instr.mb(), instr.me()).unwrap();
-        }
-        PpcOpcode::rlwnmx => {
-            write!(out, "rlwnm r{},r{},r{},{},{}", instr.ra(), instr.rs(), instr.rb(), instr.mb(), instr.me()).unwrap();
-        }
-
-        // Special register moves
-        PpcOpcode::mfspr => {
-            let spr_name = match instr.spr() {
-                1 => "xer",
-                8 => "lr",
-                9 => "ctr",
-                268 => "tbl",
-                269 => "tbu",
-                _ => "",
-            };
-            if spr_name.is_empty() {
-                write!(out, "mfspr r{},{}", instr.rd(), instr.spr()).unwrap();
-            } else {
-                write!(out, "mf{} r{}", spr_name, instr.rd()).unwrap();
-            }
-        }
-        PpcOpcode::mtspr => {
-            let spr_name = match instr.spr() {
-                1 => "xer",
-                8 => "lr",
-                9 => "ctr",
-                _ => "",
-            };
-            if spr_name.is_empty() {
-                write!(out, "mtspr {},r{}", instr.spr(), instr.rs()).unwrap();
-            } else {
-                write!(out, "mt{} r{}", spr_name, instr.rs()).unwrap();
-            }
-        }
-        PpcOpcode::mfcr => {
-            write!(out, "mfcr r{}", instr.rd()).unwrap();
-        }
-        PpcOpcode::mtcrf => {
-            write!(out, "mtcrf 0x{:02X},r{}", instr.crm(), instr.rs()).unwrap();
-        }
-
-        // Extend
-        PpcOpcode::extsbx => write!(out, "extsb r{},r{}", instr.ra(), instr.rs()).unwrap(),
-        PpcOpcode::extshx => write!(out, "extsh r{},r{}", instr.ra(), instr.rs()).unwrap(),
-        PpcOpcode::extswx => write!(out, "extsw r{},r{}", instr.ra(), instr.rs()).unwrap(),
-        PpcOpcode::cntlzwx => write!(out, "cntlzw r{},r{}", instr.ra(), instr.rs()).unwrap(),
-        PpcOpcode::cntlzdx => write!(out, "cntlzd r{},r{}", instr.ra(), instr.rs()).unwrap(),
-
-        // X-form load/store
-        PpcOpcode::lwzx | PpcOpcode::lwzux | PpcOpcode::lbzx | PpcOpcode::lbzux |
-        PpcOpcode::lhzx | PpcOpcode::lhzux | PpcOpcode::lhax | PpcOpcode::lhaux |
-        PpcOpcode::lwax | PpcOpcode::lwaux | PpcOpcode::ldx | PpcOpcode::ldux |
-        PpcOpcode::lfsx | PpcOpcode::lfsux | PpcOpcode::lfdx | PpcOpcode::lfdux |
-        PpcOpcode::lwbrx | PpcOpcode::lhbrx | PpcOpcode::ldbrx |
-        PpcOpcode::lwarx | PpcOpcode::ldarx => {
-            write!(out, "{:?} r{},r{},r{}", instr.opcode, instr.rd(), instr.ra(), instr.rb()).unwrap();
-        }
-        PpcOpcode::stwx | PpcOpcode::stwux | PpcOpcode::stbx | PpcOpcode::stbux |
-        PpcOpcode::sthx | PpcOpcode::sthux | PpcOpcode::stdx | PpcOpcode::stdux |
-        PpcOpcode::stfsx | PpcOpcode::stfsux | PpcOpcode::stfdx | PpcOpcode::stfdux |
-        PpcOpcode::stwbrx | PpcOpcode::sthbrx | PpcOpcode::stdbrx |
-        PpcOpcode::stwcx | PpcOpcode::stdcx | PpcOpcode::stfiwx => {
-            write!(out, "{:?} r{},r{},r{}", instr.opcode, instr.rs(), instr.ra(), instr.rb()).unwrap();
-        }
-
-        // Cache/sync ops (no-ops for interpreter)
-        PpcOpcode::dcbf | PpcOpcode::dcbi | PpcOpcode::dcbst |
-        PpcOpcode::dcbt | PpcOpcode::dcbtst | PpcOpcode::icbi => {
-            write!(out, "{:?} r{},r{}", instr.opcode, instr.ra(), instr.rb()).unwrap();
-        }
-        PpcOpcode::dcbz | PpcOpcode::dcbz128 => {
-            write!(out, "{:?} r{},r{}", instr.opcode, instr.ra(), instr.rb()).unwrap();
-        }
-        PpcOpcode::sync | PpcOpcode::eieio | PpcOpcode::isync => {
-            write!(out, "{:?}", instr.opcode).unwrap();
-        }
-
-        // Load/store multiple
-        PpcOpcode::lmw => write!(out, "lmw r{},{}(r{})", instr.rd(), instr.d(), instr.ra()).unwrap(),
-        PpcOpcode::stmw => write!(out, "stmw r{},{}(r{})", instr.rs(), instr.d(), instr.ra()).unwrap(),
-
-        // DS-form loads/stores
-        PpcOpcode::ld | PpcOpcode::ldu | PpcOpcode::lwa => {
-            write!(out, "{:?} r{},{}(r{})", instr.opcode, instr.rd(), instr.ds(), instr.ra()).unwrap();
-        }
-        PpcOpcode::std | PpcOpcode::stdu => {
-            write!(out, "{:?} r{},{}(r{})", instr.opcode, instr.rs(), instr.ds(), instr.ra()).unwrap();
-        }
-
-        // CR logical ops
-        PpcOpcode::crand | PpcOpcode::crandc | PpcOpcode::creqv | PpcOpcode::crnand |
-        PpcOpcode::crnor | PpcOpcode::cror | PpcOpcode::crorc | PpcOpcode::crxor => {
-            write!(out, "{:?} {},{},{}", instr.opcode, instr.crbd(), instr.crba(), instr.crbb()).unwrap();
-        }
-        PpcOpcode::mcrf => {
-            write!(out, "mcrf cr{},cr{}", instr.crfd(), instr.crfs()).unwrap();
-        }
-
-        // Trap
-        PpcOpcode::tdi => write!(out, "tdi {},r{},{}", instr.rd(), instr.ra(), instr.simm16()).unwrap(),
-        PpcOpcode::twi => write!(out, "twi {},r{},{}", instr.rd(), instr.ra(), instr.simm16()).unwrap(),
-        PpcOpcode::td => write!(out, "td {},r{},r{}", instr.rd(), instr.ra(), instr.rb()).unwrap(),
-        PpcOpcode::tw => write!(out, "tw {},r{},r{}", instr.rd(), instr.ra(), instr.rb()).unwrap(),
-
-        // Default: just print opcode and raw hex
-        _ => {
-            write!(out, "{:?} [{:08X}]", instr.opcode, instr.raw).unwrap();
-        }
-    }
-    out
+    format(instr).display().to_string()
 }
 
 /// Disassemble a range of instructions from a byte slice.
@@ -239,38 +633,1235 @@ pub fn disassemble_block(data: &[u8], base_addr: u32, count: usize) -> Vec<(u32,
             break;
         }
         let raw = u32::from_be_bytes([
-            data[offset],
-            data[offset + 1],
-            data[offset + 2],
-            data[offset + 3],
+            data[offset], data[offset + 1], data[offset + 2], data[offset + 3],
         ]);
         let addr = base_addr + offset as u32;
-        let instr = crate::decode(raw, addr);
+        let instr = crate::decoder::decode(raw, addr);
         let text = disassemble(&instr);
         result.push((addr, text));
     }
     result
 }
 
+/// One yielded instruction from [`iter_disasm`]. Carries the absolute VA,
+/// raw word, decoded opcode and the formatted text — everything a sink
+/// needs to render or persist a single row without re-parsing.
+#[derive(Debug, Clone)]
+pub struct DisasmItem {
+    pub addr: u32,
+    pub raw: u32,
+    pub opcode: PpcOpcode,
+    pub text: DisasmText,
+}
+
+/// Iterate over instructions in the VA range `[va_start, va_end)` of an
+/// image-mapped byte slice. `image[rva]` must hold the byte at absolute VA
+/// `image_base + rva` (the layout produced by [`xenia_xex::loader`]).
+///
+/// Stops on a truncated tail (less than 4 bytes remaining at the cursor).
+/// Yields nothing if `va_start >= va_end` or the start RVA is beyond the
+/// image.
+pub fn iter_disasm(
+    image: &[u8],
+    image_base: u32,
+    va_start: u32,
+    va_end: u32,
+) -> impl Iterator<Item = DisasmItem> + '_ {
+    DisasmIter { image, image_base, va: va_start, end: va_end }
+}
+
+struct DisasmIter<'a> {
+    image: &'a [u8],
+    image_base: u32,
+    va: u32,
+    end: u32,
+}
+
+impl Iterator for DisasmIter<'_> {
+    type Item = DisasmItem;
+    #[inline]
+    fn next(&mut self) -> Option<DisasmItem> {
+        if self.va >= self.end {
+            return None;
+        }
+        let rva = self.va.wrapping_sub(self.image_base) as usize;
+        if rva + 4 > self.image.len() {
+            return None;
+        }
+        let raw = u32::from_be_bytes([
+            self.image[rva],
+            self.image[rva + 1],
+            self.image[rva + 2],
+            self.image[rva + 3],
+        ]);
+        let abs = self.va;
+        let decoded = crate::decoder::decode(raw, abs);
+        let text = format(&decoded);
+        self.va = self.va.wrapping_add(4);
+        Some(DisasmItem { addr: abs, raw, opcode: decoded.opcode, text })
+    }
+}
+
+// ── Per-class formatters ───────────────────────────────────────────────────
+
+fn opcode_name(op: PpcOpcode) -> &'static str {
+    // Used for VMX where the enum variant name matches the canonical mnemonic.
+    // For ALU/FPU variants ending in "x", use hardcoded strings instead.
+    match op {
+        PpcOpcode::vaddubm => "vaddubm", PpcOpcode::vmaxub => "vmaxub", PpcOpcode::vrlb => "vrlb",
+        PpcOpcode::vmuloub => "vmuloub", PpcOpcode::vaddfp => "vaddfp", PpcOpcode::vmrghb => "vmrghb",
+        PpcOpcode::vpkuhum => "vpkuhum", PpcOpcode::vadduhm => "vadduhm", PpcOpcode::vmaxuh => "vmaxuh",
+        PpcOpcode::vrlh => "vrlh", PpcOpcode::vmulouh => "vmulouh", PpcOpcode::vsubfp => "vsubfp",
+        PpcOpcode::vmrghh => "vmrghh", PpcOpcode::vpkuwum => "vpkuwum",
+        PpcOpcode::vadduwm => "vadduwm", PpcOpcode::vmaxuw => "vmaxuw", PpcOpcode::vrlw => "vrlw",
+        PpcOpcode::vmrghw => "vmrghw", PpcOpcode::vpkuhus => "vpkuhus", PpcOpcode::vpkuwus => "vpkuwus",
+        PpcOpcode::vmaxsb => "vmaxsb", PpcOpcode::vslb => "vslb", PpcOpcode::vmulosb => "vmulosb",
+        PpcOpcode::vmrglb => "vmrglb", PpcOpcode::vpkshus => "vpkshus", PpcOpcode::vmaxsh => "vmaxsh",
+        PpcOpcode::vslh => "vslh", PpcOpcode::vmulosh => "vmulosh", PpcOpcode::vmrglh => "vmrglh",
+        PpcOpcode::vpkswus => "vpkswus", PpcOpcode::vaddcuw => "vaddcuw", PpcOpcode::vmaxsw => "vmaxsw",
+        PpcOpcode::vslw => "vslw", PpcOpcode::vmrglw => "vmrglw", PpcOpcode::vpkshss => "vpkshss",
+        PpcOpcode::vsl => "vsl", PpcOpcode::vpkswss => "vpkswss",
+        PpcOpcode::vaddubs => "vaddubs", PpcOpcode::vminub => "vminub", PpcOpcode::vsrb => "vsrb",
+        PpcOpcode::vmuleub => "vmuleub", PpcOpcode::vadduhs => "vadduhs", PpcOpcode::vminuh => "vminuh",
+        PpcOpcode::vsrh => "vsrh", PpcOpcode::vmuleuh => "vmuleuh",
+        PpcOpcode::vadduws => "vadduws", PpcOpcode::vminuw => "vminuw", PpcOpcode::vsrw => "vsrw",
+        PpcOpcode::vsr => "vsr",
+        PpcOpcode::vaddsbs => "vaddsbs", PpcOpcode::vminsb => "vminsb", PpcOpcode::vsrab => "vsrab",
+        PpcOpcode::vmulesb => "vmulesb", PpcOpcode::vpkpx => "vpkpx",
+        PpcOpcode::vaddshs => "vaddshs", PpcOpcode::vminsh => "vminsh", PpcOpcode::vsrah => "vsrah",
+        PpcOpcode::vmulesh => "vmulesh",
+        PpcOpcode::vaddsws => "vaddsws", PpcOpcode::vminsw => "vminsw", PpcOpcode::vsraw => "vsraw",
+        PpcOpcode::vsububm => "vsububm", PpcOpcode::vavgub => "vavgub", PpcOpcode::vand => "vand",
+        PpcOpcode::vmaxfp => "vmaxfp", PpcOpcode::vslo => "vslo",
+        PpcOpcode::vsubuhm => "vsubuhm", PpcOpcode::vavguh => "vavguh", PpcOpcode::vandc => "vandc",
+        PpcOpcode::vminfp => "vminfp", PpcOpcode::vsro => "vsro",
+        PpcOpcode::vsubuwm => "vsubuwm", PpcOpcode::vavguw => "vavguw", PpcOpcode::vor => "vor",
+        PpcOpcode::vxor => "vxor", PpcOpcode::vavgsb => "vavgsb", PpcOpcode::vnor => "vnor",
+        PpcOpcode::vavgsh => "vavgsh", PpcOpcode::vsubcuw => "vsubcuw", PpcOpcode::vavgsw => "vavgsw",
+        PpcOpcode::vsububs => "vsububs", PpcOpcode::vsum4ubs => "vsum4ubs",
+        PpcOpcode::vsubuhs => "vsubuhs", PpcOpcode::vsum4shs => "vsum4shs",
+        PpcOpcode::vsubuws => "vsubuws", PpcOpcode::vsum2sws => "vsum2sws",
+        PpcOpcode::vsubsbs => "vsubsbs", PpcOpcode::vsum4sbs => "vsum4sbs",
+        PpcOpcode::vsubshs => "vsubshs", PpcOpcode::vsubsws => "vsubsws",
+        PpcOpcode::vsumsws => "vsumsws",
+
+        PpcOpcode::vrefp => "vrefp", PpcOpcode::vrsqrtefp => "vrsqrtefp",
+        PpcOpcode::vexptefp => "vexptefp", PpcOpcode::vlogefp => "vlogefp",
+        PpcOpcode::vrfin => "vrfin", PpcOpcode::vrfiz => "vrfiz",
+        PpcOpcode::vrfip => "vrfip", PpcOpcode::vrfim => "vrfim",
+        PpcOpcode::vupkhsb => "vupkhsb", PpcOpcode::vupkhsh => "vupkhsh",
+        PpcOpcode::vupklsb => "vupklsb", PpcOpcode::vupklsh => "vupklsh",
+        PpcOpcode::vupkhpx => "vupkhpx", PpcOpcode::vupklpx => "vupklpx",
+
+        PpcOpcode::vspltb => "vspltb", PpcOpcode::vsplth => "vsplth", PpcOpcode::vspltw => "vspltw",
+        PpcOpcode::vcfux => "vcfux", PpcOpcode::vcfsx => "vcfsx",
+        PpcOpcode::vctuxs => "vctuxs", PpcOpcode::vctsxs => "vctsxs",
+
+        PpcOpcode::vcmpequb => "vcmpequb", PpcOpcode::vcmpequh => "vcmpequh",
+        PpcOpcode::vcmpequw => "vcmpequw", PpcOpcode::vcmpeqfp => "vcmpeqfp",
+        PpcOpcode::vcmpgefp => "vcmpgefp", PpcOpcode::vcmpgtub => "vcmpgtub",
+        PpcOpcode::vcmpgtuh => "vcmpgtuh", PpcOpcode::vcmpgtuw => "vcmpgtuw",
+        PpcOpcode::vcmpgtfp => "vcmpgtfp", PpcOpcode::vcmpgtsb => "vcmpgtsb",
+        PpcOpcode::vcmpgtsh => "vcmpgtsh", PpcOpcode::vcmpgtsw => "vcmpgtsw",
+        PpcOpcode::vcmpbfp => "vcmpbfp",
+
+        PpcOpcode::vmhaddshs => "vmhaddshs", PpcOpcode::vmhraddshs => "vmhraddshs",
+        PpcOpcode::vmladduhm => "vmladduhm",
+        PpcOpcode::vmsumubm => "vmsumubm", PpcOpcode::vmsummbm => "vmsummbm",
+        PpcOpcode::vmsumuhm => "vmsumuhm", PpcOpcode::vmsumuhs => "vmsumuhs",
+        PpcOpcode::vmsumshm => "vmsumshm", PpcOpcode::vmsumshs => "vmsumshs",
+        PpcOpcode::vsel => "vsel", PpcOpcode::vperm => "vperm",
+        _ => "?",
+    }
+}
+
+// Branches (I-form: b/bl/ba/bla) — produces base + extended forms.
+fn fmt_b(instr: &DecodedInstr) -> DisasmText {
+    let aa = instr.aa();
+    let lk = instr.lk();
+    let target = if aa { instr.li() as u32 }
+                 else  { instr.addr.wrapping_add(instr.li() as u32) };
+    let mnem = match (aa, lk) {
+        (false, false) => "b",
+        (false, true)  => "bl",
+        (true,  false) => "ba",
+        (true,  true)  => "bla",
+    };
+    let ops = format!("0x{target:08X}");
+    with_target(base(mnem, ops, 8), target)
+}
+
+fn fmt_bc(instr: &DecodedInstr) -> DisasmText {
+    let bo = instr.bo();
+    let bi = instr.bi();
+    let aa = instr.aa();
+    let lk = instr.lk();
+    let target = if aa { instr.bd() as u32 }
+                 else  { instr.addr.wrapping_add(instr.bd() as u32) };
+
+    let a = if aa { "a" } else { "" };
+    let l = if lk { "l" } else { "" };
+    let base_mnem = format!("bc{a}{l}");
+    let base_ops = format!("{bo}, {}, 0x{target:08X}", crb(bi));
+
+    // Extended forms.
+    let cr_field = bi / 4;
+    let cr_bit = bi % 4;
+    let decr = bo & 0x04 == 0;
+    let uncond = bo & 0x10 != 0;
+
+    let result = if uncond && !decr {
+        // Unconditional branch.
+        let ext_mnem = format!("b{a}{l}");
+        let ext_ops = format!("0x{target:08X}");
+        with_ext(&base_mnem, base_ops, 8, &ext_mnem, ext_ops, 8)
+    } else {
+        let cond_true = bo & 0x08 != 0;
+        let cond_name_opt: Option<&'static str> = match (cr_bit, cond_true) {
+            (0, true) => Some("lt"), (0, false) => Some("ge"),
+            (1, true) => Some("gt"), (1, false) => Some("le"),
+            (2, true) => Some("eq"), (2, false) => Some("ne"),
+            (3, true) => Some("so"), (3, false) => Some("ns"),
+            _ => None,
+        };
+        let cr = if cr_field == 0 { String::new() } else { format!("cr{cr_field}, ") };
+
+        if decr {
+            let z = if bo & 0x02 != 0 { "z" } else { "nz" };
+            let cond_str = cond_name_opt.unwrap_or("");
+            let ext_mnem = format!("bd{z}{cond_str}{a}{l}");
+            let ext_ops = format!("{cr}0x{target:08X}");
+            with_ext(&base_mnem, base_ops, 8, &ext_mnem, ext_ops, 8)
+        } else if let Some(cond_name) = cond_name_opt {
+            let ext_mnem = format!("b{cond_name}{a}{l}");
+            let ext_ops = format!("{cr}0x{target:08X}");
+            with_ext(&base_mnem, base_ops, 8, &ext_mnem, ext_ops, 8)
+        } else {
+            base(&base_mnem, base_ops, 8)
+        }
+    };
+    with_target(result, target)
+}
+
+fn fmt_bclr(instr: &DecodedInstr) -> DisasmText {
+    let bo = instr.bo();
+    let bi = instr.bi();
+    let lk = instr.lk();
+    let l = if lk { "l" } else { "" };
+    let base_mnem = format!("bclr{l}");
+    let base_ops = format!("{bo}, {}", crb(bi));
+
+    // BO=20 (binary 10100) sets both "ignore CTR" and "ignore CR" bits, making
+    // the branch unconditional regardless of BI. BI is don't-care by spec, so
+    // the simplified `blr`/`blrl` form applies for any BI value.
+    if bo == 20 {
+        let ext = if lk { "blrl" } else { "blr" };
+        return with_ext(&base_mnem, base_ops, 8, ext, String::new(), 0);
+    }
+    if let Some((cond, cr)) = cond_branch_ext(bo, bi) {
+        let cr_no_comma = cr.trim_end_matches(", ");
+        let ext_mnem = format!("b{cond}lr{l}");
+        if cr_no_comma.is_empty() {
+            return with_ext(&base_mnem, base_ops, 8, &ext_mnem, String::new(), 0);
+        } else {
+            return with_ext(&base_mnem, base_ops, 8, &ext_mnem, cr_no_comma.to_string(), 8);
+        }
+    }
+    let decr = bo & 0x04 == 0;
+    let uncond = bo & 0x10 != 0;
+    if decr && uncond {
+        let z = if bo & 0x02 != 0 { "z" } else { "nz" };
+        let ext_mnem = format!("bd{z}lr{l}");
+        return with_ext(&base_mnem, base_ops, 8, &ext_mnem, String::new(), 0);
+    }
+    base(&base_mnem, base_ops, 8)
+}
+
+fn fmt_bcctr(instr: &DecodedInstr) -> DisasmText {
+    let bo = instr.bo();
+    let bi = instr.bi();
+    let lk = instr.lk();
+    let l = if lk { "l" } else { "" };
+    let base_mnem = format!("bcctr{l}");
+    let base_ops = format!("{bo}, {}", crb(bi));
+
+    // BO=20 unconditional pattern: BI is don't-care (see fmt_bclr).
+    if bo == 20 {
+        let ext = if lk { "bctrl" } else { "bctr" };
+        return with_ext(&base_mnem, base_ops, 8, ext, String::new(), 0);
+    }
+    if let Some((cond, cr)) = cond_branch_ext(bo, bi) {
+        let cr_no_comma = cr.trim_end_matches(", ");
+        let ext_mnem = format!("b{cond}ctr{l}");
+        if cr_no_comma.is_empty() {
+            return with_ext(&base_mnem, base_ops, 8, &ext_mnem, String::new(), 0);
+        } else {
+            return with_ext(&base_mnem, base_ops, 8, &ext_mnem, cr_no_comma.to_string(), 8);
+        }
+    }
+    base(&base_mnem, base_ops, 8)
+}
+
+// Trap immediate / register
+fn fmt_trap_imm(instr: &DecodedInstr, mnem: &str, simplified_prefix: &str) -> DisasmText {
+    let to = instr.to();
+    let ra = instr.ra();
+    let imm = instr.simm16() as i32;
+    let base_ops = format!("{to}, {}, {imm}", gpr(ra));
+    if let Some(cond) = trap_cond(to) {
+        if cond.is_empty() {
+            base(mnem, base_ops, 8)
+        } else {
+            let ext_mnem = format!("{simplified_prefix}{cond}i");
+            let ext_ops = format!("{}, {imm}", gpr(ra));
+            with_ext(mnem, base_ops, 8, &ext_mnem, ext_ops, 8)
+        }
+    } else {
+        base(mnem, base_ops, 8)
+    }
+}
+
+fn fmt_trap_reg(instr: &DecodedInstr, mnem: &str) -> DisasmText {
+    let to = instr.to();
+    let ra = instr.ra();
+    let rb = instr.rb();
+    let base_ops = format!("{to}, {}, {}", gpr(ra), gpr(rb));
+    if to == 31 && ra == 0 && rb == 0 {
+        return with_ext(mnem, base_ops, 8, "trap", String::new(), 0);
+    }
+    if let Some(cond) = trap_cond(to)
+        && !cond.is_empty()
+    {
+        let ext_mnem = format!("{mnem}{cond}");
+        let ext_ops = format!("{}, {}", gpr(ra), gpr(rb));
+        return with_ext(mnem, base_ops, 8, &ext_mnem, ext_ops, 8);
+    }
+    base(mnem, base_ops, 8)
+}
+
+// D-form ALU
+fn fmt_addi(instr: &DecodedInstr) -> DisasmText {
+    let rt = instr.rd();
+    let ra = instr.ra();
+    let imm = instr.simm16() as i32;
+    let base_ops = format!("{}, {}, {imm}", gpr(rt), gpr(ra));
+    if ra == 0 {
+        with_ext("addi", base_ops, 8, "li", format!("{}, {imm}", gpr(rt)), 8)
+    } else if imm < 0 {
+        with_ext("addi", base_ops, 8, "subi", format!("{}, {}, {}", gpr(rt), gpr(ra), -imm), 8)
+    } else {
+        base("addi", base_ops, 8)
+    }
+}
+
+fn fmt_addis(instr: &DecodedInstr) -> DisasmText {
+    let rt = instr.rd();
+    let ra = instr.ra();
+    let imm = instr.simm16() as i32;
+    let imm_u = imm as u16 as u32;
+    let base_ops = format!("{}, {}, 0x{imm_u:X}", gpr(rt), gpr(ra));
+    if ra == 0 {
+        with_ext("addis", base_ops, 8, "lis", format!("{}, 0x{imm_u:X}", gpr(rt)), 8)
+    } else if imm < 0 {
+        let neg = (-imm) as u16 as u32;
+        with_ext("addis", base_ops, 8, "subis", format!("{}, {}, 0x{neg:X}", gpr(rt), gpr(ra)), 8)
+    } else {
+        base("addis", base_ops, 8)
+    }
+}
+
+fn fmt_d_add(instr: &DecodedInstr, mnem: &str) -> DisasmText {
+    let rt = instr.rd();
+    let ra = instr.ra();
+    let imm = instr.simm16() as i32;
+    let base_ops = format!("{}, {}, {imm}", gpr(rt), gpr(ra));
+    if imm < 0 {
+        let ext_mnem = mnem.replace("addic", "subic");
+        with_ext(mnem, base_ops, 8, &ext_mnem, format!("{}, {}, {}", gpr(rt), gpr(ra), -imm), 8)
+    } else {
+        base(mnem, base_ops, 8)
+    }
+}
+
+fn fmt_d_imm_simple(instr: &DecodedInstr, mnem: &str) -> DisasmText {
+    let rt = instr.rd();
+    let ra = instr.ra();
+    let imm = instr.simm16() as i32;
+    base(mnem, format!("{}, {}, {imm}", gpr(rt), gpr(ra)), 8)
+}
+
+fn fmt_cmp_imm(instr: &DecodedInstr, mnem: &str, signed: bool) -> DisasmText {
+    let bf = instr.crfd();
+    let l_bit = if instr.l() { 1 } else { 0 };
+    let ra = instr.ra();
+    let imm_str = if signed {
+        format!("{}", instr.simm16() as i32)
+    } else {
+        format!("0x{:X}", instr.uimm16())
+    };
+    let cr = if bf == 0 { String::new() } else { format!("cr{bf}, ") };
+    let base_ops = format!("{cr}{l_bit}, {}, {imm_str}", gpr(ra));
+
+    let size = if l_bit == 0 { "w" } else { "d" };
+    let ext_mnem = if mnem == "cmpi" {
+        format!("cmp{size}i")
+    } else {
+        format!("cmpl{size}i")
+    };
+    let ext_ops = format!("{cr}{}, {imm_str}", gpr(ra));
+    with_ext(mnem, base_ops, 8, &ext_mnem, ext_ops, 8)
+}
+
+fn fmt_cmp_reg(instr: &DecodedInstr, mnem: &str) -> DisasmText {
+    let bf = instr.crfd();
+    let l_bit = if instr.l() { 1 } else { 0 };
+    let ra = instr.ra();
+    let rb = instr.rb();
+    let cr = if bf == 0 { String::new() } else { format!("cr{bf}, ") };
+    let base_ops = format!("{cr}{l_bit}, {}, {}", gpr(ra), gpr(rb));
+    let size = if l_bit == 0 { "w" } else { "d" };
+    let ext_mnem = format!("{mnem}{size}");
+    let ext_ops = format!("{cr}{}, {}", gpr(ra), gpr(rb));
+    with_ext(mnem, base_ops, 8, &ext_mnem, ext_ops, 8)
+}
+
+fn fmt_ori(instr: &DecodedInstr) -> DisasmText {
+    let rs = instr.rs();
+    let ra = instr.ra();
+    let uimm = instr.uimm16() as u32;
+    let base_ops = format!("{}, {}, 0x{uimm:X}", gpr(ra), gpr(rs));
+    if rs == 0 && ra == 0 && uimm == 0 {
+        with_ext("ori", base_ops, 8, "nop", String::new(), 0)
+    } else {
+        base("ori", base_ops, 8)
+    }
+}
+
+fn fmt_d_logic(instr: &DecodedInstr, mnem: &str) -> DisasmText {
+    let rs = instr.rs();
+    let ra = instr.ra();
+    let uimm = instr.uimm16() as u32;
+    base(mnem, format!("{}, {}, 0x{uimm:X}", gpr(ra), gpr(rs)), 8)
+}
+
+// D-form load/store. `is_fpr` selects between fX and rX for the data register.
+fn fmt_ld(instr: &DecodedInstr, mnem: &str, is_fpr: bool) -> DisasmText {
+    let rt = instr.rd();
+    let ra = instr.ra();
+    let d = instr.d();
+    let rn = if is_fpr { fpr(rt) } else { gpr(rt) };
+    base(mnem, format!("{rn}, {d}({})", gpr(ra)), 8)
+}
+
+fn fmt_st(instr: &DecodedInstr, mnem: &str, is_fpr: bool) -> DisasmText {
+    let rs = instr.rs();
+    let ra = instr.ra();
+    let d = instr.d();
+    let rn = if is_fpr { fpr(rs) } else { gpr(rs) };
+    base(mnem, format!("{rn}, {d}({})", gpr(ra)), 8)
+}
+
+// DS-form load/store.
+fn fmt_ds(instr: &DecodedInstr, mnem: &str) -> DisasmText {
+    let r = instr.rd();
+    let ra = instr.ra();
+    let ds = instr.ds();
+    base(mnem, format!("{}, {ds}({})", gpr(r), gpr(ra)), 8)
+}
+
+// Rotate (32-bit).
+fn fmt_rlwimi(instr: &DecodedInstr) -> DisasmText {
+    let rs = instr.rs();
+    let ra = instr.ra();
+    let sh = instr.sh();
+    let mb = instr.mb();
+    let me = instr.me();
+    let rc = rc_dot(instr);
+    let mnem = format!("rlwimi{rc}");
+    let base_ops = format!("{}, {}, {sh}, {mb}, {me}", gpr(ra), gpr(rs));
+    // inslwi rA, rS, n, b = rlwimi rA, rS, 32-b, b, b+n-1
+    if mb <= me && sh == (32u32.wrapping_sub(mb)) % 32 && sh != 31u32.wrapping_sub(me) {
+        let n = me - mb + 1;
+        let b = mb;
+        let ext_mnem = format!("inslwi{rc}");
+        return with_ext(&mnem, base_ops, 8, &ext_mnem, format!("{}, {}, {n}, {b}", gpr(ra), gpr(rs)), 8);
+    }
+    // insrwi rA, rS, n, b = rlwimi rA, rS, 32-(b+n), b, b+n-1
+    if mb <= me && sh == 31u32.wrapping_sub(me) % 32 {
+        let n = me - mb + 1;
+        let b = mb;
+        let ext_mnem = format!("insrwi{rc}");
+        return with_ext(&mnem, base_ops, 8, &ext_mnem, format!("{}, {}, {n}, {b}", gpr(ra), gpr(rs)), 8);
+    }
+    base(&mnem, base_ops, 8)
+}
+
+fn fmt_rlwinm(instr: &DecodedInstr) -> DisasmText {
+    let rs = instr.rs();
+    let ra = instr.ra();
+    let sh = instr.sh();
+    let mb = instr.mb();
+    let me = instr.me();
+    let rc = rc_dot(instr);
+    let mnem = format!("rlwinm{rc}");
+    let base_ops = format!("{}, {}, {sh}, {mb}, {me}", gpr(ra), gpr(rs));
+
+    // Priority-ordered simplified forms.
+    if sh > 0 && mb == 0 && me == 31 - sh {
+        let ext = format!("slwi{rc}");
+        return with_ext(&mnem, base_ops, 8, &ext, format!("{}, {}, {sh}", gpr(ra), gpr(rs)), 8);
+    }
+    if sh > 0 && me == 31 && sh + mb == 32 {
+        let ext = format!("srwi{rc}");
+        return with_ext(&mnem, base_ops, 8, &ext, format!("{}, {}, {}", gpr(ra), gpr(rs), 32 - sh), 8);
+    }
+    if sh > 0 && mb == 0 && me == 31 {
+        let ext = format!("rotlwi{rc}");
+        return with_ext(&mnem, base_ops, 8, &ext, format!("{}, {}, {sh}", gpr(ra), gpr(rs)), 8);
+    }
+    if sh == 0 && me == 31 && mb > 0 {
+        let ext = format!("clrlwi{rc}");
+        return with_ext(&mnem, base_ops, 8, &ext, format!("{}, {}, {mb}", gpr(ra), gpr(rs)), 8);
+    }
+    if sh == 0 && mb == 0 && me < 31 {
+        let ext = format!("clrrwi{rc}");
+        return with_ext(&mnem, base_ops, 8, &ext, format!("{}, {}, {}", gpr(ra), gpr(rs), 31 - me), 8);
+    }
+    if mb == 0 && sh > 0 && me < 31 {
+        let n = me + 1;
+        let ext = format!("extlwi{rc}");
+        return with_ext(&mnem, base_ops, 8, &ext, format!("{}, {}, {n}, {sh}", gpr(ra), gpr(rs)), 8);
+    }
+    if me == 31 && mb > 0 && sh > 0 {
+        let n = 32 - mb;
+        let b = sh.wrapping_sub(n) % 32;
+        let ext = format!("extrwi{rc}");
+        return with_ext(&mnem, base_ops, 8, &ext, format!("{}, {}, {n}, {b}", gpr(ra), gpr(rs)), 8);
+    }
+    base(&mnem, base_ops, 8)
+}
+
+fn fmt_rlwnm(instr: &DecodedInstr) -> DisasmText {
+    let rs = instr.rs();
+    let ra = instr.ra();
+    let rb = instr.rb();
+    let mb = instr.mb();
+    let me = instr.me();
+    let rc = rc_dot(instr);
+    let mnem = format!("rlwnm{rc}");
+    let base_ops = format!("{}, {}, {}, {mb}, {me}", gpr(ra), gpr(rs), gpr(rb));
+    if mb == 0 && me == 31 {
+        let ext = format!("rotlw{rc}");
+        return with_ext(&mnem, base_ops, 8, &ext, format!("{}, {}, {}", gpr(ra), gpr(rs), gpr(rb)), 8);
+    }
+    base(&mnem, base_ops, 8)
+}
+
+// 64-bit MD/MDS-form rotate.
+fn fmt_rldicl(instr: &DecodedInstr) -> DisasmText {
+    let rs = instr.rs();
+    let ra = instr.ra();
+    let rc = rc_dot(instr);
+    let sh = instr.sh64();
+    let mb = mb_md(instr.raw);
+    let mnem = format!("rldicl{rc}");
+    let base_ops = format!("{}, {}, {sh}, {mb}", gpr(ra), gpr(rs));
+    if sh == 0 && mb > 0 {
+        let ext = format!("clrldi{rc}");
+        return with_ext(&mnem, base_ops, 8, &ext, format!("{}, {}, {mb}", gpr(ra), gpr(rs)), 8);
+    }
+    if mb > 0 && sh == (64u32.wrapping_sub(mb)) & 63 {
+        let ext = format!("srdi{rc}");
+        return with_ext(&mnem, base_ops, 8, &ext, format!("{}, {}, {mb}", gpr(ra), gpr(rs)), 8);
+    }
+    if sh > 0 && mb == 0 {
+        let ext = format!("rotldi{rc}");
+        return with_ext(&mnem, base_ops, 8, &ext, format!("{}, {}, {sh}", gpr(ra), gpr(rs)), 8);
+    }
+    base(&mnem, base_ops, 8)
+}
+
+fn fmt_rldicr(instr: &DecodedInstr) -> DisasmText {
+    let rs = instr.rs();
+    let ra = instr.ra();
+    let rc = rc_dot(instr);
+    let sh = instr.sh64();
+    let me = mb_md(instr.raw);
+    let mnem = format!("rldicr{rc}");
+    let base_ops = format!("{}, {}, {sh}, {me}", gpr(ra), gpr(rs));
+    if sh > 0 && me == (63u32.wrapping_sub(sh)) & 63 {
+        let ext = format!("sldi{rc}");
+        return with_ext(&mnem, base_ops, 8, &ext, format!("{}, {}, {sh}", gpr(ra), gpr(rs)), 8);
+    }
+    if sh == 0 && me < 63 {
+        let ext = format!("clrrdi{rc}");
+        return with_ext(&mnem, base_ops, 8, &ext, format!("{}, {}, {}", gpr(ra), gpr(rs), 63 - me), 8);
+    }
+    base(&mnem, base_ops, 8)
+}
+
+fn fmt_rldic(instr: &DecodedInstr) -> DisasmText {
+    let rs = instr.rs();
+    let ra = instr.ra();
+    let rc = rc_dot(instr);
+    let sh = instr.sh64();
+    let mb = mb_md(instr.raw);
+    base(&format!("rldic{rc}"), format!("{}, {}, {sh}, {mb}", gpr(ra), gpr(rs)), 8)
+}
+
+fn fmt_rldimi(instr: &DecodedInstr) -> DisasmText {
+    let rs = instr.rs();
+    let ra = instr.ra();
+    let rc = rc_dot(instr);
+    let sh = instr.sh64();
+    let mb = mb_md(instr.raw);
+    let mnem = format!("rldimi{rc}");
+    let base_ops = format!("{}, {}, {sh}, {mb}", gpr(ra), gpr(rs));
+    if mb > 0 {
+        let n = (64u32.wrapping_sub(sh).wrapping_sub(mb)) & 63;
+        if n > 0 {
+            let ext = format!("insrdi{rc}");
+            return with_ext(&mnem, base_ops, 8, &ext, format!("{}, {}, {n}, {mb}", gpr(ra), gpr(rs)), 8);
+        }
+    }
+    base(&mnem, base_ops, 8)
+}
+
+fn fmt_rldcl(instr: &DecodedInstr) -> DisasmText {
+    let rs = instr.rs();
+    let ra = instr.ra();
+    let rb = instr.rb();
+    let rc = rc_dot(instr);
+    let mb = mb_md(instr.raw);
+    let mnem = format!("rldcl{rc}");
+    let base_ops = format!("{}, {}, {}, {mb}", gpr(ra), gpr(rs), gpr(rb));
+    if mb == 0 {
+        let ext = format!("rotld{rc}");
+        return with_ext(&mnem, base_ops, 8, &ext, format!("{}, {}, {}", gpr(ra), gpr(rs), gpr(rb)), 8);
+    }
+    base(&mnem, base_ops, 8)
+}
+
+fn fmt_rldcr(instr: &DecodedInstr) -> DisasmText {
+    let rs = instr.rs();
+    let ra = instr.ra();
+    let rb = instr.rb();
+    let rc = rc_dot(instr);
+    let me = mb_md(instr.raw);
+    base(&format!("rldcr{rc}"), format!("{}, {}, {}, {me}", gpr(ra), gpr(rs), gpr(rb)), 8)
+}
+
+/// MD/MDS-form mb/me field: 6 bits packed as bits 21-25 + bit 26 (low bit).
+#[inline]
+fn mb_md(raw: u32) -> u32 {
+    let lo5 = (raw >> 6) & 0x1F;          // bits 21-25
+    let hi  = (raw >> 5) & 0x1;           // bit 26
+    lo5 | (hi << 5)
+}
+
+// XO-form ALU
+fn fmt_xo_3op(instr: &DecodedInstr, mnem: &str) -> DisasmText {
+    let rt = instr.rd();
+    let ra = instr.ra();
+    let rb = instr.rb();
+    let rc = rc_dot(instr);
+    let oe = if instr.oe() { "o" } else { "" };
+    let full = format!("{mnem}{oe}{rc}");
+    base(&full, format!("{}, {}, {}", gpr(rt), gpr(ra), gpr(rb)), 8)
+}
+
+fn fmt_xo_2op(instr: &DecodedInstr, mnem: &str) -> DisasmText {
+    let rt = instr.rd();
+    let ra = instr.ra();
+    let rc = rc_dot(instr);
+    let oe = if instr.oe() { "o" } else { "" };
+    let full = format!("{mnem}{oe}{rc}");
+    base(&full, format!("{}, {}", gpr(rt), gpr(ra)), 8)
+}
+
+fn fmt_xo_3op_no_oe(instr: &DecodedInstr, mnem: &str) -> DisasmText {
+    let rt = instr.rd();
+    let ra = instr.ra();
+    let rb = instr.rb();
+    let rc = rc_dot(instr);
+    let oe = if instr.oe() { "o" } else { "" };
+    let full = format!("{mnem}{oe}{rc}");
+    base(&full, format!("{}, {}, {}", gpr(rt), gpr(ra), gpr(rb)), 8)
+}
+
+fn fmt_xo_3op_rc_only(instr: &DecodedInstr, mnem: &str) -> DisasmText {
+    let rt = instr.rd();
+    let ra = instr.ra();
+    let rb = instr.rb();
+    let rc = rc_dot(instr);
+    let full = format!("{mnem}{rc}");
+    base(&full, format!("{}, {}, {}", gpr(rt), gpr(ra), gpr(rb)), 8)
+}
+
+fn fmt_subf(instr: &DecodedInstr, base_mnem: &str, ext_mnem: &str) -> DisasmText {
+    let rt = instr.rd();
+    let ra = instr.ra();
+    let rb = instr.rb();
+    let rc = rc_dot(instr);
+    let oe = if instr.oe() { "o" } else { "" };
+    let bm = format!("{base_mnem}{oe}{rc}");
+    let em = format!("{ext_mnem}{oe}{rc}");
+    let bo = format!("{}, {}, {}", gpr(rt), gpr(ra), gpr(rb));
+    let eo = format!("{}, {}, {}", gpr(rt), gpr(rb), gpr(ra));
+    with_ext(&bm, bo, 8, &em, eo, 8)
+}
+
+// X-form logical
+fn fmt_x_logic(instr: &DecodedInstr, mnem: &str) -> DisasmText {
+    let rs = instr.rs();
+    let ra = instr.ra();
+    let rb = instr.rb();
+    let rc = rc_dot(instr);
+    let full = format!("{mnem}{rc}");
+    base(&full, format!("{}, {}, {}", gpr(ra), gpr(rs), gpr(rb)), 8)
+}
+
+fn fmt_x_unary_rc(instr: &DecodedInstr, mnem: &str) -> DisasmText {
+    let rs = instr.rs();
+    let ra = instr.ra();
+    let rc = rc_dot(instr);
+    let full = format!("{mnem}{rc}");
+    base(&full, format!("{}, {}", gpr(ra), gpr(rs)), 8)
+}
+
+fn fmt_logic_and(instr: &DecodedInstr) -> DisasmText {
+    let rs = instr.rs();
+    let ra = instr.ra();
+    let rb = instr.rb();
+    let rc = rc_dot(instr);
+    let bm = format!("and{rc}");
+    let bo = format!("{}, {}, {}", gpr(ra), gpr(rs), gpr(rb));
+    if rs == rb {
+        let em = format!("mr{rc}");
+        with_ext(&bm, bo, 8, &em, format!("{}, {}", gpr(ra), gpr(rs)), 8)
+    } else {
+        base(&bm, bo, 8)
+    }
+}
+
+fn fmt_logic_or(instr: &DecodedInstr) -> DisasmText {
+    let rs = instr.rs();
+    let ra = instr.ra();
+    let rb = instr.rb();
+    let rc = rc_dot(instr);
+    let bm = format!("or{rc}");
+    let bo = format!("{}, {}, {}", gpr(ra), gpr(rs), gpr(rb));
+    if rs == rb {
+        let em = format!("mr{rc}");
+        with_ext(&bm, bo, 8, &em, format!("{}, {}", gpr(ra), gpr(rs)), 8)
+    } else {
+        base(&bm, bo, 8)
+    }
+}
+
+fn fmt_logic_nor(instr: &DecodedInstr) -> DisasmText {
+    let rs = instr.rs();
+    let ra = instr.ra();
+    let rb = instr.rb();
+    let rc = rc_dot(instr);
+    let bm = format!("nor{rc}");
+    let bo = format!("{}, {}, {}", gpr(ra), gpr(rs), gpr(rb));
+    if rs == rb {
+        let em = format!("not{rc}");
+        with_ext(&bm, bo, 8, &em, format!("{}, {}", gpr(ra), gpr(rs)), 8)
+    } else {
+        base(&bm, bo, 8)
+    }
+}
+
+// Shift immediate
+fn fmt_srawi(instr: &DecodedInstr) -> DisasmText {
+    let rs = instr.rs();
+    let ra = instr.ra();
+    let sh = instr.sh();
+    let rc = rc_dot(instr);
+    base(&format!("srawi{rc}"), format!("{}, {}, {sh}", gpr(ra), gpr(rs)), 8)
+}
+
+fn fmt_sradi(instr: &DecodedInstr) -> DisasmText {
+    let rs = instr.rs();
+    let ra = instr.ra();
+    let sh = instr.sh64();
+    let rc = rc_dot(instr);
+    base(&format!("sradi{rc}"), format!("{}, {}, {sh}", gpr(ra), gpr(rs)), 8)
+}
+
+// Special-purpose register moves
+fn fmt_mfspr(instr: &DecodedInstr) -> DisasmText {
+    let rd = instr.rd();
+    let spr = instr.spr();
+    let base_ops = format!("{}, {}", gpr(rd), spr_name(spr));
+    let ext = match spr {
+        8 => Some(("mflr", format!("{}", gpr(rd)))),
+        9 => Some(("mfctr", format!("{}", gpr(rd)))),
+        1 => Some(("mfxer", format!("{}", gpr(rd)))),
+        _ => None,
+    };
+    match ext {
+        Some((em, eo)) => with_ext("mfspr", base_ops, 8, em, eo, 8),
+        None => base("mfspr", base_ops, 8),
+    }
+}
+
+fn fmt_mtspr(instr: &DecodedInstr) -> DisasmText {
+    let rs = instr.rs();
+    let spr = instr.spr();
+    let base_ops = format!("{}, {}", spr_name(spr), gpr(rs));
+    let ext = match spr {
+        8 => Some(("mtlr", format!("{}", gpr(rs)))),
+        9 => Some(("mtctr", format!("{}", gpr(rs)))),
+        1 => Some(("mtxer", format!("{}", gpr(rs)))),
+        _ => None,
+    };
+    match ext {
+        Some((em, eo)) => with_ext("mtspr", base_ops, 8, em, eo, 8),
+        None => base("mtspr", base_ops, 8),
+    }
+}
+
+fn fmt_mtcrf(instr: &DecodedInstr) -> DisasmText {
+    let rs = instr.rs();
+    let fxm = (instr.raw >> 12) & 0xFF;
+    let bo = format!("0x{fxm:02X}, {}", gpr(rs));
+    if fxm == 0xFF {
+        with_ext("mtcrf", bo, 8, "mtcr", gpr(rs), 8)
+    } else {
+        base("mtcrf", bo, 8)
+    }
+}
+
+fn fmt_mftb(instr: &DecodedInstr) -> DisasmText {
+    let rd = instr.rd();
+    let tbr = instr.spr();
+    let base_ops = format!("{}, {tbr}", gpr(rd));
+    match tbr {
+        268 => with_ext("mftb", base_ops, 8, "mftb", gpr(rd), 8),
+        269 => with_ext("mftb", base_ops, 8, "mftbu", gpr(rd), 8),
+        _ => base("mftb", base_ops, 8),
+    }
+}
+
+// X-form indexed load/store.
+fn fmt_x_load(instr: &DecodedInstr, mnem: &str, is_fpr: bool) -> DisasmText {
+    let rt = instr.rd();
+    let ra = instr.ra();
+    let rb = instr.rb();
+    let rn = if is_fpr { fpr(rt) } else { gpr(rt) };
+    base(mnem, format!("{rn}, {}, {}", gpr(ra), gpr(rb)), 8)
+}
+
+fn fmt_x_store(instr: &DecodedInstr, mnem: &str, is_fpr: bool) -> DisasmText {
+    let rs = instr.rs();
+    let ra = instr.ra();
+    let rb = instr.rb();
+    let rn = if is_fpr { fpr(rs) } else { gpr(rs) };
+    base(mnem, format!("{rn}, {}, {}", gpr(ra), gpr(rb)), 8)
+}
+
+fn fmt_lswi_stswi(instr: &DecodedInstr, mnem: &str) -> DisasmText {
+    let rt = instr.rd();
+    let ra = instr.ra();
+    let nb = instr.nb();
+    base(mnem, format!("{}, {}, {nb}", gpr(rt), gpr(ra)), 8)
+}
+
+fn fmt_cache(instr: &DecodedInstr, mnem: &str) -> DisasmText {
+    let ra = instr.ra();
+    let rb = instr.rb();
+    base(mnem, format!("{}, {}", gpr(ra), gpr(rb)), 8)
+}
+
+// CR logical
+fn fmt_cr_logic(instr: &DecodedInstr, mnem: &str) -> DisasmText {
+    let bt = instr.crbd();
+    let ba = instr.crba();
+    let bb = instr.crbb();
+    base(mnem, format!("{}, {}, {}", crb(bt), crb(ba), crb(bb)), 8)
+}
+
+fn fmt_crnor(instr: &DecodedInstr) -> DisasmText {
+    let bt = instr.crbd();
+    let ba = instr.crba();
+    let bb = instr.crbb();
+    let bo = format!("{}, {}, {}", crb(bt), crb(ba), crb(bb));
+    if ba == bb {
+        with_ext("crnor", bo, 8, "crnot", format!("{}, {}", crb(bt), crb(ba)), 8)
+    } else {
+        base("crnor", bo, 8)
+    }
+}
+
+fn fmt_crxor(instr: &DecodedInstr) -> DisasmText {
+    let bt = instr.crbd();
+    let ba = instr.crba();
+    let bb = instr.crbb();
+    let bo = format!("{}, {}, {}", crb(bt), crb(ba), crb(bb));
+    if bt == ba && ba == bb {
+        with_ext("crxor", bo, 8, "crclr", crb(bt), 8)
+    } else {
+        base("crxor", bo, 8)
+    }
+}
+
+fn fmt_creqv(instr: &DecodedInstr) -> DisasmText {
+    let bt = instr.crbd();
+    let ba = instr.crba();
+    let bb = instr.crbb();
+    let bo = format!("{}, {}, {}", crb(bt), crb(ba), crb(bb));
+    if bt == ba && ba == bb {
+        with_ext("creqv", bo, 8, "crset", crb(bt), 8)
+    } else {
+        base("creqv", bo, 8)
+    }
+}
+
+fn fmt_cror(instr: &DecodedInstr) -> DisasmText {
+    let bt = instr.crbd();
+    let ba = instr.crba();
+    let bb = instr.crbb();
+    let bo = format!("{}, {}, {}", crb(bt), crb(ba), crb(bb));
+    if ba == bb {
+        with_ext("cror", bo, 8, "crmove", format!("{}, {}", crb(bt), crb(ba)), 8)
+    } else {
+        base("cror", bo, 8)
+    }
+}
+
+// FPU
+fn fmt_a_3op(instr: &DecodedInstr, mnem: &str, use_frc: bool) -> DisasmText {
+    let frt = instr.rd();
+    let fra = instr.ra();
+    let frb = instr.rb();
+    let frc = instr.rc();
+    let rc = rc_dot(instr);
+    let full = format!("{mnem}{rc}");
+    let ops = if use_frc {
+        format!("{}, {}, {}", fpr(frt), fpr(fra), fpr(frc))
+    } else {
+        format!("{}, {}, {}", fpr(frt), fpr(fra), fpr(frb))
+    };
+    base(&full, ops, 8)
+}
+
+fn fmt_a_unary(instr: &DecodedInstr, mnem: &str) -> DisasmText {
+    let frt = instr.rd();
+    let frb = instr.rb();
+    let rc = rc_dot(instr);
+    base(&format!("{mnem}{rc}"), format!("{}, {}", fpr(frt), fpr(frb)), 8)
+}
+
+fn fmt_a_4op(instr: &DecodedInstr, mnem: &str) -> DisasmText {
+    let frt = instr.rd();
+    let fra = instr.ra();
+    let frb = instr.rb();
+    let frc = instr.rc();
+    let rc = rc_dot(instr);
+    base(&format!("{mnem}{rc}"),
+         format!("{}, {}, {}, {}", fpr(frt), fpr(fra), fpr(frc), fpr(frb)), 8)
+}
+
+fn fmt_fcmp(instr: &DecodedInstr, mnem: &str) -> DisasmText {
+    let bf = instr.crfd();
+    let fra = instr.ra();
+    let frb = instr.rb();
+    base(mnem, format!("cr{bf}, {}, {}", fpr(fra), fpr(frb)), 8)
+}
+
+fn fmt_x_fpu_unary(instr: &DecodedInstr, mnem: &str) -> DisasmText {
+    let frt = instr.rd();
+    let frb = instr.rb();
+    let rc = rc_dot(instr);
+    base(&format!("{mnem}{rc}"), format!("{}, {}", fpr(frt), fpr(frb)), 8)
+}
+
+fn fmt_mtfsb(instr: &DecodedInstr, mnem: &str) -> DisasmText {
+    let bt = instr.crbd();
+    let rc = rc_dot(instr);
+    base(&format!("{mnem}{rc}"), format!("{bt}"), 8)
+}
+
+// VMX (5-bit registers).
+fn fmt_vmx_3op(instr: &DecodedInstr, mnem: &str) -> DisasmText {
+    let vd = instr.rd();
+    let va = instr.ra();
+    let vb = instr.rb();
+    base(mnem, format!("{}, {}, {}", vr(vd), vr(va), vr(vb)), 8)
+}
+
+fn fmt_vmx_unary(instr: &DecodedInstr, mnem: &str) -> DisasmText {
+    let vd = instr.rd();
+    let vb = instr.rb();
+    base(mnem, format!("{}, {}", vr(vd), vr(vb)), 8)
+}
+
+fn fmt_vmx_uimm(instr: &DecodedInstr, mnem: &str) -> DisasmText {
+    let vd = instr.rd();
+    let vb = instr.rb();
+    let uimm = instr.ra() as u32;
+    base(mnem, format!("{}, {}, {uimm}", vr(vd), vr(vb)), 8)
+}
+
+fn fmt_vmx_simm(instr: &DecodedInstr, mnem: &str) -> DisasmText {
+    let vd = instr.rd();
+    let simm = sign_ext(instr.ra() as u32, 5);
+    base(mnem, format!("{}, {simm}", vr(vd)), 9)
+}
+
+fn fmt_vmx_cmp(instr: &DecodedInstr, mnem: &str) -> DisasmText {
+    let vd = instr.rd();
+    let va = instr.ra();
+    let vb = instr.rb();
+    // Rc bit at position 22 (0-indexed from MSB)
+    let rc = if (instr.raw >> 10) & 1 != 0 { "." } else { "" };
+    let full = format!("{mnem}{rc}");
+    base(&full, format!("{}, {}, {}", vr(vd), vr(va), vr(vb)), 12)
+}
+
+fn fmt_vmx_4op(instr: &DecodedInstr, mnem: &str) -> DisasmText {
+    let vd = instr.rd();
+    let va = instr.ra();
+    let vb = instr.rb();
+    let vc = instr.rc();
+    base(mnem, format!("{}, {}, {}, {}", vr(vd), vr(va), vr(vb), vr(vc)), 12)
+}
+
+fn fmt_vmx_4op_swap(instr: &DecodedInstr, mnem: &str) -> DisasmText {
+    let vd = instr.rd();
+    let va = instr.ra();
+    let vb = instr.rb();
+    let vc = instr.rc();
+    base(mnem, format!("{}, {}, {}, {}", vr(vd), vr(va), vr(vc), vr(vb)), 9)
+}
+
+fn fmt_vsldoi(instr: &DecodedInstr) -> DisasmText {
+    let vd = instr.rd();
+    let va = instr.ra();
+    let vb = instr.rb();
+    let sh = (instr.raw >> 6) & 0xF;
+    base("vsldoi", format!("{}, {}, {}, {sh}", vr(vd), vr(va), vr(vb)), 8)
+}
+
+fn fmt_vmx_ls(instr: &DecodedInstr, mnem: &str) -> DisasmText {
+    let vd = instr.rd();
+    let ra = instr.ra();
+    let rb = instr.rb();
+    base(mnem, format!("{}, {}, {}", vr(vd), gpr(ra), gpr(rb)), 8)
+}
+
+// VMX128 — uses canonical va128/vb128/vd128 accessors from decoder.rs.
+// (Silently fixes the prior ppc.rs bug where these used wrong bit positions.)
+fn fmt_vmx128_ls(instr: &DecodedInstr, mnem: &str) -> DisasmText {
+    let vd = instr.vd128();
+    let ra = instr.ra();
+    let rb = instr.rb();
+    base(mnem, format!("{}, {}, {}", vr(vd), gpr(ra), gpr(rb)), 12)
+}
+
+fn fmt_vmx128_3op(instr: &DecodedInstr, mnem: &str) -> DisasmText {
+    let vd = instr.vd128();
+    let va = instr.va128();
+    let vb = instr.vb128();
+    base(mnem, format!("{}, {}, {}", vr(vd), vr(va), vr(vb)), 12)
+}
+
+// VMX128 multiply-add forms (VX128_2): the addend is the VD register
+// re-used, not a separate VC field. Operand order differs between
+// `vmaddfp128` (VD, VA, VB, VD) and the `vmaddcfp128`/`vnmsubfp128`
+// pair (VD, VA, VD, VB), per canary's authoritative formatters in
+// xenia-canary/src/xenia/cpu/ppc/ppc_opcode_disasm_gen.cc.
+fn fmt_vmaddfp128(instr: &DecodedInstr) -> DisasmText {
+    let vd = instr.vd128();
+    let va = instr.va128();
+    let vb = instr.vb128();
+    base(
+        "vmaddfp128",
+        format!("{}, {}, {}, {}", vr(vd), vr(va), vr(vb), vr(vd)),
+        12,
+    )
+}
+
+fn fmt_vmx128_madd_vd_vb(instr: &DecodedInstr, mnem: &str) -> DisasmText {
+    let vd = instr.vd128();
+    let va = instr.va128();
+    let vb = instr.vb128();
+    base(
+        mnem,
+        format!("{}, {}, {}, {}", vr(vd), vr(va), vr(vd), vr(vb)),
+        12,
+    )
+}
+
+fn fmt_vperm128(instr: &DecodedInstr) -> DisasmText {
+    let vd = instr.vd128();
+    let va = instr.va128();
+    let vb = instr.vb128();
+    let vc = (instr.raw >> 6) & 0x7;
+    base("vperm128", format!("{}, {}, {}, {vc}", vr(vd), vr(va), vr(vb)), 9)
+}
+
+fn fmt_vsldoi128(instr: &DecodedInstr) -> DisasmText {
+    let vd = instr.vd128();
+    let va = instr.va128();
+    let vb = instr.vb128();
+    let sh = (instr.raw >> 6) & 0xF;
+    base("vsldoi128", format!("{}, {}, {}, {sh}", vr(vd), vr(va), vr(vb)), 10)
+}
+
+fn fmt_vpermwi128(instr: &DecodedInstr) -> DisasmText {
+    let vd = instr.vd128();
+    let vb = instr.vb128();
+    // UIMM combines bits 11-15 (low 5) with bits 23-25 (upper 3).
+    let lo = (instr.raw >> 16) & 0x1F;
+    let hi = (instr.raw >> 6) & 0x7;
+    let uimm = lo | (hi << 5);
+    base("vpermwi128", format!("{}, {}, 0x{uimm:X}", vr(vd), vr(vb)), 11)
+}
+
+fn fmt_vmx128_pack_d3d(instr: &DecodedInstr, mnem: &str) -> DisasmText {
+    let vd = instr.vd128();
+    let vb = instr.vb128();
+    let imm = (instr.raw >> 16) & 0x1F;
+    let z = (instr.raw >> 6) & 0x3;
+    base(mnem, format!("{}, {}, {imm}, {z}", vr(vd), vr(vb)), 10)
+}
+
+fn fmt_vmx128_unary(instr: &DecodedInstr, mnem: &str) -> DisasmText {
+    let vd = instr.vd128();
+    let vb = instr.vb128();
+    base(mnem, format!("{}, {}", vr(vd), vr(vb)), 12)
+}
+
+fn fmt_vmx128_uimm(instr: &DecodedInstr, mnem: &str) -> DisasmText {
+    let vd = instr.vd128();
+    let vb = instr.vb128();
+    let uimm = extract_vx128_uimm5(instr.raw);
+    base(mnem, format!("{}, {}, {uimm}", vr(vd), vr(vb)), 12)
+}
+
+fn fmt_vmx128_cmp(instr: &DecodedInstr, mnem: &str) -> DisasmText {
+    let vd = instr.vd128();
+    let va = instr.va128();
+    let vb = instr.vb128();
+    // Rc bit at position 25 in VMX128 cmp form.
+    let rc = if (instr.raw >> 6) & 1 != 0 { "." } else { "" };
+    let full = format!("{mnem}{rc}");
+    base(&full, format!("{}, {}, {}", vr(vd), vr(va), vr(vb)), 14)
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
     use crate::decoder::decode;
 
     #[test]
-    fn test_disasm_nop() {
-        // ori r0, r0, 0 = NOP
+    fn nop_collapses_via_extended() {
         let instr = decode(0x60000000, 0);
-        let text = disassemble(&instr);
-        assert!(text.contains("ori"), "Expected 'ori', got: {}", text);
+        let t = format(&instr);
+        assert_eq!(t.mnemonic, "ori");
+        assert_eq!(t.ext_mnemonic.as_deref(), Some("nop"));
+        assert_eq!(t.display(), "nop");
     }
 
     #[test]
-    fn test_disasm_addi() {
-        let raw = (14u32 << 26) | (3 << 21) | (1 << 16) | 16;
+    fn addi_to_li_when_ra_zero() {
+        // addi r3, r0, 16
+        let raw = (14u32 << 26) | (3 << 21) | (0 << 16) | 16;
         let instr = decode(raw, 0);
-        let text = disassemble(&instr);
-        assert!(text.contains("addi"), "Got: {}", text);
-        assert!(text.contains("r3"), "Got: {}", text);
+        let t = format(&instr);
+        assert_eq!(t.mnemonic, "addi");
+        assert_eq!(t.ext_mnemonic.as_deref(), Some("li"));
+        assert_eq!(t.ext_operands.as_deref(), Some("r3, 16"));
+    }
+
+    #[test]
+    fn rlwinm_dot_preserves_record_bit() {
+        // Same pattern as the Sylpheed graphics-callback test:
+        // rlwinm. r11, r11, 0, 31, 31 with Rc=1
+        let raw = (21u32 << 26) | (11 << 21) | (11 << 16)
+                | (0 << 11) | (31 << 6) | (31 << 1) | 1;
+        let instr = decode(raw, 0);
+        let t = format(&instr);
+        assert!(t.disasm.starts_with("rlwinm."), "got: {}", t.disasm);
+    }
+
+    #[test]
+    fn rlwinm_no_dot_when_rc_unset() {
+        let raw = (21u32 << 26) | (11 << 21) | (11 << 16)
+                | (0 << 11) | (31 << 6) | (31 << 1);
+        let instr = decode(raw, 0);
+        let t = format(&instr);
+        assert_eq!(t.mnemonic, "rlwinm");
+        assert!(!t.mnemonic.ends_with('.'));
+    }
+
+    #[test]
+    fn or_with_same_source_is_mr() {
+        // or r3, r4, r4 → mr r3, r4
+        let raw = (31u32 << 26) | (4 << 21) | (3 << 16) | (4 << 11) | (444 << 1);
+        let instr = decode(raw, 0);
+        let t = format(&instr);
+        assert_eq!(t.ext_mnemonic.as_deref(), Some("mr"));
+        assert_eq!(t.ext_operands.as_deref(), Some("r3, r4"));
+    }
+
+    #[test]
+    fn unconditional_branch_resolves_target() {
+        // b +0x100 with addr=0x82000000
+        let raw = (18u32 << 26) | (0x40 << 2);
+        let instr = decode(raw, 0x82000000);
+        let t = format(&instr);
+        assert_eq!(t.mnemonic, "b");
+        assert_eq!(t.branch_target, Some(0x82000100));
+        assert_eq!(t.operands, "0x82000100");
+    }
+
+    #[test]
+    fn bclr_unconditional_is_blr() {
+        // bclr 20, 0
+        let raw = (19u32 << 26) | (20 << 21) | (0 << 16) | (16 << 1);
+        let instr = decode(raw, 0);
+        let t = format(&instr);
+        assert_eq!(t.ext_mnemonic.as_deref(), Some("blr"));
+    }
+
+    #[test]
+    fn back_compat_disassemble_returns_display() {
+        let instr = decode(0x60000000, 0);
+        assert_eq!(disassemble(&instr), "nop");
+    }
+
+    #[test]
+    fn iter_disasm_walks_byte_slice_in_order() {
+        // Three instructions at 0x82000000: nop, addi r3,r0,16, b +0x100.
+        let mut bytes = Vec::new();
+        bytes.extend_from_slice(&0x60000000u32.to_be_bytes()); // nop
+        bytes.extend_from_slice(&((14u32 << 26) | (3 << 21) | (0 << 16) | 16).to_be_bytes()); // addi
+        bytes.extend_from_slice(&((18u32 << 26) | (0x40 << 2)).to_be_bytes()); // b +0x100
+
+        let items: Vec<_> = super::iter_disasm(&bytes, 0x82000000, 0x82000000, 0x82000000 + 12)
+            .collect();
+        assert_eq!(items.len(), 3);
+        assert_eq!(items[0].addr, 0x82000000);
+        assert_eq!(items[0].text.ext_mnemonic.as_deref(), Some("nop"));
+        assert_eq!(items[1].addr, 0x82000004);
+        assert_eq!(items[1].text.ext_mnemonic.as_deref(), Some("li"));
+        assert_eq!(items[2].addr, 0x82000008);
+        assert_eq!(items[2].text.branch_target, Some(0x82000108));
+    }
+
+    #[test]
+    fn iter_disasm_stops_on_truncated_tail() {
+        // 6 bytes — one full instruction + 2 dangling. Iterator must yield exactly 1.
+        let mut bytes = Vec::new();
+        bytes.extend_from_slice(&0x60000000u32.to_be_bytes());
+        bytes.push(0x60); bytes.push(0x00);
+
+        let items: Vec<_> = super::iter_disasm(&bytes, 0, 0, 6).collect();
+        assert_eq!(items.len(), 1);
     }
 }
diff --git a/crates/xenia-cpu/src/fpscr.rs b/crates/xenia-cpu/src/fpscr.rs
new file mode 100644
index 0000000..1d05e67
--- /dev/null
+++ b/crates/xenia-cpu/src/fpscr.rs
@@ -0,0 +1,384 @@
+//! FPSCR (Floating-Point Status and Control Register) maintenance.
+//!
+//! Scope per project plan: rounding modes honoured, plus the exception bits
+//! games actually read (FX, FEX, VX, OX, UX, ZX, XX, FI, FPRF). Enabled-
+//! exception dispatch (FE[0,1], VE/OE/UE/ZE/XE) is *not* modelled — games
+//! running on Xenon almost never take FP traps.
+//!
+//! Bit layout (PowerISA, MSB-0 numbering; stored in a u32 with bit 31 = MSB):
+//!
+//! | PPC bit | u32 mask                | Name        |
+//! |---------|-------------------------|-------------|
+//! | 0       | `1<<31`                 | FX          |
+//! | 1       | `1<<30`                 | FEX         |
+//! | 2       | `1<<29`                 | VX (summary)|
+//! | 3       | `1<<28`                 | OX          |
+//! | 4       | `1<<27`                 | UX          |
+//! | 5       | `1<<26`                 | ZX          |
+//! | 6       | `1<<25`                 | XX          |
+//! | 7       | `1<<24`                 | VXSNAN      |
+//! | 8       | `1<<23`                 | VXISI       |
+//! | 9       | `1<<22`                 | VXIDI       |
+//! | 10      | `1<<21`                 | VXZDZ       |
+//! | 11      | `1<<20`                 | VXIMZ       |
+//! | 12      | `1<<19`                 | VXVC        |
+//! | 13      | `1<<18`                 | FR          |
+//! | 14      | `1<<17`                 | FI          |
+//! | 15..19  | `0xF8000 >> 15` @ 15..19 | FPRF (5 bits)|
+//! | 21      | `1<<10`                 | VXSOFT      |
+//! | 22      | `1<<9`                  | VXSQRT      |
+//! | 23      | `1<<8`                  | VXCVI       |
+//! | 30..31  | `0x3`                   | RN (2 bits) |
+
+use crate::context::PpcContext;
+
+pub const FX: u32      = 1 << 31;
+pub const FEX: u32     = 1 << 30;
+pub const VX: u32      = 1 << 29;
+pub const OX: u32      = 1 << 28;
+pub const UX: u32      = 1 << 27;
+pub const ZX: u32      = 1 << 26;
+pub const XX: u32      = 1 << 25;
+pub const VXSNAN: u32  = 1 << 24;
+pub const VXISI: u32   = 1 << 23;
+pub const VXIDI: u32   = 1 << 22;
+pub const VXZDZ: u32   = 1 << 21;
+pub const VXIMZ: u32   = 1 << 20;
+pub const VXVC: u32    = 1 << 19;
+pub const FR: u32      = 1 << 18;
+pub const FI: u32      = 1 << 17;
+pub const FPRF_MASK: u32 = 0x1F << 12;  // bits 15..19
+pub const VXSOFT: u32  = 1 << 10;
+pub const VXSQRT: u32  = 1 << 9;
+pub const VXCVI: u32   = 1 << 8;
+pub const RN_MASK: u32 = 0x3;
+
+/// Union of all VX* bits (used for the VX summary recomputation).
+pub const VX_ALL: u32 = VXSNAN | VXISI | VXIDI | VXZDZ | VXIMZ | VXVC | VXSOFT | VXSQRT | VXCVI;
+
+/// FPRF classification codes (5-bit, placed in FPSCR bits 15..19).
+/// The high bit ("C" in PowerISA) distinguishes ±zero/±denormal/QNaN from
+/// ±normal/±inf. The next 4 bits are (FL, FG, FE, FU) = (less, greater, equal, unordered).
+pub mod fprf {
+    pub const QNAN: u8         = 0b1_0001;
+    pub const NEG_INF: u8      = 0b0_1001;
+    pub const NEG_NORMAL: u8   = 0b0_1000;
+    pub const NEG_DENORMAL: u8 = 0b1_1000;
+    pub const NEG_ZERO: u8     = 0b1_0010;
+    pub const POS_ZERO: u8     = 0b0_0010;
+    pub const POS_DENORMAL: u8 = 0b1_0100;
+    pub const POS_NORMAL: u8   = 0b0_0100;
+    pub const POS_INF: u8      = 0b0_0101;
+}
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub enum RoundingMode {
+    NearestEven,    // RN=00
+    TowardZero,     // RN=01
+    TowardPosInf,   // RN=10
+    TowardNegInf,   // RN=11
+}
+
+pub fn rounding_mode(ctx: &PpcContext) -> RoundingMode {
+    match ctx.fpscr & RN_MASK {
+        0 => RoundingMode::NearestEven,
+        1 => RoundingMode::TowardZero,
+        2 => RoundingMode::TowardPosInf,
+        _ => RoundingMode::TowardNegInf,
+    }
+}
+
+/// Classify a finite f64 into its FPRF 5-bit code.
+pub fn classify_fprf(v: f64) -> u8 {
+    if v.is_nan() {
+        fprf::QNAN
+    } else if v.is_infinite() {
+        if v.is_sign_negative() { fprf::NEG_INF } else { fprf::POS_INF }
+    } else if v == 0.0 {
+        if v.is_sign_negative() { fprf::NEG_ZERO } else { fprf::POS_ZERO }
+    } else if v.is_subnormal() {
+        if v.is_sign_negative() { fprf::NEG_DENORMAL } else { fprf::POS_DENORMAL }
+    } else if v.is_sign_negative() { fprf::NEG_NORMAL } else { fprf::POS_NORMAL }
+}
+
+/// Write FPRF into FPSCR, preserving other bits.
+pub fn set_fprf(ctx: &mut PpcContext, code: u8) {
+    ctx.fpscr = (ctx.fpscr & !FPRF_MASK) | ((code as u32 & 0x1F) << 12);
+}
+
+/// Set one or more exception bits on FPSCR, maintaining FX (sticky set on any
+/// new exception) and VX (summary of VX* bits).
+pub fn set_exception(ctx: &mut PpcContext, bits: u32) {
+    let prev = ctx.fpscr;
+    let new = prev | bits;
+    // FX is sticky-set if any new non-sticky bit transitions to 1. PPC defines
+    // FX as "any of OX, UX, ZX, XX, VX* newly set". Compute the transition set.
+    let transition = (new & !prev) & (OX | UX | ZX | XX | VX_ALL);
+    let mut updated = new;
+    if transition != 0 {
+        updated |= FX;
+    }
+    // Recompute VX summary from any VX* bits currently set.
+    if (updated & VX_ALL) != 0 { updated |= VX; }
+    ctx.fpscr = updated;
+}
+
+/// Classify the inputs of a floating-point arithmetic op and set appropriate
+/// VX* bits. Returns true if any invalid-operation was detected (caller may
+/// want to write a default QNaN result).
+///
+/// Detected cases:
+///   * any SNaN input → VXSNAN
+///   * infinity - infinity (same sign) → VXISI
+///   * 0 / 0 → VXZDZ
+///   * infinity / infinity → VXIDI
+///   * 0 * infinity → VXIMZ
+pub fn check_invalid_add(ctx: &mut PpcContext, a: f64, b: f64, sub: bool) -> bool {
+    let mut bits = 0u32;
+    if is_snan(a) || is_snan(b) { bits |= VXSNAN; }
+    if a.is_infinite() && b.is_infinite() {
+        // For add: VXISI iff same-sign(a,b) negated — inf - inf
+        // For sub: VXISI iff same-sign(a,b) — (+inf) - (+inf) or (-inf) - (-inf)
+        let both_pos = a.is_sign_positive() && b.is_sign_positive();
+        let both_neg = a.is_sign_negative() && b.is_sign_negative();
+        if sub {
+            if both_pos || both_neg { bits |= VXISI; }
+        } else {
+            // add: opposite signs cancel to inf-inf
+            if a.is_sign_positive() != b.is_sign_positive() { bits |= VXISI; }
+        }
+    }
+    if bits != 0 { set_exception(ctx, bits); return true; }
+    false
+}
+
+pub fn check_invalid_mul(ctx: &mut PpcContext, a: f64, b: f64) -> bool {
+    let mut bits = 0u32;
+    if is_snan(a) || is_snan(b) { bits |= VXSNAN; }
+    let zero_times_inf =
+        (a == 0.0 && b.is_infinite()) || (b == 0.0 && a.is_infinite());
+    if zero_times_inf { bits |= VXIMZ; }
+    if bits != 0 { set_exception(ctx, bits); return true; }
+    false
+}
+
+pub fn check_invalid_div(ctx: &mut PpcContext, a: f64, b: f64) -> bool {
+    let mut bits = 0u32;
+    if is_snan(a) || is_snan(b) { bits |= VXSNAN; }
+    if a == 0.0 && b == 0.0 { bits |= VXZDZ; }
+    if a.is_infinite() && b.is_infinite() { bits |= VXIDI; }
+    if bits != 0 { set_exception(ctx, bits); return true; }
+    false
+}
+
+/// Divide-by-zero (finite nonzero / 0) — sets ZX but not VX.
+pub fn check_zero_divide(ctx: &mut PpcContext, a: f64, b: f64) {
+    if b == 0.0 && a != 0.0 && !a.is_nan() && !a.is_infinite() {
+        set_exception(ctx, ZX);
+    }
+}
+
+/// Post-op: classify the result and update FPRF + detect overflow/underflow/inexact.
+/// `inputs_finite` lets us suppress OX for ops whose output is infinite because
+/// an input already was.
+pub fn update_after_op(ctx: &mut PpcContext, result: f64, inputs_were_finite: bool) {
+    let mut bits = 0u32;
+    if result.is_infinite() && inputs_were_finite {
+        bits |= OX;
+    }
+    if result.is_subnormal() {
+        bits |= UX;
+    }
+    if bits != 0 { set_exception(ctx, bits); }
+    set_fprf(ctx, classify_fprf(result));
+}
+
+/// Test whether an f64 is a signalling NaN.
+/// In IEEE 754-2008 (binary64), the signalling bit is the high bit of the
+/// mantissa. SNaN has it clear, QNaN has it set. NaN with high mantissa bit
+/// clear (and mantissa nonzero) is an SNaN.
+pub fn is_snan(x: f64) -> bool {
+    if !x.is_nan() { return false; }
+    let bits = x.to_bits();
+    // Highest mantissa bit (bit 51) clear ⇒ SNaN. Mantissa nonzero always true for NaN.
+    (bits & (1u64 << 51)) == 0
+}
+
+/// Round an f64 to f32 honouring FPSCR[RN]. Uses the current hardware
+/// rounding mode when RN=0 (nearest-even, the PPC default), otherwise
+/// emulates the directed rounding via bit-manipulation.
+pub fn round_to_single(ctx: &PpcContext, v: f64) -> f64 {
+    match rounding_mode(ctx) {
+        RoundingMode::NearestEven => (v as f32) as f64,
+        RoundingMode::TowardZero => round_single_toward_zero(v) as f64,
+        RoundingMode::TowardPosInf => round_single_toward_pos_inf(v) as f64,
+        RoundingMode::TowardNegInf => round_single_toward_neg_inf(v) as f64,
+    }
+}
+
+/// Round an f64 to an i64 integer honouring FPSCR[RN]. Used by fctidx.
+pub fn round_to_i64(ctx: &PpcContext, v: f64) -> i64 {
+    match rounding_mode(ctx) {
+        RoundingMode::NearestEven => {
+            // Round-half-to-even (banker's rounding).
+            let r = v.round();
+            // Rust's f64::round is round-half-away-from-zero. Correct ties to even:
+            let diff = (v - v.trunc()).abs();
+            if (diff - 0.5).abs() < f64::EPSILON {
+                let floor = v.floor();
+                if (floor as i64) & 1 == 0 { floor as i64 } else { v.ceil() as i64 }
+            } else {
+                r as i64
+            }
+        }
+        RoundingMode::TowardZero => v.trunc() as i64,
+        RoundingMode::TowardPosInf => v.ceil() as i64,
+        RoundingMode::TowardNegInf => v.floor() as i64,
+    }
+}
+
+/// Round an f64 to an i32 integer honouring FPSCR[RN]. Used by fctiwx.
+pub fn round_to_i32(ctx: &PpcContext, v: f64) -> i32 {
+    round_to_i64(ctx, v).clamp(i32::MIN as i64, i32::MAX as i64) as i32
+}
+
+// ------ directed rounding helpers (f64 → f32) ------
+
+fn round_single_toward_zero(v: f64) -> f32 {
+    // Default f64→f32 is round-to-nearest-even. Emulate truncation:
+    // take the default rounded value; if the absolute rounded magnitude
+    // exceeds |v|, bump down by one ULP toward zero.
+    let rn = v as f32;
+    if rn.is_nan() || rn.is_infinite() || rn == 0.0 { return rn; }
+    if rn.abs() as f64 <= v.abs() { return rn; }
+    let adj_bits = rn.to_bits();
+    let lower = if rn.is_sign_positive() { adj_bits - 1 } else { adj_bits - 1 };
+    f32::from_bits(lower)
+}
+
+fn round_single_toward_pos_inf(v: f64) -> f32 {
+    let rn = v as f32;
+    if rn.is_nan() || rn.is_infinite() { return rn; }
+    if (rn as f64) >= v { return rn; }
+    // rn < v — bump up by one ULP in the +direction.
+    let b = rn.to_bits();
+    let nb = if rn.is_sign_negative() { b - 1 } else { b + 1 };
+    f32::from_bits(nb)
+}
+
+fn round_single_toward_neg_inf(v: f64) -> f32 {
+    let rn = v as f32;
+    if rn.is_nan() || rn.is_infinite() { return rn; }
+    if (rn as f64) <= v { return rn; }
+    // rn > v — bump down.
+    let b = rn.to_bits();
+    let nb = if rn.is_sign_negative() { b + 1 } else { b - 1 };
+    f32::from_bits(nb)
+}
+
+/// Drop-in replacement for the old `update_cr1_from_fpscr`. Reads the
+/// currently-maintained FPSCR bits (FX, FEX, VX, OX) into CR1.
+pub fn update_cr1(ctx: &mut PpcContext) {
+    ctx.cr[1].lt = (ctx.fpscr & FX) != 0;
+    ctx.cr[1].gt = (ctx.fpscr & FEX) != 0;
+    ctx.cr[1].eq = (ctx.fpscr & VX) != 0;
+    ctx.cr[1].so = (ctx.fpscr & OX) != 0;
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn ctx() -> PpcContext { PpcContext::new() }
+
+    #[test]
+    fn rn_default_is_nearest() {
+        assert_eq!(rounding_mode(&ctx()), RoundingMode::NearestEven);
+    }
+
+    #[test]
+    fn rn_bits_decode() {
+        let mut c = ctx();
+        c.fpscr = 0x1;
+        assert_eq!(rounding_mode(&c), RoundingMode::TowardZero);
+        c.fpscr = 0x2;
+        assert_eq!(rounding_mode(&c), RoundingMode::TowardPosInf);
+        c.fpscr = 0x3;
+        assert_eq!(rounding_mode(&c), RoundingMode::TowardNegInf);
+    }
+
+    #[test]
+    fn fprf_classifies_correctly() {
+        assert_eq!(classify_fprf(1.0), fprf::POS_NORMAL);
+        assert_eq!(classify_fprf(-1.0), fprf::NEG_NORMAL);
+        assert_eq!(classify_fprf(0.0), fprf::POS_ZERO);
+        assert_eq!(classify_fprf(-0.0), fprf::NEG_ZERO);
+        assert_eq!(classify_fprf(f64::INFINITY), fprf::POS_INF);
+        assert_eq!(classify_fprf(f64::NEG_INFINITY), fprf::NEG_INF);
+        assert_eq!(classify_fprf(f64::NAN), fprf::QNAN);
+        assert_eq!(classify_fprf(f64::from_bits(1)), fprf::POS_DENORMAL);
+    }
+
+    #[test]
+    fn fx_is_sticky_on_new_exception() {
+        let mut c = ctx();
+        set_exception(&mut c, OX);
+        assert_ne!(c.fpscr & FX, 0);
+        // Clear FX/OX manually.
+        c.fpscr &= !(FX | OX);
+        // Re-set OX; FX should re-latch.
+        set_exception(&mut c, OX);
+        assert_ne!(c.fpscr & FX, 0);
+    }
+
+    #[test]
+    fn vx_summary_set_on_any_vx_bit() {
+        let mut c = ctx();
+        set_exception(&mut c, VXSNAN);
+        assert_ne!(c.fpscr & VX, 0);
+        assert_ne!(c.fpscr & VXSNAN, 0);
+    }
+
+    #[test]
+    fn round_to_single_nearest_is_identity_on_representable() {
+        let c = ctx();
+        assert_eq!(round_to_single(&c, 1.0_f64), 1.0_f64);
+    }
+
+    #[test]
+    fn round_to_i32_clamps_out_of_range() {
+        let c = ctx();
+        assert_eq!(round_to_i32(&c, 1e20_f64), i32::MAX);
+        assert_eq!(round_to_i32(&c, -1e20_f64), i32::MIN);
+    }
+
+    #[test]
+    fn round_to_i64_nearest_even_on_tie() {
+        let c = ctx();
+        assert_eq!(round_to_i64(&c, 2.5_f64), 2);
+        assert_eq!(round_to_i64(&c, 3.5_f64), 4);
+        assert_eq!(round_to_i64(&c, -2.5_f64), -2);
+    }
+
+    #[test]
+    fn check_invalid_add_detects_inf_minus_inf() {
+        let mut c = ctx();
+        assert!(check_invalid_add(&mut c, f64::INFINITY, f64::INFINITY, true));
+        assert_ne!(c.fpscr & VXISI, 0);
+    }
+
+    #[test]
+    fn check_invalid_div_detects_zero_over_zero() {
+        let mut c = ctx();
+        assert!(check_invalid_div(&mut c, 0.0, 0.0));
+        assert_ne!(c.fpscr & VXZDZ, 0);
+    }
+
+    #[test]
+    fn snan_detection() {
+        // SNaN in binary64: sign=0, exp=all-ones, mantissa nonzero with bit 51 clear.
+        let snan = f64::from_bits(0x7FF0_0000_0000_0001);
+        assert!(is_snan(snan));
+        assert!(!is_snan(f64::NAN));
+    }
+}
diff --git a/crates/xenia-cpu/src/interpreter.rs b/crates/xenia-cpu/src/interpreter.rs
index 02c4bd7..4d26a97 100644
--- a/crates/xenia-cpu/src/interpreter.rs
+++ b/crates/xenia-cpu/src/interpreter.rs
@@ -4,9 +4,17 @@
 
 use crate::context::PpcContext;
 use crate::decoder::{decode, DecodedInstr};
+use crate::fpscr;
 use crate::opcode::PpcOpcode;
+use crate::overflow;
+use crate::trap;
+use crate::vmx;
 use xenia_memory::MemoryAccess;
 
+/// Xenon reservation granule: one L2 cache line (128 bytes).
+/// `reserved_line = ea & !RESERVATION_MASK` in [context::PpcContext].
+pub const RESERVATION_MASK: u32 = 0x7F;
+
 /// Result of executing a single instruction.
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub enum StepResult {
@@ -23,7 +31,7 @@ pub enum StepResult {
 }
 
 /// Execute a single PPC instruction.
-pub fn step(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess) -> StepResult {
+pub fn step(ctx: &mut PpcContext, mem: &dyn MemoryAccess) -> StepResult {
     let raw = mem.read_u32(ctx.pc);
     let instr = decode(raw, ctx.pc);
 
@@ -35,8 +43,72 @@ pub fn step(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess) -> StepResult {
     result
 }
 
+/// Tier-2 perf — same semantics as [`step`], but looks the decoded
+/// instruction up in a PC-keyed cache first. Misses fill the cache from
+/// a fresh [`decode`] call; writes to the containing guest page bump
+/// `page_version` and naturally invalidate the entry.
+///
+/// The cache is shared across all HW threads — PC is thread-independent
+/// and `DecodeCacheEntry` stays put after fill. `current_page_version`
+/// is wired through the caller since memory is touched just above anyway
+/// (the `read_u32` + the version read amortize to one touch of the page
+/// table). Use `GuestMemory::page_version(pc)` to source it.
+pub fn step_cached(
+    ctx: &mut PpcContext,
+    mem: &dyn MemoryAccess,
+    cache: &mut crate::decoder::DecodeCache,
+    current_page_version: u64,
+) -> StepResult {
+    let raw = mem.read_u32(ctx.pc);
+    let instr = cache.lookup(ctx.pc, raw, current_page_version);
+
+    let result = execute(ctx, mem, &instr);
+
+    ctx.cycle_count += 1;
+    ctx.timebase += 1;
+
+    result
+}
+
+/// Tier-4 perf — execute every instruction in a pre-decoded
+/// [`crate::block_cache::DecodedBlock`], bumping `cycle_count` and
+/// `timebase` once per executed instruction. Bails out as soon as a
+/// non-`Continue` step result fires (system call, trap, halt, or
+/// unimplemented opcode), or when an instruction unexpectedly changes
+/// the PC mid-block (defensive — only the terminator at the tail of
+/// the block is allowed to do that).
+///
+/// Caller (in `xenia-app/src/main.rs`) is responsible for choosing this
+/// path only when **no per-instruction observation is requested** —
+/// i.e., `Debugger::wants_hooks() == false` and no `--trace-*` flag is
+/// active. Once those gates flip, the caller falls back to
+/// [`step_cached`] so every PC remains observable.
+pub fn step_block(
+    ctx: &mut PpcContext,
+    mem: &dyn MemoryAccess,
+    block: &crate::block_cache::DecodedBlock,
+) -> StepResult {
+    let mut result = StepResult::Continue;
+    for instr in &block.instrs {
+        let expected_next = instr.addr.wrapping_add(4);
+        result = execute(ctx, mem, instr);
+        ctx.cycle_count += 1;
+        ctx.timebase += 1;
+        if !matches!(result, StepResult::Continue) {
+            return result;
+        }
+        // PC discontinuity within a block. By construction only the
+        // terminator (last instruction) can branch — and when it does,
+        // we want to stop here, not continue executing past it.
+        if ctx.pc != expected_next {
+            break;
+        }
+    }
+    result
+}
+
 /// Execute a decoded instruction, updating context and memory.
-fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInstr) -> StepResult {
+fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -> StepResult {
     match instr.opcode {
         // ===== ALU: Immediate =====
         PpcOpcode::addi => {
@@ -45,8 +117,17 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst
             ctx.pc += 4;
         }
         PpcOpcode::addis => {
-            let ra_val = if instr.ra() == 0 { 0 } else { ctx.gpr[instr.ra()] };
-            ctx.gpr[instr.rd()] = ra_val.wrapping_add((instr.simm16() as i64 as u64) << 16);
+            // Xbox 360 user mode is 32-bit ABI (MSR.SF=0), so addis must
+            // produce a value whose upper 32 bits don't pollute downstream
+            // 64-bit arithmetic. The PPC ISA in 64-bit mode sign-extends
+            // simm16 before the shift, producing 0xFFFFFFFF_xxxx0000 for
+            // negative simm16 (high bit set). When this value flows into
+            // a 64-bit subfc against a zero-extended lwz value, the unsigned
+            // 64-bit comparison yields wrong CA. Truncate to 32 bits to
+            // simulate 32-bit ABI behavior.
+            let ra_val = if instr.ra() == 0 { 0u64 } else { ctx.gpr[instr.ra()] };
+            let result = ra_val.wrapping_add((instr.simm16() as i64 as u64) << 16);
+            ctx.gpr[instr.rd()] = result as u32 as u64;
             ctx.pc += 4;
         }
         PpcOpcode::addic => {
@@ -64,7 +145,7 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst
             ctx.xer_ca = if result < ra { 1 } else { 0 };
             ctx.gpr[instr.rd()] = result;
             // Update CR0
-            ctx.update_cr_signed(0, result as i32 as i64);
+            ctx.update_cr_signed(0, result as i64);
             ctx.pc += 4;
         }
         PpcOpcode::subficx => {
@@ -89,10 +170,10 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst
             let result = ra.wrapping_add(rb);
             ctx.gpr[instr.rd()] = result;
             if instr.oe() {
-                // TODO: overflow detection
+                overflow::apply(ctx, overflow::add_ov_64(ra, rb, result));
             }
             if instr.rc_bit() {
-                ctx.update_cr_signed(0, result as i32 as i64);
+                ctx.update_cr_signed(0, result as i64);
             }
             ctx.pc += 4;
         }
@@ -102,8 +183,11 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst
             let result = ra.wrapping_add(rb);
             ctx.xer_ca = if result < ra { 1 } else { 0 };
             ctx.gpr[instr.rd()] = result;
+            if instr.oe() {
+                overflow::apply(ctx, overflow::add_ov_64(ra, rb, result));
+            }
             if instr.rc_bit() {
-                ctx.update_cr_signed(0, result as i32 as i64);
+                ctx.update_cr_signed(0, result as i64);
             }
             ctx.pc += 4;
         }
@@ -114,8 +198,12 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst
             let result = ra.wrapping_add(rb).wrapping_add(ca);
             ctx.xer_ca = if result < ra || (ca != 0 && result == ra) { 1 } else { 0 };
             ctx.gpr[instr.rd()] = result;
+            if instr.oe() {
+                let true_sum = (ra as i64 as i128) + (rb as i64 as i128) + (ca as i128);
+                overflow::apply(ctx, overflow::sum_overflow_64(true_sum, result));
+            }
             if instr.rc_bit() {
-                ctx.update_cr_signed(0, result as i32 as i64);
+                ctx.update_cr_signed(0, result as i64);
             }
             ctx.pc += 4;
         }
@@ -125,8 +213,12 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst
             let result = ra.wrapping_add(ca);
             ctx.xer_ca = if result < ra { 1 } else { 0 };
             ctx.gpr[instr.rd()] = result;
+            if instr.oe() {
+                let true_sum = (ra as i64 as i128) + (ca as i128);
+                overflow::apply(ctx, overflow::sum_overflow_64(true_sum, result));
+            }
             if instr.rc_bit() {
-                ctx.update_cr_signed(0, result as i32 as i64);
+                ctx.update_cr_signed(0, result as i64);
             }
             ctx.pc += 4;
         }
@@ -136,8 +228,13 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst
             let result = ra.wrapping_add(ca).wrapping_sub(1);
             ctx.xer_ca = if ra != 0 || ca != 0 { 1 } else { 0 };
             ctx.gpr[instr.rd()] = result;
+            if instr.oe() {
+                // RT <- RA + CA + (-1)
+                let true_sum = (ra as i64 as i128) + (ca as i128) - 1;
+                overflow::apply(ctx, overflow::sum_overflow_64(true_sum, result));
+            }
             if instr.rc_bit() {
-                ctx.update_cr_signed(0, result as i32 as i64);
+                ctx.update_cr_signed(0, result as i64);
             }
             ctx.pc += 4;
         }
@@ -146,8 +243,11 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst
             let rb = ctx.gpr[instr.rb()];
             let result = rb.wrapping_sub(ra);
             ctx.gpr[instr.rd()] = result;
+            if instr.oe() {
+                overflow::apply(ctx, overflow::sub_ov_64(ra, rb, result));
+            }
             if instr.rc_bit() {
-                ctx.update_cr_signed(0, result as i32 as i64);
+                ctx.update_cr_signed(0, result as i64);
             }
             ctx.pc += 4;
         }
@@ -157,8 +257,11 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst
             let result = rb.wrapping_sub(ra);
             ctx.xer_ca = if rb >= ra { 1 } else { 0 };
             ctx.gpr[instr.rd()] = result;
+            if instr.oe() {
+                overflow::apply(ctx, overflow::sub_ov_64(ra, rb, result));
+            }
             if instr.rc_bit() {
-                ctx.update_cr_signed(0, result as i32 as i64);
+                ctx.update_cr_signed(0, result as i64);
             }
             ctx.pc += 4;
         }
@@ -169,8 +272,13 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst
             let result = (!ra).wrapping_add(rb).wrapping_add(ca);
             ctx.xer_ca = if rb > ra || (rb == ra && ca != 0) { 1 } else { 0 };
             ctx.gpr[instr.rd()] = result;
+            if instr.oe() {
+                // RT <- !RA + RB + CA  ==  RB - RA - 1 + CA
+                let true_sum = (rb as i64 as i128) - (ra as i64 as i128) - 1 + (ca as i128);
+                overflow::apply(ctx, overflow::sum_overflow_64(true_sum, result));
+            }
             if instr.rc_bit() {
-                ctx.update_cr_signed(0, result as i32 as i64);
+                ctx.update_cr_signed(0, result as i64);
             }
             ctx.pc += 4;
         }
@@ -178,10 +286,17 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst
             let ra = ctx.gpr[instr.ra()];
             let ca = ctx.xer_ca as u64;
             let result = (!ra).wrapping_add(ca);
-            ctx.xer_ca = if !ra != 0 || ca != 0 { 1 } else { 0 };
+            // RT <- !RA + CA (no -1 term). 64-bit carry-out only when
+            // !RA = u64::MAX (i.e. RA = 0) AND CA = 1.
+            ctx.xer_ca = if ra == 0 && ca != 0 { 1 } else { 0 };
             ctx.gpr[instr.rd()] = result;
+            if instr.oe() {
+                // RT <- !RA + CA  ==  -RA - 1 + CA
+                let true_sum = -(ra as i64 as i128) - 1 + (ca as i128);
+                overflow::apply(ctx, overflow::sum_overflow_64(true_sum, result));
+            }
             if instr.rc_bit() {
-                ctx.update_cr_signed(0, result as i32 as i64);
+                ctx.update_cr_signed(0, result as i64);
             }
             ctx.pc += 4;
         }
@@ -191,25 +306,39 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst
             let result = (!ra).wrapping_add(ca).wrapping_sub(1);
             ctx.xer_ca = if (!ra) != 0 || ca != 0 { 1 } else { 0 };
             ctx.gpr[instr.rd()] = result;
+            if instr.oe() {
+                // RT <- !RA + CA + (-1)  ==  -RA - 2 + CA
+                let true_sum = -(ra as i64 as i128) - 2 + (ca as i128);
+                overflow::apply(ctx, overflow::sum_overflow_64(true_sum, result));
+            }
             if instr.rc_bit() {
-                ctx.update_cr_signed(0, result as i32 as i64);
+                ctx.update_cr_signed(0, result as i64);
             }
             ctx.pc += 4;
         }
         PpcOpcode::negx => {
             let ra = ctx.gpr[instr.ra()];
-            ctx.gpr[instr.rd()] = (!ra).wrapping_add(1);
+            let result = (!ra).wrapping_add(1);
+            ctx.gpr[instr.rd()] = result;
+            if instr.oe() {
+                overflow::apply(ctx, overflow::neg_ov_64(ra));
+            }
             if instr.rc_bit() {
-                ctx.update_cr_signed(0, ctx.gpr[instr.rd()] as i32 as i64);
+                ctx.update_cr_signed(0, result as i64);
             }
             ctx.pc += 4;
         }
         PpcOpcode::mullwx => {
             let ra = ctx.gpr[instr.ra()] as i32 as i64;
             let rb = ctx.gpr[instr.rb()] as i32 as i64;
-            ctx.gpr[instr.rd()] = ra.wrapping_mul(rb) as u64;
+            let product = ra.wrapping_mul(rb);
+            ctx.gpr[instr.rd()] = product as u64;
+            if instr.oe() {
+                // OV iff the 64-bit product can't fit into 32-bit signed.
+                overflow::apply(ctx, overflow::mullw_ov(product));
+            }
             if instr.rc_bit() {
-                ctx.update_cr_signed(0, ctx.gpr[instr.rd()] as i32 as i64);
+                ctx.update_cr_signed(0, ctx.gpr[instr.rd()] as i64);
             }
             ctx.pc += 4;
         }
@@ -219,7 +348,7 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst
             let result = ra.wrapping_mul(rb);
             ctx.gpr[instr.rd()] = ((result >> 32) as i32 as i64 as u64) & 0xFFFF_FFFF;
             if instr.rc_bit() {
-                ctx.update_cr_signed(0, ctx.gpr[instr.rd()] as i32 as i64);
+                ctx.update_cr_signed(0, ctx.gpr[instr.rd()] as i64);
             }
             ctx.pc += 4;
         }
@@ -229,33 +358,42 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst
             let result = ra.wrapping_mul(rb);
             ctx.gpr[instr.rd()] = (result >> 32) & 0xFFFF_FFFF;
             if instr.rc_bit() {
-                ctx.update_cr_signed(0, ctx.gpr[instr.rd()] as i32 as i64);
+                ctx.update_cr_signed(0, ctx.gpr[instr.rd()] as i64);
             }
             ctx.pc += 4;
         }
         PpcOpcode::divwx => {
             let ra = ctx.gpr[instr.ra()] as i32;
             let rb = ctx.gpr[instr.rb()] as i32;
-            if rb == 0 || (ra == i32::MIN && rb == -1) {
+            let ov = overflow::divw_ov_signed(ra, rb);
+            if ov {
+                // PPC: RT undefined on div-by-zero / INT_MIN/-1. Canary uses 0.
                 ctx.gpr[instr.rd()] = 0;
             } else {
                 ctx.gpr[instr.rd()] = (ra / rb) as i64 as u64;
             }
+            if instr.oe() {
+                overflow::apply(ctx, ov);
+            }
             if instr.rc_bit() {
-                ctx.update_cr_signed(0, ctx.gpr[instr.rd()] as i32 as i64);
+                ctx.update_cr_signed(0, ctx.gpr[instr.rd()] as i64);
             }
             ctx.pc += 4;
         }
         PpcOpcode::divwux => {
             let ra = ctx.gpr[instr.ra()] as u32;
             let rb = ctx.gpr[instr.rb()] as u32;
-            if rb == 0 {
+            let ov = overflow::divw_ov_unsigned(rb);
+            if ov {
                 ctx.gpr[instr.rd()] = 0;
             } else {
                 ctx.gpr[instr.rd()] = (ra / rb) as u64;
             }
+            if instr.oe() {
+                overflow::apply(ctx, ov);
+            }
             if instr.rc_bit() {
-                ctx.update_cr_signed(0, ctx.gpr[instr.rd()] as i32 as i64);
+                ctx.update_cr_signed(0, ctx.gpr[instr.rd()] as i64);
             }
             ctx.pc += 4;
         }
@@ -265,6 +403,9 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst
             let ra = ctx.gpr[instr.ra()] as i64;
             let rb = ctx.gpr[instr.rb()] as i64;
             ctx.gpr[instr.rd()] = ra.wrapping_mul(rb) as u64;
+            if instr.oe() {
+                overflow::apply(ctx, overflow::mulld_ov(ra, rb));
+            }
             if instr.rc_bit() {
                 ctx.update_cr_signed(0, ctx.gpr[instr.rd()] as i64);
             }
@@ -291,11 +432,15 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst
         PpcOpcode::divdx => {
             let ra = ctx.gpr[instr.ra()] as i64;
             let rb = ctx.gpr[instr.rb()] as i64;
-            if rb == 0 || (ra == i64::MIN && rb == -1) {
+            let ov = overflow::divd_ov_signed(ra, rb);
+            if ov {
                 ctx.gpr[instr.rd()] = 0;
             } else {
                 ctx.gpr[instr.rd()] = (ra / rb) as u64;
             }
+            if instr.oe() {
+                overflow::apply(ctx, ov);
+            }
             if instr.rc_bit() {
                 ctx.update_cr_signed(0, ctx.gpr[instr.rd()] as i64);
             }
@@ -304,11 +449,15 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst
         PpcOpcode::divdux => {
             let ra = ctx.gpr[instr.ra()];
             let rb = ctx.gpr[instr.rb()];
-            if rb == 0 {
+            let ov = overflow::divd_ov_unsigned(rb);
+            if ov {
                 ctx.gpr[instr.rd()] = 0;
             } else {
                 ctx.gpr[instr.rd()] = ra / rb;
             }
+            if instr.oe() {
+                overflow::apply(ctx, ov);
+            }
             if instr.rc_bit() {
                 ctx.update_cr_signed(0, ctx.gpr[instr.rd()] as i64);
             }
@@ -318,12 +467,12 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst
         // ===== Logical =====
         PpcOpcode::andix => {
             ctx.gpr[instr.ra()] = ctx.gpr[instr.rs()] & (instr.uimm16() as u64);
-            ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i32 as i64);
+            ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i64);
             ctx.pc += 4;
         }
         PpcOpcode::andisx => {
             ctx.gpr[instr.ra()] = ctx.gpr[instr.rs()] & ((instr.uimm16() as u64) << 16);
-            ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i32 as i64);
+            ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i64);
             ctx.pc += 4;
         }
         PpcOpcode::ori => {
@@ -344,54 +493,54 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst
         }
         PpcOpcode::andx => {
             ctx.gpr[instr.ra()] = ctx.gpr[instr.rs()] & ctx.gpr[instr.rb()];
-            if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i32 as i64); }
+            if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i64); }
             ctx.pc += 4;
         }
         PpcOpcode::andcx => {
             ctx.gpr[instr.ra()] = ctx.gpr[instr.rs()] & !ctx.gpr[instr.rb()];
-            if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i32 as i64); }
+            if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i64); }
             ctx.pc += 4;
         }
         PpcOpcode::orx => {
             ctx.gpr[instr.ra()] = ctx.gpr[instr.rs()] | ctx.gpr[instr.rb()];
-            if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i32 as i64); }
+            if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i64); }
             ctx.pc += 4;
         }
         PpcOpcode::orcx => {
             ctx.gpr[instr.ra()] = ctx.gpr[instr.rs()] | !ctx.gpr[instr.rb()];
-            if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i32 as i64); }
+            if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i64); }
             ctx.pc += 4;
         }
         PpcOpcode::xorx => {
             ctx.gpr[instr.ra()] = ctx.gpr[instr.rs()] ^ ctx.gpr[instr.rb()];
-            if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i32 as i64); }
+            if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i64); }
             ctx.pc += 4;
         }
         PpcOpcode::norx => {
             ctx.gpr[instr.ra()] = !(ctx.gpr[instr.rs()] | ctx.gpr[instr.rb()]);
-            if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i32 as i64); }
+            if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i64); }
             ctx.pc += 4;
         }
         PpcOpcode::nandx => {
             ctx.gpr[instr.ra()] = !(ctx.gpr[instr.rs()] & ctx.gpr[instr.rb()]);
-            if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i32 as i64); }
+            if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i64); }
             ctx.pc += 4;
         }
         PpcOpcode::eqvx => {
             ctx.gpr[instr.ra()] = !(ctx.gpr[instr.rs()] ^ ctx.gpr[instr.rb()]);
-            if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i32 as i64); }
+            if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i64); }
             ctx.pc += 4;
         }
 
         // ===== Extend/Count =====
         PpcOpcode::extsbx => {
             ctx.gpr[instr.ra()] = ctx.gpr[instr.rs()] as i8 as i64 as u64;
-            if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i32 as i64); }
+            if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i64); }
             ctx.pc += 4;
         }
         PpcOpcode::extshx => {
             ctx.gpr[instr.ra()] = ctx.gpr[instr.rs()] as i16 as i64 as u64;
-            if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i32 as i64); }
+            if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i64); }
             ctx.pc += 4;
         }
         PpcOpcode::extswx => {
@@ -401,7 +550,7 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst
         }
         PpcOpcode::cntlzwx => {
             ctx.gpr[instr.ra()] = (ctx.gpr[instr.rs()] as u32).leading_zeros() as u64;
-            if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i32 as i64); }
+            if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i64); }
             ctx.pc += 4;
         }
         PpcOpcode::cntlzdx => {
@@ -416,7 +565,7 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst
             ctx.gpr[instr.ra()] = if sh < 32 {
                 ((ctx.gpr[instr.rs()] as u32) << sh) as u64
             } else { 0 };
-            if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i32 as i64); }
+            if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i64); }
             ctx.pc += 4;
         }
         PpcOpcode::srwx => {
@@ -424,7 +573,7 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst
             ctx.gpr[instr.ra()] = if sh < 32 {
                 ((ctx.gpr[instr.rs()] as u32) >> sh) as u64
             } else { 0 };
-            if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i32 as i64); }
+            if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i64); }
             ctx.pc += 4;
         }
         PpcOpcode::srawx => {
@@ -441,7 +590,7 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst
                 ctx.gpr[instr.ra()] = if rs < 0 { u64::MAX } else { 0 };
                 ctx.xer_ca = if rs < 0 { 1 } else { 0 };
             }
-            if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i32 as i64); }
+            if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i64); }
             ctx.pc += 4;
         }
         PpcOpcode::srawix => {
@@ -455,7 +604,7 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst
                 ctx.xer_ca = if rs < 0 && (rs as u32) << (32 - sh) != 0 { 1 } else { 0 };
                 ctx.gpr[instr.ra()] = result as i64 as u64;
             }
-            if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i32 as i64); }
+            if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i64); }
             ctx.pc += 4;
         }
         PpcOpcode::sldx => {
@@ -515,7 +664,7 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst
             let rotated = rs.rotate_left(sh);
             let mask = rlw_mask(mb, me);
             ctx.gpr[instr.ra()] = (rotated & mask) as u64;
-            if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i32 as i64); }
+            if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i64); }
             ctx.pc += 4;
         }
         PpcOpcode::rlwimix => {
@@ -527,7 +676,7 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst
             let mask = rlw_mask(mb, me);
             let ra = ctx.gpr[instr.ra()] as u32;
             ctx.gpr[instr.ra()] = ((rotated & mask) | (ra & !mask)) as u64;
-            if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i32 as i64); }
+            if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i64); }
             ctx.pc += 4;
         }
         PpcOpcode::rlwnmx => {
@@ -538,7 +687,7 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst
             let rotated = rs.rotate_left(sh);
             let mask = rlw_mask(mb, me);
             ctx.gpr[instr.ra()] = (rotated & mask) as u64;
-            if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i32 as i64); }
+            if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i64); }
             ctx.pc += 4;
         }
         PpcOpcode::rldiclx => {
@@ -606,16 +755,26 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst
         PpcOpcode::cmpi => {
             let bf = instr.crfd();
             if instr.l() {
-                // 64-bit compare
+                // 64-bit compare. Compare directly so boundary i64 values
+                // (e.g. ra=i64::MIN, imm=1) don't mis-sign through a
+                // wrapped subtract.
                 let ra = ctx.gpr[instr.ra()] as i64;
                 let imm = instr.simm16() as i64;
-                ctx.update_cr_signed(bf, ra - imm);
-                if ra == imm { ctx.cr[bf].eq = true; }
+                ctx.cr[bf] = crate::context::CrField {
+                    lt: ra < imm,
+                    gt: ra > imm,
+                    eq: ra == imm,
+                    so: ctx.xer_so != 0,
+                };
             } else {
                 let ra = ctx.gpr[instr.ra()] as i32;
                 let imm = instr.simm16() as i32;
-                ctx.update_cr_signed(bf, (ra as i64) - (imm as i64));
-                if ra == imm { ctx.cr[bf].eq = true; }
+                ctx.cr[bf] = crate::context::CrField {
+                    lt: ra < imm,
+                    gt: ra > imm,
+                    eq: ra == imm,
+                    so: ctx.xer_so != 0,
+                };
             }
             ctx.pc += 4;
         }
@@ -637,13 +796,21 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst
             if instr.l() {
                 let ra = ctx.gpr[instr.ra()] as i64;
                 let rb = ctx.gpr[instr.rb()] as i64;
-                ctx.update_cr_signed(bf, ra.wrapping_sub(rb));
-                if ra == rb { ctx.cr[bf].eq = true; }
+                ctx.cr[bf] = crate::context::CrField {
+                    lt: ra < rb,
+                    gt: ra > rb,
+                    eq: ra == rb,
+                    so: ctx.xer_so != 0,
+                };
             } else {
                 let ra = ctx.gpr[instr.ra()] as i32;
                 let rb = ctx.gpr[instr.rb()] as i32;
-                ctx.update_cr_signed(bf, (ra as i64).wrapping_sub(rb as i64));
-                if ra == rb { ctx.cr[bf].eq = true; }
+                ctx.cr[bf] = crate::context::CrField {
+                    lt: ra < rb,
+                    gt: ra > rb,
+                    eq: ra == rb,
+                    so: ctx.xer_so != 0,
+                };
             }
             ctx.pc += 4;
         }
@@ -941,24 +1108,72 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst
         }
 
         // Reservation (lwarx/stwcx)
+        //
+        // M3.7 — when `ctx.reservation_table` is `Some` and the table is
+        // enabled, route reservations through the inter-thread table so
+        // concurrent host threads can mediate reservation conflicts.
+        // Otherwise (the default in lockstep mode), use the legacy
+        // per-`PpcContext` fields. Both paths leave the per-ctx fields
+        // in a coherent state so a flag flip mid-run doesn't corrupt
+        // outstanding reservations.
         PpcOpcode::lwarx => {
             let ea = if instr.ra() == 0 { 0u64 } else { ctx.gpr[instr.ra()] };
             let ea = ea.wrapping_add(ctx.gpr[instr.rb()]) as u32;
             let val = mem.read_u32(ea);
             ctx.gpr[instr.rd()] = val as u64;
-            ctx.reserved_addr = ea;
+            ctx.reserved_line = ea & !RESERVATION_MASK;
             ctx.reserved_val = val as u64;
             ctx.has_reservation = true;
+            if let Some(t) = &ctx.reservation_table {
+                if t.is_enabled() {
+                    ctx.reserved_generation = t.reserve(ea, ctx.hw_id);
+                }
+            }
             ctx.pc += 4;
         }
         PpcOpcode::stwcx => {
             let ea = if instr.ra() == 0 { 0u64 } else { ctx.gpr[instr.ra()] };
             let ea = ea.wrapping_add(ctx.gpr[instr.rb()]) as u32;
-            if ctx.has_reservation && ctx.reserved_addr == ea {
-                mem.write_u32(ea, ctx.gpr[instr.rs()] as u32);
-                ctx.cr[0] = crate::context::CrField { lt: false, gt: false, eq: true, so: ctx.xer_so != 0 };
+            let line = ea & !RESERVATION_MASK;
+            let table_route = ctx
+                .reservation_table
+                .as_ref()
+                .filter(|t| t.is_enabled())
+                .cloned();
+            let success = if let Some(t) = &table_route {
+                // Table-routed: success iff the slot still holds our
+                // reservation AND the per-ctx flag agrees (the per-ctx
+                // flag would be cleared by an intervening write or
+                // context switch).
+                ctx.has_reservation
+                    && ctx.reserved_line == line
+                    && t.try_commit(ea, ctx.reserved_generation, ctx.hw_id)
             } else {
-                ctx.cr[0] = crate::context::CrField { lt: false, gt: false, eq: false, so: ctx.xer_so != 0 };
+                // Legacy per-ctx path (M2 default).
+                ctx.has_reservation && ctx.reserved_line == line
+            };
+            if success {
+                mem.write_u32(ea, ctx.gpr[instr.rs()] as u32);
+                ctx.cr[0] = crate::context::CrField {
+                    lt: false,
+                    gt: false,
+                    eq: true,
+                    so: ctx.xer_so != 0,
+                };
+            } else {
+                ctx.cr[0] = crate::context::CrField {
+                    lt: false,
+                    gt: false,
+                    eq: false,
+                    so: ctx.xer_so != 0,
+                };
+                // Failed stwcx: if we held the reservation in the table
+                // (someone else displaced our gen), release it from the
+                // counter so `has_active_reservers` returns to zero
+                // when no real reserver exists.
+                if let Some(t) = &table_route {
+                    t.release(ea, ctx.reserved_generation, ctx.hw_id);
+                }
             }
             ctx.has_reservation = false;
             ctx.pc += 4;
@@ -1164,8 +1379,22 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst
                 crate::context::spr::XER => ctx.xer() as u64,
                 crate::context::spr::LR => ctx.lr,
                 crate::context::spr::CTR => ctx.ctr,
+                crate::context::spr::DEC => ctx.dec as u64,
                 crate::context::spr::TBL => ctx.timebase & 0xFFFF_FFFF,
                 crate::context::spr::TBU => ctx.timebase >> 32,
+                crate::context::spr::VRSAVE => ctx.vrsave as u64,
+                // Xbox 360 Xenon processor signature (from canary).
+                crate::context::spr::PVR => 0x0071_0800,
+                // Benign SPRs — titles read these but we don't model them.
+                crate::context::spr::SPRG0
+                | crate::context::spr::SPRG1
+                | crate::context::spr::SPRG2
+                | crate::context::spr::SPRG3
+                | crate::context::spr::HID0
+                | crate::context::spr::HID1
+                | crate::context::spr::DAR
+                | crate::context::spr::DSISR
+                | crate::context::spr::PIR => 0,
                 _ => {
                     tracing::warn!("mfspr: unimplemented SPR {}", spr);
                     0
@@ -1180,6 +1409,24 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst
                 crate::context::spr::XER => ctx.set_xer(val as u32),
                 crate::context::spr::LR => ctx.lr = val,
                 crate::context::spr::CTR => ctx.ctr = val,
+                crate::context::spr::DEC => ctx.dec = val as u32,
+                crate::context::spr::TBL_WRITE => {
+                    ctx.timebase = (ctx.timebase & 0xFFFF_FFFF_0000_0000) | (val & 0xFFFF_FFFF);
+                }
+                crate::context::spr::TBU_WRITE => {
+                    ctx.timebase = (ctx.timebase & 0x0000_0000_FFFF_FFFF) | ((val & 0xFFFF_FFFF) << 32);
+                }
+                crate::context::spr::VRSAVE => ctx.vrsave = val as u32,
+                // Benign writes — swallow silently to avoid false Unimplemented
+                // warnings on SPRs that have no observable effect in userspace.
+                crate::context::spr::SPRG0
+                | crate::context::spr::SPRG1
+                | crate::context::spr::SPRG2
+                | crate::context::spr::SPRG3
+                | crate::context::spr::HID0
+                | crate::context::spr::HID1
+                | crate::context::spr::DAR
+                | crate::context::spr::DSISR => {}
                 _ => {
                     tracing::warn!("mtspr: unimplemented SPR {}", spr);
                 }
@@ -1282,11 +1529,25 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst
         }
 
         // ===== Trap =====
-        PpcOpcode::tdi | PpcOpcode::twi | PpcOpcode::td | PpcOpcode::tw => {
-            // For now, just trace and continue
-            tracing::warn!("Trap instruction at {:#010x}: {:?}", ctx.pc, instr.opcode);
+        PpcOpcode::tw | PpcOpcode::twi | PpcOpcode::td | PpcOpcode::tdi => {
+            let a = ctx.gpr[instr.ra()];
+            let b = match instr.opcode {
+                PpcOpcode::twi | PpcOpcode::tdi => instr.simm16() as i64 as u64,
+                _ => ctx.gpr[instr.rb()],
+            };
+            let width = match instr.opcode {
+                PpcOpcode::tw | PpcOpcode::twi => trap::TrapWidth::Word,
+                _ => trap::TrapWidth::Doubleword,
+            };
+            let fired = trap::evaluate(instr.to(), a, b, width);
             ctx.pc += 4;
-            return StepResult::Trap;
+            if fired {
+                tracing::warn!(
+                    "Trap fired at {:#010x}: {:?} TO={} a={:#x} b={:#x}",
+                    ctx.pc.wrapping_sub(4), instr.opcode, instr.to(), a, b
+                );
+                return StepResult::Trap;
+            }
         }
 
         // ===== Byte-reverse loads =====
@@ -1349,19 +1610,80 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst
             ctx.pc += 4;
         }
         // lvewx, lvebx, lvehx all load aligned 16 bytes (per xenia reference)
-        PpcOpcode::lvewx | PpcOpcode::lvebx | PpcOpcode::lvehx => {
-            let ea = if instr.ra() == 0 { 0u64 } else { ctx.gpr[instr.ra()] };
-            let ea = (ea.wrapping_add(ctx.gpr[instr.rb()]) & !0xF) as u32;
-            let mut bytes = [0u8; 16];
-            for i in 0..16 { bytes[i] = mem.read_u8(ea + i as u32); }
+        PpcOpcode::lvebx => {
+            // Load 1 byte from EA into vD[EA & 0xF]. PowerISA marks the
+            // other lanes as "undefined" but real Xenon (and Canary)
+            // preserve their prior contents, so seed from vD.
+            let base = if instr.ra() == 0 { 0u64 } else { ctx.gpr[instr.ra()] };
+            let ea = base.wrapping_add(ctx.gpr[instr.rb()]) as u32;
+            let slot = (ea & 0xF) as usize;
+            let mut bytes = ctx.vr[instr.rd()].as_bytes();
+            bytes[slot] = mem.read_u8(ea);
             ctx.vr[instr.rd()] = xenia_types::Vec128::from_bytes(bytes);
             ctx.pc += 4;
         }
-        PpcOpcode::stvewx | PpcOpcode::stvebx | PpcOpcode::stvehx => {
-            let ea = if instr.ra() == 0 { 0u64 } else { ctx.gpr[instr.ra()] };
-            let ea = (ea.wrapping_add(ctx.gpr[instr.rb()]) & !0xF) as u32;
+        PpcOpcode::lvehx => {
+            // Load a halfword from (EA & ~1) into vD at halfword slot
+            // (EA & 0xF) >> 1. Other halfword lanes preserved (see lvebx).
+            let base = if instr.ra() == 0 { 0u64 } else { ctx.gpr[instr.ra()] };
+            let ea_unaligned = base.wrapping_add(ctx.gpr[instr.rb()]) as u32;
+            let ea = ea_unaligned & !0x1u32;
+            let slot = ((ea_unaligned & 0xF) >> 1) as usize;
+            let mut bytes = ctx.vr[instr.rd()].as_bytes();
+            let h = mem.read_u16(ea);
+            bytes[slot * 2] = (h >> 8) as u8;
+            bytes[slot * 2 + 1] = (h & 0xFF) as u8;
+            ctx.vr[instr.rd()] = xenia_types::Vec128::from_bytes(bytes);
+            ctx.pc += 4;
+        }
+        PpcOpcode::lvewx => {
+            // Load a word from (EA & ~3) into vD at word slot
+            // (EA & 0xF) >> 2. Other word lanes preserved (see lvebx).
+            let base = if instr.ra() == 0 { 0u64 } else { ctx.gpr[instr.ra()] };
+            let ea_unaligned = base.wrapping_add(ctx.gpr[instr.rb()]) as u32;
+            let ea = ea_unaligned & !0x3u32;
+            let slot = ((ea_unaligned & 0xF) >> 2) as usize;
+            let mut bytes = ctx.vr[instr.rd()].as_bytes();
+            let w = mem.read_u32(ea);
+            bytes[slot * 4]     = (w >> 24) as u8;
+            bytes[slot * 4 + 1] = (w >> 16) as u8;
+            bytes[slot * 4 + 2] = (w >> 8) as u8;
+            bytes[slot * 4 + 3] = (w & 0xFF) as u8;
+            ctx.vr[instr.rd()] = xenia_types::Vec128::from_bytes(bytes);
+            ctx.pc += 4;
+        }
+        PpcOpcode::stvebx => {
+            // Store vS[EA & 0xF] (1 byte) to memory at EA.
+            let base = if instr.ra() == 0 { 0u64 } else { ctx.gpr[instr.ra()] };
+            let ea = base.wrapping_add(ctx.gpr[instr.rb()]) as u32;
+            let slot = (ea & 0xF) as usize;
             let bytes = ctx.vr[instr.rs()].as_bytes();
-            for i in 0..16 { mem.write_u8(ea + i as u32, bytes[i]); }
+            mem.write_u8(ea, bytes[slot]);
+            ctx.pc += 4;
+        }
+        PpcOpcode::stvehx => {
+            // Store vS[slot] (1 halfword) at EA & ~1. slot = (EA & 0xF) >> 1.
+            let base = if instr.ra() == 0 { 0u64 } else { ctx.gpr[instr.ra()] };
+            let ea_unaligned = base.wrapping_add(ctx.gpr[instr.rb()]) as u32;
+            let ea = ea_unaligned & !0x1u32;
+            let slot = ((ea_unaligned & 0xF) >> 1) as usize;
+            let bytes = ctx.vr[instr.rs()].as_bytes();
+            let h = ((bytes[slot * 2] as u16) << 8) | (bytes[slot * 2 + 1] as u16);
+            mem.write_u16(ea, h);
+            ctx.pc += 4;
+        }
+        PpcOpcode::stvewx => {
+            // Store vS[slot] (1 word) at EA & ~3. slot = (EA & 0xF) >> 2.
+            let base = if instr.ra() == 0 { 0u64 } else { ctx.gpr[instr.ra()] };
+            let ea_unaligned = base.wrapping_add(ctx.gpr[instr.rb()]) as u32;
+            let ea = ea_unaligned & !0x3u32;
+            let slot = ((ea_unaligned & 0xF) >> 2) as usize;
+            let bytes = ctx.vr[instr.rs()].as_bytes();
+            let w = ((bytes[slot * 4] as u32) << 24)
+                  | ((bytes[slot * 4 + 1] as u32) << 16)
+                  | ((bytes[slot * 4 + 2] as u32) << 8)
+                  | (bytes[slot * 4 + 3] as u32);
+            mem.write_u32(ea, w);
             ctx.pc += 4;
         }
         PpcOpcode::lvxl | PpcOpcode::lvxl128 => {
@@ -1417,40 +1739,69 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst
             ctx.pc += 4;
         }
         PpcOpcode::vmaddfp => {
-            // vD = (vA * vC) + vB
+            // vD = (vA * vC) + vB. AltiVec unconditionally flushes denormal
+            // *inputs* to 0 regardless of VSCR[NJ] (confirmed on POWER8 hw).
             let a = ctx.vr[instr.ra()].as_f32x4();
             let b = ctx.vr[instr.rb()].as_f32x4();
             let c = ctx.vr[instr.rc()].as_f32x4();
             let mut r = [0f32; 4];
-            for i in 0..4 { r[i] = a[i].mul_add(c[i], b[i]); }
+            for i in 0..4 {
+                let ai = vmx::flush_denorm(a[i]);
+                let bi = vmx::flush_denorm(b[i]);
+                let ci = vmx::flush_denorm(c[i]);
+                r[i] = ai.mul_add(ci, bi);
+            }
             ctx.vr[instr.rd()] = xenia_types::Vec128::from_f32x4_array(r);
             ctx.pc += 4;
         }
         PpcOpcode::vmaddfp128 => {
-            let a = ctx.vr[instr.va128()].as_f32x4();
-            let b = ctx.vr[instr.vb128()].as_f32x4();
-            let d = ctx.vr[instr.vd128()].as_f32x4(); // vD is also source (accumulator)
-            let mut r = [0f32; 4];
-            for i in 0..4 { r[i] = a[i].mul_add(b[i], d[i]); }
-            ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r);
-            ctx.pc += 4;
-        }
-        PpcOpcode::vnmsubfp => {
-            // vD = -(vA * vC - vB) = vB - vA * vC
-            let a = ctx.vr[instr.ra()].as_f32x4();
-            let b = ctx.vr[instr.rb()].as_f32x4();
-            let c = ctx.vr[instr.rc()].as_f32x4();
-            let mut r = [0f32; 4];
-            for i in 0..4 { r[i] = b[i] - a[i] * c[i]; }
-            ctx.vr[instr.rd()] = xenia_types::Vec128::from_f32x4_array(r);
-            ctx.pc += 4;
-        }
-        PpcOpcode::vnmsubfp128 => {
+            // VMX128 form: vD <- (vA * vB) + vD (vD reused as accumulator;
+            // Canary `InstrEmit_vmaddfp128` routes guest VA/VB/VD through
+            // `InstrEmit_vmaddfp_` with arg order swapped so the resulting
+            // HIR computation is `VA * VB + VD`). Same unconditional denorm
+            // flush of all three inputs as scalar `vmaddfp`.
             let a = ctx.vr[instr.va128()].as_f32x4();
             let b = ctx.vr[instr.vb128()].as_f32x4();
             let d = ctx.vr[instr.vd128()].as_f32x4();
             let mut r = [0f32; 4];
-            for i in 0..4 { r[i] = d[i] - a[i] * b[i]; }
+            for i in 0..4 {
+                let ai = vmx::flush_denorm(a[i]);
+                let bi = vmx::flush_denorm(b[i]);
+                let di = vmx::flush_denorm(d[i]);
+                r[i] = ai.mul_add(bi, di);
+            }
+            ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vnmsubfp => {
+            // vD = -(vA * vC - vB) = vB - vA * vC. Same denorm-flush rule as vmaddfp.
+            let a = ctx.vr[instr.ra()].as_f32x4();
+            let b = ctx.vr[instr.rb()].as_f32x4();
+            let c = ctx.vr[instr.rc()].as_f32x4();
+            let mut r = [0f32; 4];
+            for i in 0..4 {
+                let ai = vmx::flush_denorm(a[i]);
+                let bi = vmx::flush_denorm(b[i]);
+                let ci = vmx::flush_denorm(c[i]);
+                r[i] = bi - ai * ci;
+            }
+            ctx.vr[instr.rd()] = xenia_types::Vec128::from_f32x4_array(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vnmsubfp128 => {
+            // VMX128 form: vD <- -((vA * vB) - vD) = vD - (vA * vB). Canary
+            // routes through `InstrEmit_vnmsubfp_` with the same arg-swap,
+            // which flushes all inputs unconditionally.
+            let a = ctx.vr[instr.va128()].as_f32x4();
+            let b = ctx.vr[instr.vb128()].as_f32x4();
+            let d = ctx.vr[instr.vd128()].as_f32x4();
+            let mut r = [0f32; 4];
+            for i in 0..4 {
+                let ai = vmx::flush_denorm(a[i]);
+                let bi = vmx::flush_denorm(b[i]);
+                let di = vmx::flush_denorm(d[i]);
+                r[i] = di - ai * bi;
+            }
             ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r);
             ctx.pc += 4;
         }
@@ -1466,7 +1817,7 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst
             let a = ctx.vr[instr.ra()].as_f32x4();
             let b = ctx.vr[instr.rb()].as_f32x4();
             let mut r = [0f32; 4];
-            for i in 0..4 { r[i] = if a[i] > b[i] { a[i] } else { b[i] }; }
+            for i in 0..4 { r[i] = vmx::max_nan(a[i], b[i]); }
             ctx.vr[instr.rd()] = xenia_types::Vec128::from_f32x4_array(r);
             ctx.pc += 4;
         }
@@ -1474,7 +1825,7 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst
             let a = ctx.vr[instr.va128()].as_f32x4();
             let b = ctx.vr[instr.vb128()].as_f32x4();
             let mut r = [0f32; 4];
-            for i in 0..4 { r[i] = if a[i] > b[i] { a[i] } else { b[i] }; }
+            for i in 0..4 { r[i] = vmx::max_nan(a[i], b[i]); }
             ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r);
             ctx.pc += 4;
         }
@@ -1482,7 +1833,7 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst
             let a = ctx.vr[instr.ra()].as_f32x4();
             let b = ctx.vr[instr.rb()].as_f32x4();
             let mut r = [0f32; 4];
-            for i in 0..4 { r[i] = if a[i] < b[i] { a[i] } else { b[i] }; }
+            for i in 0..4 { r[i] = vmx::min_nan(a[i], b[i]); }
             ctx.vr[instr.rd()] = xenia_types::Vec128::from_f32x4_array(r);
             ctx.pc += 4;
         }
@@ -1490,7 +1841,7 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst
             let a = ctx.vr[instr.va128()].as_f32x4();
             let b = ctx.vr[instr.vb128()].as_f32x4();
             let mut r = [0f32; 4];
-            for i in 0..4 { r[i] = if a[i] < b[i] { a[i] } else { b[i] }; }
+            for i in 0..4 { r[i] = vmx::min_nan(a[i], b[i]); }
             ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r);
             ctx.pc += 4;
         }
@@ -1841,14 +2192,15 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst
             ctx.pc += 4;
         }
 
-        // VMX: MFVSCR/MTVSCR
+        // VMX: MFVSCR/MTVSCR — VSCR lives in word 3; only NJ (bit 16) and
+        // SAT (bit 31) are defined. Canary stores the full Vec128 so we do
+        // the same: mfvscr copies the register, mtvscr overwrites it.
         PpcOpcode::mfvscr => {
-            ctx.vr[instr.rd()] = xenia_types::Vec128::from_u32x4(0, 0, 0, ctx.vscr_sat as u32);
+            ctx.vr[instr.rd()] = ctx.vscr;
             ctx.pc += 4;
         }
         PpcOpcode::mtvscr => {
-            let val = ctx.vr[instr.rb()].as_u32x4();
-            ctx.vscr_sat = (val[3] & 1) as u8;
+            ctx.vscr = ctx.vr[instr.rb()];
             ctx.pc += 4;
         }
 
@@ -1888,89 +2240,176 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst
 
         // ===== FPU: Arithmetic =====
         PpcOpcode::faddx => {
-            ctx.fpr[instr.rd()] = ctx.fpr[instr.ra()] + ctx.fpr[instr.rb()];
+            let a = ctx.fpr[instr.ra()];
+            let b = ctx.fpr[instr.rb()];
+            fpscr::check_invalid_add(ctx, a, b, false);
+            let result = a + b;
+            ctx.fpr[instr.rd()] = result;
+            fpscr::update_after_op(ctx, result, a.is_finite() && b.is_finite());
             if instr.rc_bit() { update_cr1_from_fpscr(ctx); }
             ctx.pc += 4;
         }
         PpcOpcode::faddsx => {
-            ctx.fpr[instr.rd()] = to_single(ctx.fpr[instr.ra()] + ctx.fpr[instr.rb()]);
+            let a = ctx.fpr[instr.ra()];
+            let b = ctx.fpr[instr.rb()];
+            fpscr::check_invalid_add(ctx, a, b, false);
+            let result = to_single(ctx, a + b);
+            ctx.fpr[instr.rd()] = result;
+            fpscr::update_after_op(ctx, result, a.is_finite() && b.is_finite());
             if instr.rc_bit() { update_cr1_from_fpscr(ctx); }
             ctx.pc += 4;
         }
         PpcOpcode::fsubx => {
-            ctx.fpr[instr.rd()] = ctx.fpr[instr.ra()] - ctx.fpr[instr.rb()];
+            let a = ctx.fpr[instr.ra()];
+            let b = ctx.fpr[instr.rb()];
+            fpscr::check_invalid_add(ctx, a, b, true);
+            let result = a - b;
+            ctx.fpr[instr.rd()] = result;
+            fpscr::update_after_op(ctx, result, a.is_finite() && b.is_finite());
             if instr.rc_bit() { update_cr1_from_fpscr(ctx); }
             ctx.pc += 4;
         }
         PpcOpcode::fsubsx => {
-            ctx.fpr[instr.rd()] = to_single(ctx.fpr[instr.ra()] - ctx.fpr[instr.rb()]);
+            let a = ctx.fpr[instr.ra()];
+            let b = ctx.fpr[instr.rb()];
+            fpscr::check_invalid_add(ctx, a, b, true);
+            let result = to_single(ctx, a - b);
+            ctx.fpr[instr.rd()] = result;
+            fpscr::update_after_op(ctx, result, a.is_finite() && b.is_finite());
             if instr.rc_bit() { update_cr1_from_fpscr(ctx); }
             ctx.pc += 4;
         }
         PpcOpcode::fmulx => {
             // A-form: frD = frA * frC (frC is at rc() field, bits 21-25)
-            ctx.fpr[instr.rd()] = ctx.fpr[instr.ra()] * ctx.fpr[instr.rc()];
+            let a = ctx.fpr[instr.ra()];
+            let c = ctx.fpr[instr.rc()];
+            fpscr::check_invalid_mul(ctx, a, c);
+            let result = a * c;
+            ctx.fpr[instr.rd()] = result;
+            fpscr::update_after_op(ctx, result, a.is_finite() && c.is_finite());
             if instr.rc_bit() { update_cr1_from_fpscr(ctx); }
             ctx.pc += 4;
         }
         PpcOpcode::fmulsx => {
-            ctx.fpr[instr.rd()] = to_single(ctx.fpr[instr.ra()] * ctx.fpr[instr.rc()]);
+            let a = ctx.fpr[instr.ra()];
+            let c = ctx.fpr[instr.rc()];
+            fpscr::check_invalid_mul(ctx, a, c);
+            let result = to_single(ctx, a * c);
+            ctx.fpr[instr.rd()] = result;
+            fpscr::update_after_op(ctx, result, a.is_finite() && c.is_finite());
             if instr.rc_bit() { update_cr1_from_fpscr(ctx); }
             ctx.pc += 4;
         }
         PpcOpcode::fdivx => {
-            ctx.fpr[instr.rd()] = ctx.fpr[instr.ra()] / ctx.fpr[instr.rb()];
+            let a = ctx.fpr[instr.ra()];
+            let b = ctx.fpr[instr.rb()];
+            fpscr::check_invalid_div(ctx, a, b);
+            fpscr::check_zero_divide(ctx, a, b);
+            let result = a / b;
+            ctx.fpr[instr.rd()] = result;
+            fpscr::update_after_op(ctx, result, a.is_finite() && b.is_finite() && b != 0.0);
             if instr.rc_bit() { update_cr1_from_fpscr(ctx); }
             ctx.pc += 4;
         }
         PpcOpcode::fdivsx => {
-            ctx.fpr[instr.rd()] = to_single(ctx.fpr[instr.ra()] / ctx.fpr[instr.rb()]);
+            let a = ctx.fpr[instr.ra()];
+            let b = ctx.fpr[instr.rb()];
+            fpscr::check_invalid_div(ctx, a, b);
+            fpscr::check_zero_divide(ctx, a, b);
+            let result = to_single(ctx, a / b);
+            ctx.fpr[instr.rd()] = result;
+            fpscr::update_after_op(ctx, result, a.is_finite() && b.is_finite() && b != 0.0);
             if instr.rc_bit() { update_cr1_from_fpscr(ctx); }
             ctx.pc += 4;
         }
 
         // ===== FPU: Multiply-Add =====
         PpcOpcode::fmaddx => {
-            // frD = (frA * frC) + frB
-            ctx.fpr[instr.rd()] = ctx.fpr[instr.ra()].mul_add(ctx.fpr[instr.rc()], ctx.fpr[instr.rb()]);
+            let a = ctx.fpr[instr.ra()];
+            let c = ctx.fpr[instr.rc()];
+            let b = ctx.fpr[instr.rb()];
+            fpscr::check_invalid_mul(ctx, a, c);
+            fpscr::check_invalid_add(ctx, a * c, b, false);
+            let result = a.mul_add(c, b);
+            ctx.fpr[instr.rd()] = result;
+            fpscr::update_after_op(ctx, result, a.is_finite() && b.is_finite() && c.is_finite());
             if instr.rc_bit() { update_cr1_from_fpscr(ctx); }
             ctx.pc += 4;
         }
         PpcOpcode::fmaddsx => {
-            ctx.fpr[instr.rd()] = to_single(ctx.fpr[instr.ra()].mul_add(ctx.fpr[instr.rc()], ctx.fpr[instr.rb()]));
+            let a = ctx.fpr[instr.ra()];
+            let c = ctx.fpr[instr.rc()];
+            let b = ctx.fpr[instr.rb()];
+            fpscr::check_invalid_mul(ctx, a, c);
+            let result = to_single(ctx, a.mul_add(c, b));
+            ctx.fpr[instr.rd()] = result;
+            fpscr::update_after_op(ctx, result, a.is_finite() && b.is_finite() && c.is_finite());
             if instr.rc_bit() { update_cr1_from_fpscr(ctx); }
             ctx.pc += 4;
         }
         PpcOpcode::fmsubx => {
-            // frD = (frA * frC) - frB
-            ctx.fpr[instr.rd()] = ctx.fpr[instr.ra()].mul_add(ctx.fpr[instr.rc()], -ctx.fpr[instr.rb()]);
+            let a = ctx.fpr[instr.ra()];
+            let c = ctx.fpr[instr.rc()];
+            let b = ctx.fpr[instr.rb()];
+            fpscr::check_invalid_mul(ctx, a, c);
+            let result = a.mul_add(c, -b);
+            ctx.fpr[instr.rd()] = result;
+            fpscr::update_after_op(ctx, result, a.is_finite() && b.is_finite() && c.is_finite());
             if instr.rc_bit() { update_cr1_from_fpscr(ctx); }
             ctx.pc += 4;
         }
         PpcOpcode::fmsubsx => {
-            ctx.fpr[instr.rd()] = to_single(ctx.fpr[instr.ra()].mul_add(ctx.fpr[instr.rc()], -ctx.fpr[instr.rb()]));
+            let a = ctx.fpr[instr.ra()];
+            let c = ctx.fpr[instr.rc()];
+            let b = ctx.fpr[instr.rb()];
+            fpscr::check_invalid_mul(ctx, a, c);
+            let result = to_single(ctx, a.mul_add(c, -b));
+            ctx.fpr[instr.rd()] = result;
+            fpscr::update_after_op(ctx, result, a.is_finite() && b.is_finite() && c.is_finite());
             if instr.rc_bit() { update_cr1_from_fpscr(ctx); }
             ctx.pc += 4;
         }
         PpcOpcode::fnmaddx => {
-            // frD = -((frA * frC) + frB)
-            ctx.fpr[instr.rd()] = -(ctx.fpr[instr.ra()].mul_add(ctx.fpr[instr.rc()], ctx.fpr[instr.rb()]));
+            let a = ctx.fpr[instr.ra()];
+            let c = ctx.fpr[instr.rc()];
+            let b = ctx.fpr[instr.rb()];
+            fpscr::check_invalid_mul(ctx, a, c);
+            let result = -(a.mul_add(c, b));
+            ctx.fpr[instr.rd()] = result;
+            fpscr::update_after_op(ctx, result, a.is_finite() && b.is_finite() && c.is_finite());
             if instr.rc_bit() { update_cr1_from_fpscr(ctx); }
             ctx.pc += 4;
         }
         PpcOpcode::fnmaddsx => {
-            ctx.fpr[instr.rd()] = to_single(-(ctx.fpr[instr.ra()].mul_add(ctx.fpr[instr.rc()], ctx.fpr[instr.rb()])));
+            let a = ctx.fpr[instr.ra()];
+            let c = ctx.fpr[instr.rc()];
+            let b = ctx.fpr[instr.rb()];
+            fpscr::check_invalid_mul(ctx, a, c);
+            let result = to_single(ctx, -(a.mul_add(c, b)));
+            ctx.fpr[instr.rd()] = result;
+            fpscr::update_after_op(ctx, result, a.is_finite() && b.is_finite() && c.is_finite());
             if instr.rc_bit() { update_cr1_from_fpscr(ctx); }
             ctx.pc += 4;
         }
         PpcOpcode::fnmsubx => {
-            // frD = -((frA * frC) - frB)
-            ctx.fpr[instr.rd()] = -(ctx.fpr[instr.ra()].mul_add(ctx.fpr[instr.rc()], -ctx.fpr[instr.rb()]));
+            let a = ctx.fpr[instr.ra()];
+            let c = ctx.fpr[instr.rc()];
+            let b = ctx.fpr[instr.rb()];
+            fpscr::check_invalid_mul(ctx, a, c);
+            let result = -(a.mul_add(c, -b));
+            ctx.fpr[instr.rd()] = result;
+            fpscr::update_after_op(ctx, result, a.is_finite() && b.is_finite() && c.is_finite());
             if instr.rc_bit() { update_cr1_from_fpscr(ctx); }
             ctx.pc += 4;
         }
         PpcOpcode::fnmsubsx => {
-            ctx.fpr[instr.rd()] = to_single(-(ctx.fpr[instr.ra()].mul_add(ctx.fpr[instr.rc()], -ctx.fpr[instr.rb()])));
+            let a = ctx.fpr[instr.ra()];
+            let c = ctx.fpr[instr.rc()];
+            let b = ctx.fpr[instr.rb()];
+            fpscr::check_invalid_mul(ctx, a, c);
+            let result = to_single(ctx, -(a.mul_add(c, -b)));
+            ctx.fpr[instr.rd()] = result;
+            fpscr::update_after_op(ctx, result, a.is_finite() && b.is_finite() && c.is_finite());
             if instr.rc_bit() { update_cr1_from_fpscr(ctx); }
             ctx.pc += 4;
         }
@@ -2011,39 +2450,87 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst
 
         // ===== FPU: Square root / Reciprocal =====
         PpcOpcode::fsqrtx => {
-            ctx.fpr[instr.rd()] = ctx.fpr[instr.rb()].sqrt();
+            let b = ctx.fpr[instr.rb()];
+            // sqrt of negative (non-zero) is invalid operation → VXSQRT.
+            if b.is_sign_negative() && b != 0.0 && !b.is_nan() {
+                fpscr::set_exception(ctx, fpscr::VXSQRT);
+            }
+            if fpscr::is_snan(b) {
+                fpscr::set_exception(ctx, fpscr::VXSNAN);
+            }
+            let result = b.sqrt();
+            ctx.fpr[instr.rd()] = result;
+            fpscr::update_after_op(ctx, result, b.is_finite());
             if instr.rc_bit() { update_cr1_from_fpscr(ctx); }
             ctx.pc += 4;
         }
         PpcOpcode::fsqrtsx => {
-            ctx.fpr[instr.rd()] = to_single(ctx.fpr[instr.rb()].sqrt());
+            let b = ctx.fpr[instr.rb()];
+            if b.is_sign_negative() && b != 0.0 && !b.is_nan() {
+                fpscr::set_exception(ctx, fpscr::VXSQRT);
+            }
+            if fpscr::is_snan(b) {
+                fpscr::set_exception(ctx, fpscr::VXSNAN);
+            }
+            let result = to_single(ctx, b.sqrt());
+            ctx.fpr[instr.rd()] = result;
+            fpscr::update_after_op(ctx, result, b.is_finite());
             if instr.rc_bit() { update_cr1_from_fpscr(ctx); }
             ctx.pc += 4;
         }
         PpcOpcode::fresx => {
             // Single-precision reciprocal estimate: frD = 1.0 / frB
-            ctx.fpr[instr.rd()] = to_single(1.0 / ctx.fpr[instr.rb()]);
+            let b = ctx.fpr[instr.rb()];
+            if b == 0.0 {
+                fpscr::set_exception(ctx, fpscr::ZX);
+            }
+            if fpscr::is_snan(b) {
+                fpscr::set_exception(ctx, fpscr::VXSNAN);
+            }
+            let result = to_single(ctx, 1.0 / b);
+            ctx.fpr[instr.rd()] = result;
+            fpscr::update_after_op(ctx, result, b.is_finite() && b != 0.0);
             if instr.rc_bit() { update_cr1_from_fpscr(ctx); }
             ctx.pc += 4;
         }
         PpcOpcode::frsqrtex => {
             // Reciprocal square root estimate: frD = 1.0 / sqrt(frB)
-            ctx.fpr[instr.rd()] = 1.0 / ctx.fpr[instr.rb()].sqrt();
+            let b = ctx.fpr[instr.rb()];
+            if b == 0.0 {
+                fpscr::set_exception(ctx, fpscr::ZX);
+            }
+            if b.is_sign_negative() && b != 0.0 && !b.is_nan() {
+                fpscr::set_exception(ctx, fpscr::VXSQRT);
+            }
+            if fpscr::is_snan(b) {
+                fpscr::set_exception(ctx, fpscr::VXSNAN);
+            }
+            let result = 1.0 / b.sqrt();
+            ctx.fpr[instr.rd()] = result;
+            fpscr::update_after_op(ctx, result, b.is_finite() && b > 0.0);
             if instr.rc_bit() { update_cr1_from_fpscr(ctx); }
             ctx.pc += 4;
         }
 
         // ===== FPU: Rounding/Conversion =====
         PpcOpcode::frspx => {
-            // Round to single precision
-            ctx.fpr[instr.rd()] = to_single(ctx.fpr[instr.rb()]);
+            // Round to single precision honouring FPSCR[RN]
+            let b = ctx.fpr[instr.rb()];
+            if fpscr::is_snan(b) {
+                fpscr::set_exception(ctx, fpscr::VXSNAN);
+            }
+            let result = to_single(ctx, b);
+            ctx.fpr[instr.rd()] = result;
+            fpscr::update_after_op(ctx, result, b.is_finite());
             if instr.rc_bit() { update_cr1_from_fpscr(ctx); }
             ctx.pc += 4;
         }
         PpcOpcode::fcfidx => {
-            // Convert from integer doubleword: frD = (double)(int64_t)frD_as_bits
+            // Convert from integer doubleword: frD = (double)(int64_t)frB_as_bits
             let bits = ctx.fpr[instr.rb()].to_bits();
-            ctx.fpr[instr.rd()] = bits as i64 as f64;
+            let result = (bits as i64) as f64;
+            ctx.fpr[instr.rd()] = result;
+            fpscr::set_fprf(ctx, fpscr::classify_fprf(result));
             if instr.rc_bit() { update_cr1_from_fpscr(ctx); }
             ctx.pc += 4;
         }
@@ -2051,10 +2538,16 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst
             // Convert to integer doubleword (round per FPSCR[RN])
             let val = ctx.fpr[instr.rb()];
             let result = if val.is_nan() {
+                fpscr::set_exception(ctx, fpscr::VXCVI | if fpscr::is_snan(val) { fpscr::VXSNAN } else { 0 });
+                0x8000_0000_0000_0000u64
+            } else if val >= (i64::MAX as f64) {
+                fpscr::set_exception(ctx, fpscr::VXCVI);
+                0x7FFF_FFFF_FFFF_FFFFu64
+            } else if val < (i64::MIN as f64) {
+                fpscr::set_exception(ctx, fpscr::VXCVI);
                 0x8000_0000_0000_0000u64
             } else {
-                let rounded = val.round();
-                (rounded as i64) as u64
+                fpscr::round_to_i64(ctx, val) as u64
             };
             ctx.fpr[instr.rd()] = f64::from_bits(result);
             if instr.rc_bit() { update_cr1_from_fpscr(ctx); }
@@ -2064,9 +2557,16 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst
             // Convert to integer doubleword (round toward zero)
             let val = ctx.fpr[instr.rb()];
             let result = if val.is_nan() {
+                fpscr::set_exception(ctx, fpscr::VXCVI | if fpscr::is_snan(val) { fpscr::VXSNAN } else { 0 });
+                0x8000_0000_0000_0000u64
+            } else if val >= (i64::MAX as f64) {
+                fpscr::set_exception(ctx, fpscr::VXCVI);
+                0x7FFF_FFFF_FFFF_FFFFu64
+            } else if val < (i64::MIN as f64) {
+                fpscr::set_exception(ctx, fpscr::VXCVI);
                 0x8000_0000_0000_0000u64
             } else {
-                (val as i64) as u64
+                (val.trunc() as i64) as u64
             };
             ctx.fpr[instr.rd()] = f64::from_bits(result);
             if instr.rc_bit() { update_cr1_from_fpscr(ctx); }
@@ -2075,27 +2575,38 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst
         PpcOpcode::fctiwx => {
             // Convert to integer word (round per FPSCR[RN])
             let val = ctx.fpr[instr.rb()];
-            let result = if val.is_nan() {
-                0x8000_0000u64
+            let result_u32: u32 = if val.is_nan() {
+                fpscr::set_exception(ctx, fpscr::VXCVI | if fpscr::is_snan(val) { fpscr::VXSNAN } else { 0 });
+                0x8000_0000
+            } else if val > (i32::MAX as f64) {
+                fpscr::set_exception(ctx, fpscr::VXCVI);
+                0x7FFF_FFFF
+            } else if val < (i32::MIN as f64) {
+                fpscr::set_exception(ctx, fpscr::VXCVI);
+                0x8000_0000
             } else {
-                let rounded = val.round();
-                let clamped = rounded.clamp(i32::MIN as f64, i32::MAX as f64);
-                (clamped as i32 as u32) as u64
+                fpscr::round_to_i32(ctx, val) as u32
             };
-            ctx.fpr[instr.rd()] = f64::from_bits(result);
+            ctx.fpr[instr.rd()] = f64::from_bits(result_u32 as u64);
             if instr.rc_bit() { update_cr1_from_fpscr(ctx); }
             ctx.pc += 4;
         }
         PpcOpcode::fctiwzx => {
-            // Convert to integer word (round toward zero) -- most common
+            // Convert to integer word (round toward zero)
             let val = ctx.fpr[instr.rb()];
-            let result = if val.is_nan() {
-                0x8000_0000u64
+            let result_u32: u32 = if val.is_nan() {
+                fpscr::set_exception(ctx, fpscr::VXCVI | if fpscr::is_snan(val) { fpscr::VXSNAN } else { 0 });
+                0x8000_0000
+            } else if val > (i32::MAX as f64) {
+                fpscr::set_exception(ctx, fpscr::VXCVI);
+                0x7FFF_FFFF
+            } else if val < (i32::MIN as f64) {
+                fpscr::set_exception(ctx, fpscr::VXCVI);
+                0x8000_0000
             } else {
-                let clamped = val.clamp(i32::MIN as f64, i32::MAX as f64);
-                (clamped as i32 as u32) as u64
+                val.trunc() as i32 as u32
             };
-            ctx.fpr[instr.rd()] = f64::from_bits(result);
+            ctx.fpr[instr.rd()] = f64::from_bits(result_u32 as u64);
             if instr.rc_bit() { update_cr1_from_fpscr(ctx); }
             ctx.pc += 4;
         }
@@ -2106,54 +2617,60 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst
             let frb = ctx.fpr[instr.rb()];
             let crfd = instr.crfd();
             if fra.is_nan() || frb.is_nan() {
-                ctx.cr[crfd].lt = false;
-                ctx.cr[crfd].gt = false;
-                ctx.cr[crfd].eq = false;
-                ctx.cr[crfd].so = true;
+                ctx.cr[crfd] = crate::context::CrField { lt: false, gt: false, eq: false, so: true };
+                // fcmpu: VXSNAN on SNaN input; no VXVC even on QNaN.
+                if fpscr::is_snan(fra) || fpscr::is_snan(frb) {
+                    fpscr::set_exception(ctx, fpscr::VXSNAN);
+                }
             } else if fra < frb {
-                ctx.cr[crfd].lt = true;
-                ctx.cr[crfd].gt = false;
-                ctx.cr[crfd].eq = false;
-                ctx.cr[crfd].so = false;
+                ctx.cr[crfd] = crate::context::CrField { lt: true, gt: false, eq: false, so: false };
             } else if fra > frb {
-                ctx.cr[crfd].lt = false;
-                ctx.cr[crfd].gt = true;
-                ctx.cr[crfd].eq = false;
-                ctx.cr[crfd].so = false;
+                ctx.cr[crfd] = crate::context::CrField { lt: false, gt: true, eq: false, so: false };
             } else {
-                ctx.cr[crfd].lt = false;
-                ctx.cr[crfd].gt = false;
-                ctx.cr[crfd].eq = true;
-                ctx.cr[crfd].so = false;
+                ctx.cr[crfd] = crate::context::CrField { lt: false, gt: false, eq: true, so: false };
             }
+            // Also mirror the comparison result into FPSCR[FPRF (FL/FG/FE/FU)].
+            let fprf = if fra.is_nan() || frb.is_nan() {
+                0b0_0001
+            } else if fra < frb {
+                0b0_1000
+            } else if fra > frb {
+                0b0_0100
+            } else {
+                0b0_0010
+            };
+            fpscr::set_fprf(ctx, fprf);
             ctx.pc += 4;
         }
         PpcOpcode::fcmpo => {
-            // Same as fcmpu but sets FPSCR exception bits for QNaN (not modeled yet)
+            // Ordered compare: like fcmpu but also sets VXVC on QNaN (or VXSNAN on SNaN).
             let fra = ctx.fpr[instr.ra()];
             let frb = ctx.fpr[instr.rb()];
             let crfd = instr.crfd();
             if fra.is_nan() || frb.is_nan() {
-                ctx.cr[crfd].lt = false;
-                ctx.cr[crfd].gt = false;
-                ctx.cr[crfd].eq = false;
-                ctx.cr[crfd].so = true;
+                ctx.cr[crfd] = crate::context::CrField { lt: false, gt: false, eq: false, so: true };
+                if fpscr::is_snan(fra) || fpscr::is_snan(frb) {
+                    fpscr::set_exception(ctx, fpscr::VXSNAN | fpscr::VXVC);
+                } else {
+                    fpscr::set_exception(ctx, fpscr::VXVC);
+                }
             } else if fra < frb {
-                ctx.cr[crfd].lt = true;
-                ctx.cr[crfd].gt = false;
-                ctx.cr[crfd].eq = false;
-                ctx.cr[crfd].so = false;
+                ctx.cr[crfd] = crate::context::CrField { lt: true, gt: false, eq: false, so: false };
             } else if fra > frb {
-                ctx.cr[crfd].lt = false;
-                ctx.cr[crfd].gt = true;
-                ctx.cr[crfd].eq = false;
-                ctx.cr[crfd].so = false;
+                ctx.cr[crfd] = crate::context::CrField { lt: false, gt: true, eq: false, so: false };
             } else {
-                ctx.cr[crfd].lt = false;
-                ctx.cr[crfd].gt = false;
-                ctx.cr[crfd].eq = true;
-                ctx.cr[crfd].so = false;
+                ctx.cr[crfd] = crate::context::CrField { lt: false, gt: false, eq: true, so: false };
             }
+            let fprf = if fra.is_nan() || frb.is_nan() {
+                0b0_0001
+            } else if fra < frb {
+                0b0_1000
+            } else if fra > frb {
+                0b0_0100
+            } else {
+                0b0_0010
+            };
+            fpscr::set_fprf(ctx, fprf);
             ctx.pc += 4;
         }
 
@@ -2166,7 +2683,7 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst
         }
         PpcOpcode::mtfsfx => {
             // Move to FPSCR fields: fm mask in bits 7-14, frB value
-            let fm = ((instr.raw >> 17) & 0xFF) as u32;
+            let fm = (instr.raw >> 17) & 0xFF;
             let val = ctx.fpr[instr.rb()].to_bits() as u32;
             let mut mask = 0u32;
             for i in 0..8 {
@@ -2181,27 +2698,1559 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst
         PpcOpcode::mtfsb0x => {
             // Clear FPSCR bit crbd
             let bit = instr.crbd();
-            ctx.fpscr &= !(1 << (31 - bit as u32));
+            ctx.fpscr &= !(1 << (31 - bit));
             if instr.rc_bit() { update_cr1_from_fpscr(ctx); }
             ctx.pc += 4;
         }
         PpcOpcode::mtfsb1x => {
             // Set FPSCR bit crbd
             let bit = instr.crbd();
-            ctx.fpscr |= 1 << (31 - bit as u32);
+            ctx.fpscr |= 1 << (31 - bit);
             if instr.rc_bit() { update_cr1_from_fpscr(ctx); }
             ctx.pc += 4;
         }
         PpcOpcode::mtfsfix => {
             // Move to FPSCR field immediate: crfD = IMM (4 bits)
             let crfd = instr.crfd();
-            let imm = ((instr.raw >> 12) & 0xF) as u32;
+            let imm = (instr.raw >> 12) & 0xF;
             let shift = 28 - crfd as u32 * 4;
             ctx.fpscr = (ctx.fpscr & !(0xF << shift)) | (imm << shift);
             if instr.rc_bit() { update_cr1_from_fpscr(ctx); }
             ctx.pc += 4;
         }
 
+        // ═════════════════════════════════════════════════════════════════
+        // §4b — Unaligned vector load/store
+        // ═════════════════════════════════════════════════════════════════
+        // lvlx / lvlx128 / lvlxl / lvlxl128: load left-aligned from EA.
+        PpcOpcode::lvlx | PpcOpcode::lvlxl => {
+            let ea = ea_indexed(ctx, instr);
+            ctx.vr[instr.rd()] = crate::vmx::load_vector_left(mem, ea);
+            ctx.pc += 4;
+        }
+        PpcOpcode::lvlx128 | PpcOpcode::lvlxl128 => {
+            let ea = ea_indexed(ctx, instr);
+            ctx.vr[instr.vd128()] = crate::vmx::load_vector_left(mem, ea);
+            ctx.pc += 4;
+        }
+        PpcOpcode::lvrx | PpcOpcode::lvrxl => {
+            let ea = ea_indexed(ctx, instr);
+            ctx.vr[instr.rd()] = crate::vmx::load_vector_right(mem, ea);
+            ctx.pc += 4;
+        }
+        PpcOpcode::lvrx128 | PpcOpcode::lvrxl128 => {
+            let ea = ea_indexed(ctx, instr);
+            ctx.vr[instr.vd128()] = crate::vmx::load_vector_right(mem, ea);
+            ctx.pc += 4;
+        }
+        PpcOpcode::stvlx | PpcOpcode::stvlxl => {
+            let ea = ea_indexed(ctx, instr);
+            crate::vmx::store_vector_left(mem, ea, ctx.vr[instr.rs()]);
+            ctx.pc += 4;
+        }
+        PpcOpcode::stvlx128 | PpcOpcode::stvlxl128 => {
+            let ea = ea_indexed(ctx, instr);
+            crate::vmx::store_vector_left(mem, ea, ctx.vr[instr.vs128()]);
+            ctx.pc += 4;
+        }
+        PpcOpcode::stvrx | PpcOpcode::stvrxl => {
+            let ea = ea_indexed(ctx, instr);
+            crate::vmx::store_vector_right(mem, ea, ctx.vr[instr.rs()]);
+            ctx.pc += 4;
+        }
+        PpcOpcode::stvrx128 | PpcOpcode::stvrxl128 => {
+            let ea = ea_indexed(ctx, instr);
+            crate::vmx::store_vector_right(mem, ea, ctx.vr[instr.vs128()]);
+            ctx.pc += 4;
+        }
+        // lvewx128 / stvewx128: VMX128 element-indexed 32-bit load/store.
+        // Like lvewx the whole 16 bytes at the aligned EA go into VD; the
+        // element-of-interest is implied by EA's low bits.
+        PpcOpcode::lvewx128 => {
+            let ea = ea_indexed(ctx, instr) & !0xF;
+            let mut bytes = [0u8; 16];
+            for i in 0..16 { bytes[i] = mem.read_u8(ea + i as u32); }
+            ctx.vr[instr.vd128()] = xenia_types::Vec128::from_bytes(bytes);
+            ctx.pc += 4;
+        }
+        PpcOpcode::stvewx128 => {
+            let ea = ea_indexed(ctx, instr) & !0xF;
+            let bytes = ctx.vr[instr.vs128()].as_bytes();
+            for i in 0..16 { mem.write_u8(ea + i as u32, bytes[i]); }
+            ctx.pc += 4;
+        }
+
+        // ═════════════════════════════════════════════════════════════════
+        // §4a — VMX integer add/sub (modulo and saturating), mul, avg, sum
+        // ═════════════════════════════════════════════════════════════════
+        // -------- modulo add/sub (byte/halfword/word) --------
+        PpcOpcode::vaddubm => {
+            let a = ctx.vr[instr.ra()].as_bytes();
+            let b = ctx.vr[instr.rb()].as_bytes();
+            let mut r = [0u8; 16];
+            for i in 0..16 { r[i] = a[i].wrapping_add(b[i]); }
+            ctx.vr[instr.rd()] = xenia_types::Vec128::from_bytes(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vsububm => {
+            let a = ctx.vr[instr.ra()].as_bytes();
+            let b = ctx.vr[instr.rb()].as_bytes();
+            let mut r = [0u8; 16];
+            for i in 0..16 { r[i] = a[i].wrapping_sub(b[i]); }
+            ctx.vr[instr.rd()] = xenia_types::Vec128::from_bytes(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vadduhm => {
+            let a = ctx.vr[instr.ra()].as_u16x8();
+            let b = ctx.vr[instr.rb()].as_u16x8();
+            let mut r = [0u16; 8];
+            for i in 0..8 { r[i] = a[i].wrapping_add(b[i]); }
+            ctx.vr[instr.rd()] = xenia_types::Vec128::from_u16x8_array(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vsubuhm => {
+            let a = ctx.vr[instr.ra()].as_u16x8();
+            let b = ctx.vr[instr.rb()].as_u16x8();
+            let mut r = [0u16; 8];
+            for i in 0..8 { r[i] = a[i].wrapping_sub(b[i]); }
+            ctx.vr[instr.rd()] = xenia_types::Vec128::from_u16x8_array(r);
+            ctx.pc += 4;
+        }
+        // vadduwm / vsubuwm are implemented above (modulo word add/sub arms).
+
+        // -------- saturating add/sub (signed + unsigned) --------
+        PpcOpcode::vaddubs => {
+            let a = ctx.vr[instr.ra()].as_bytes();
+            let b = ctx.vr[instr.rb()].as_bytes();
+            let mut r = [0u8; 16];
+            let mut sat = false;
+            for i in 0..16 {
+                let (v, s) = crate::vmx::sat_add_u8(a[i], b[i]);
+                r[i] = v; sat |= s;
+            }
+            if sat { ctx.set_vscr_sat(true); }
+            ctx.vr[instr.rd()] = xenia_types::Vec128::from_bytes(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vsububs => {
+            let a = ctx.vr[instr.ra()].as_bytes();
+            let b = ctx.vr[instr.rb()].as_bytes();
+            let mut r = [0u8; 16]; let mut sat = false;
+            for i in 0..16 {
+                let (v, s) = crate::vmx::sat_sub_u8(a[i], b[i]);
+                r[i] = v; sat |= s;
+            }
+            if sat { ctx.set_vscr_sat(true); }
+            ctx.vr[instr.rd()] = xenia_types::Vec128::from_bytes(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vaddsbs => {
+            let a = crate::vmx::as_i8x16(ctx.vr[instr.ra()]);
+            let b = crate::vmx::as_i8x16(ctx.vr[instr.rb()]);
+            let mut r = [0i8; 16]; let mut sat = false;
+            for i in 0..16 {
+                let (v, s) = crate::vmx::sat_add_i8(a[i], b[i]);
+                r[i] = v; sat |= s;
+            }
+            if sat { ctx.set_vscr_sat(true); }
+            ctx.vr[instr.rd()] = crate::vmx::from_i8x16(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vsubsbs => {
+            let a = crate::vmx::as_i8x16(ctx.vr[instr.ra()]);
+            let b = crate::vmx::as_i8x16(ctx.vr[instr.rb()]);
+            let mut r = [0i8; 16]; let mut sat = false;
+            for i in 0..16 {
+                let (v, s) = crate::vmx::sat_sub_i8(a[i], b[i]);
+                r[i] = v; sat |= s;
+            }
+            if sat { ctx.set_vscr_sat(true); }
+            ctx.vr[instr.rd()] = crate::vmx::from_i8x16(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vadduhs => {
+            let a = ctx.vr[instr.ra()].as_u16x8();
+            let b = ctx.vr[instr.rb()].as_u16x8();
+            let mut r = [0u16; 8]; let mut sat = false;
+            for i in 0..8 {
+                let (v, s) = crate::vmx::sat_add_u16(a[i], b[i]);
+                r[i] = v; sat |= s;
+            }
+            if sat { ctx.set_vscr_sat(true); }
+            ctx.vr[instr.rd()] = xenia_types::Vec128::from_u16x8_array(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vsubuhs => {
+            let a = ctx.vr[instr.ra()].as_u16x8();
+            let b = ctx.vr[instr.rb()].as_u16x8();
+            let mut r = [0u16; 8]; let mut sat = false;
+            for i in 0..8 {
+                let (v, s) = crate::vmx::sat_sub_u16(a[i], b[i]);
+                r[i] = v; sat |= s;
+            }
+            if sat { ctx.set_vscr_sat(true); }
+            ctx.vr[instr.rd()] = xenia_types::Vec128::from_u16x8_array(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vaddshs => {
+            let a = crate::vmx::as_i16x8(ctx.vr[instr.ra()]);
+            let b = crate::vmx::as_i16x8(ctx.vr[instr.rb()]);
+            let mut r = [0i16; 8]; let mut sat = false;
+            for i in 0..8 {
+                let (v, s) = crate::vmx::sat_add_i16(a[i], b[i]);
+                r[i] = v; sat |= s;
+            }
+            if sat { ctx.set_vscr_sat(true); }
+            ctx.vr[instr.rd()] = crate::vmx::from_i16x8(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vsubshs => {
+            let a = crate::vmx::as_i16x8(ctx.vr[instr.ra()]);
+            let b = crate::vmx::as_i16x8(ctx.vr[instr.rb()]);
+            let mut r = [0i16; 8]; let mut sat = false;
+            for i in 0..8 {
+                let (v, s) = crate::vmx::sat_sub_i16(a[i], b[i]);
+                r[i] = v; sat |= s;
+            }
+            if sat { ctx.set_vscr_sat(true); }
+            ctx.vr[instr.rd()] = crate::vmx::from_i16x8(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vadduws => {
+            let a = ctx.vr[instr.ra()].as_u32x4();
+            let b = ctx.vr[instr.rb()].as_u32x4();
+            let mut r = [0u32; 4]; let mut sat = false;
+            for i in 0..4 {
+                let (v, s) = crate::vmx::sat_add_u32(a[i], b[i]);
+                r[i] = v; sat |= s;
+            }
+            if sat { ctx.set_vscr_sat(true); }
+            ctx.vr[instr.rd()] = xenia_types::Vec128::from_u32x4_array(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vsubuws => {
+            let a = ctx.vr[instr.ra()].as_u32x4();
+            let b = ctx.vr[instr.rb()].as_u32x4();
+            let mut r = [0u32; 4]; let mut sat = false;
+            for i in 0..4 {
+                let (v, s) = crate::vmx::sat_sub_u32(a[i], b[i]);
+                r[i] = v; sat |= s;
+            }
+            if sat { ctx.set_vscr_sat(true); }
+            ctx.vr[instr.rd()] = xenia_types::Vec128::from_u32x4_array(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vaddsws => {
+            let a = crate::vmx::as_i32x4(ctx.vr[instr.ra()]);
+            let b = crate::vmx::as_i32x4(ctx.vr[instr.rb()]);
+            let mut r = [0i32; 4]; let mut sat = false;
+            for i in 0..4 {
+                let (v, s) = crate::vmx::sat_add_i32(a[i], b[i]);
+                r[i] = v; sat |= s;
+            }
+            if sat { ctx.set_vscr_sat(true); }
+            ctx.vr[instr.rd()] = crate::vmx::from_i32x4(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vsubsws => {
+            let a = crate::vmx::as_i32x4(ctx.vr[instr.ra()]);
+            let b = crate::vmx::as_i32x4(ctx.vr[instr.rb()]);
+            let mut r = [0i32; 4]; let mut sat = false;
+            for i in 0..4 {
+                let (v, s) = crate::vmx::sat_sub_i32(a[i], b[i]);
+                r[i] = v; sat |= s;
+            }
+            if sat { ctx.set_vscr_sat(true); }
+            ctx.vr[instr.rd()] = crate::vmx::from_i32x4(r);
+            ctx.pc += 4;
+        }
+
+        // -------- vaddcuw / vsubcuw: per-lane carry / borrow out --------
+        PpcOpcode::vaddcuw => {
+            let a = ctx.vr[instr.ra()].as_u32x4();
+            let b = ctx.vr[instr.rb()].as_u32x4();
+            let mut r = [0u32; 4];
+            for i in 0..4 {
+                let (_, c) = a[i].overflowing_add(b[i]);
+                r[i] = if c { 1 } else { 0 };
+            }
+            ctx.vr[instr.rd()] = xenia_types::Vec128::from_u32x4_array(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vsubcuw => {
+            // "Subtract Carryout": r = 1 if a >= b (no borrow), 0 otherwise.
+            let a = ctx.vr[instr.ra()].as_u32x4();
+            let b = ctx.vr[instr.rb()].as_u32x4();
+            let mut r = [0u32; 4];
+            for i in 0..4 { r[i] = if a[i] >= b[i] { 1 } else { 0 }; }
+            ctx.vr[instr.rd()] = xenia_types::Vec128::from_u32x4_array(r);
+            ctx.pc += 4;
+        }
+
+        // -------- averages --------
+        PpcOpcode::vavgub => {
+            let a = ctx.vr[instr.ra()].as_bytes();
+            let b = ctx.vr[instr.rb()].as_bytes();
+            let mut r = [0u8; 16];
+            for i in 0..16 { r[i] = crate::vmx::avg_u8(a[i], b[i]); }
+            ctx.vr[instr.rd()] = xenia_types::Vec128::from_bytes(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vavgsb => {
+            let a = crate::vmx::as_i8x16(ctx.vr[instr.ra()]);
+            let b = crate::vmx::as_i8x16(ctx.vr[instr.rb()]);
+            let mut r = [0i8; 16];
+            for i in 0..16 { r[i] = crate::vmx::avg_i8(a[i], b[i]); }
+            ctx.vr[instr.rd()] = crate::vmx::from_i8x16(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vavguh => {
+            let a = ctx.vr[instr.ra()].as_u16x8();
+            let b = ctx.vr[instr.rb()].as_u16x8();
+            let mut r = [0u16; 8];
+            for i in 0..8 { r[i] = crate::vmx::avg_u16(a[i], b[i]); }
+            ctx.vr[instr.rd()] = xenia_types::Vec128::from_u16x8_array(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vavgsh => {
+            let a = crate::vmx::as_i16x8(ctx.vr[instr.ra()]);
+            let b = crate::vmx::as_i16x8(ctx.vr[instr.rb()]);
+            let mut r = [0i16; 8];
+            for i in 0..8 { r[i] = crate::vmx::avg_i16(a[i], b[i]); }
+            ctx.vr[instr.rd()] = crate::vmx::from_i16x8(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vavguw => {
+            let a = ctx.vr[instr.ra()].as_u32x4();
+            let b = ctx.vr[instr.rb()].as_u32x4();
+            let mut r = [0u32; 4];
+            for i in 0..4 { r[i] = crate::vmx::avg_u32(a[i], b[i]); }
+            ctx.vr[instr.rd()] = xenia_types::Vec128::from_u32x4_array(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vavgsw => {
+            let a = crate::vmx::as_i32x4(ctx.vr[instr.ra()]);
+            let b = crate::vmx::as_i32x4(ctx.vr[instr.rb()]);
+            let mut r = [0i32; 4];
+            for i in 0..4 { r[i] = crate::vmx::avg_i32(a[i], b[i]); }
+            ctx.vr[instr.rd()] = crate::vmx::from_i32x4(r);
+            ctx.pc += 4;
+        }
+
+        // -------- multiplies (even / odd lanes — see §5 hazard note) --------
+        // vmuleub: even u8 lanes (BE index 0,2,4,...,14) → u16 lanes.
+        PpcOpcode::vmuleub => {
+            let a = ctx.vr[instr.ra()].as_bytes();
+            let b = ctx.vr[instr.rb()].as_bytes();
+            let mut r = [0u16; 8];
+            for i in 0..8 { r[i] = a[2 * i] as u16 * b[2 * i] as u16; }
+            ctx.vr[instr.rd()] = xenia_types::Vec128::from_u16x8_array(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vmuloub => {
+            let a = ctx.vr[instr.ra()].as_bytes();
+            let b = ctx.vr[instr.rb()].as_bytes();
+            let mut r = [0u16; 8];
+            for i in 0..8 { r[i] = a[2 * i + 1] as u16 * b[2 * i + 1] as u16; }
+            ctx.vr[instr.rd()] = xenia_types::Vec128::from_u16x8_array(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vmulesb => {
+            let a = crate::vmx::as_i8x16(ctx.vr[instr.ra()]);
+            let b = crate::vmx::as_i8x16(ctx.vr[instr.rb()]);
+            let mut r = [0i16; 8];
+            for i in 0..8 { r[i] = a[2 * i] as i16 * b[2 * i] as i16; }
+            ctx.vr[instr.rd()] = crate::vmx::from_i16x8(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vmulosb => {
+            let a = crate::vmx::as_i8x16(ctx.vr[instr.ra()]);
+            let b = crate::vmx::as_i8x16(ctx.vr[instr.rb()]);
+            let mut r = [0i16; 8];
+            for i in 0..8 { r[i] = a[2 * i + 1] as i16 * b[2 * i + 1] as i16; }
+            ctx.vr[instr.rd()] = crate::vmx::from_i16x8(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vmuleuh => {
+            let a = ctx.vr[instr.ra()].as_u16x8();
+            let b = ctx.vr[instr.rb()].as_u16x8();
+            let mut r = [0u32; 4];
+            for i in 0..4 { r[i] = a[2 * i] as u32 * b[2 * i] as u32; }
+            ctx.vr[instr.rd()] = xenia_types::Vec128::from_u32x4_array(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vmulouh => {
+            let a = ctx.vr[instr.ra()].as_u16x8();
+            let b = ctx.vr[instr.rb()].as_u16x8();
+            let mut r = [0u32; 4];
+            for i in 0..4 { r[i] = a[2 * i + 1] as u32 * b[2 * i + 1] as u32; }
+            ctx.vr[instr.rd()] = xenia_types::Vec128::from_u32x4_array(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vmulesh => {
+            let a = crate::vmx::as_i16x8(ctx.vr[instr.ra()]);
+            let b = crate::vmx::as_i16x8(ctx.vr[instr.rb()]);
+            let mut r = [0i32; 4];
+            for i in 0..4 { r[i] = a[2 * i] as i32 * b[2 * i] as i32; }
+            ctx.vr[instr.rd()] = crate::vmx::from_i32x4(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vmulosh => {
+            let a = crate::vmx::as_i16x8(ctx.vr[instr.ra()]);
+            let b = crate::vmx::as_i16x8(ctx.vr[instr.rb()]);
+            let mut r = [0i32; 4];
+            for i in 0..4 { r[i] = a[2 * i + 1] as i32 * b[2 * i + 1] as i32; }
+            ctx.vr[instr.rd()] = crate::vmx::from_i32x4(r);
+            ctx.pc += 4;
+        }
+
+        // -------- multiply-add halfword (saturating) --------
+        PpcOpcode::vmhaddshs => {
+            // vD[i] = sat_i16((vA[i] * vB[i]) >> 15 + vC[i])
+            let a = crate::vmx::as_i16x8(ctx.vr[instr.ra()]);
+            let b = crate::vmx::as_i16x8(ctx.vr[instr.rb()]);
+            let c = crate::vmx::as_i16x8(ctx.vr[instr.rc()]);
+            let mut r = [0i16; 8]; let mut sat = false;
+            for i in 0..8 {
+                let prod = (a[i] as i32 * b[i] as i32) >> 15;
+                let (v, s) = crate::vmx::sat_i32_to_i16(prod + c[i] as i32);
+                r[i] = v; sat |= s;
+            }
+            if sat { ctx.set_vscr_sat(true); }
+            ctx.vr[instr.rd()] = crate::vmx::from_i16x8(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vmhraddshs => {
+            // Rounded multiply-add: (vA[i]*vB[i] + 0x4000) >> 15 + vC[i], saturating.
+            let a = crate::vmx::as_i16x8(ctx.vr[instr.ra()]);
+            let b = crate::vmx::as_i16x8(ctx.vr[instr.rb()]);
+            let c = crate::vmx::as_i16x8(ctx.vr[instr.rc()]);
+            let mut r = [0i16; 8]; let mut sat = false;
+            for i in 0..8 {
+                let prod = (a[i] as i32 * b[i] as i32 + 0x4000) >> 15;
+                let (v, s) = crate::vmx::sat_i32_to_i16(prod + c[i] as i32);
+                r[i] = v; sat |= s;
+            }
+            if sat { ctx.set_vscr_sat(true); }
+            ctx.vr[instr.rd()] = crate::vmx::from_i16x8(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vmladduhm => {
+            // Multiply-low add (modulo): vD[i] = u16(vA[i] * vB[i] + vC[i]).
+            let a = ctx.vr[instr.ra()].as_u16x8();
+            let b = ctx.vr[instr.rb()].as_u16x8();
+            let c = ctx.vr[instr.rc()].as_u16x8();
+            let mut r = [0u16; 8];
+            for i in 0..8 {
+                r[i] = a[i].wrapping_mul(b[i]).wrapping_add(c[i]);
+            }
+            ctx.vr[instr.rd()] = xenia_types::Vec128::from_u16x8_array(r);
+            ctx.pc += 4;
+        }
+
+        // -------- VMX sum-of-products --------
+        // vmsumubm: vD[i:u32] = sum over j in [0..4] of vA[4i+j:u8] * vB[4i+j:u8] + vC[i].
+        PpcOpcode::vmsumubm => {
+            let a = ctx.vr[instr.ra()].as_bytes();
+            let b = ctx.vr[instr.rb()].as_bytes();
+            let c = ctx.vr[instr.rc()].as_u32x4();
+            let mut r = [0u32; 4];
+            for i in 0..4 {
+                let mut s = c[i];
+                for j in 0..4 {
+                    s = s.wrapping_add(a[4*i+j] as u32 * b[4*i+j] as u32);
+                }
+                r[i] = s;
+            }
+            ctx.vr[instr.rd()] = xenia_types::Vec128::from_u32x4_array(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vmsummbm => {
+            // signed bytes × unsigned bytes, signed accumulator
+            let a = crate::vmx::as_i8x16(ctx.vr[instr.ra()]);
+            let b = ctx.vr[instr.rb()].as_bytes();
+            let c = crate::vmx::as_i32x4(ctx.vr[instr.rc()]);
+            let mut r = [0i32; 4];
+            for i in 0..4 {
+                let mut s = c[i];
+                for j in 0..4 {
+                    s = s.wrapping_add(a[4*i+j] as i32 * b[4*i+j] as i32);
+                }
+                r[i] = s;
+            }
+            ctx.vr[instr.rd()] = crate::vmx::from_i32x4(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vmsumuhm => {
+            let a = ctx.vr[instr.ra()].as_u16x8();
+            let b = ctx.vr[instr.rb()].as_u16x8();
+            let c = ctx.vr[instr.rc()].as_u32x4();
+            let mut r = [0u32; 4];
+            for i in 0..4 {
+                let s = (a[2*i] as u32 * b[2*i] as u32)
+                    .wrapping_add(a[2*i+1] as u32 * b[2*i+1] as u32)
+                    .wrapping_add(c[i]);
+                r[i] = s;
+            }
+            ctx.vr[instr.rd()] = xenia_types::Vec128::from_u32x4_array(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vmsumuhs => {
+            let a = ctx.vr[instr.ra()].as_u16x8();
+            let b = ctx.vr[instr.rb()].as_u16x8();
+            let c = ctx.vr[instr.rc()].as_u32x4();
+            let mut r = [0u32; 4]; let mut sat = false;
+            for i in 0..4 {
+                let s = (a[2*i] as u64 * b[2*i] as u64)
+                    + (a[2*i+1] as u64 * b[2*i+1] as u64)
+                    + c[i] as u64;
+                let (v, overflow) = if s > u32::MAX as u64 { (u32::MAX, true) } else { (s as u32, false) };
+                r[i] = v; sat |= overflow;
+            }
+            if sat { ctx.set_vscr_sat(true); }
+            ctx.vr[instr.rd()] = xenia_types::Vec128::from_u32x4_array(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vmsumshm => {
+            let a = crate::vmx::as_i16x8(ctx.vr[instr.ra()]);
+            let b = crate::vmx::as_i16x8(ctx.vr[instr.rb()]);
+            let c = crate::vmx::as_i32x4(ctx.vr[instr.rc()]);
+            let mut r = [0i32; 4];
+            for i in 0..4 {
+                let s = (a[2*i] as i32 * b[2*i] as i32)
+                    .wrapping_add(a[2*i+1] as i32 * b[2*i+1] as i32)
+                    .wrapping_add(c[i]);
+                r[i] = s;
+            }
+            ctx.vr[instr.rd()] = crate::vmx::from_i32x4(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vmsumshs => {
+            let a = crate::vmx::as_i16x8(ctx.vr[instr.ra()]);
+            let b = crate::vmx::as_i16x8(ctx.vr[instr.rb()]);
+            let c = crate::vmx::as_i32x4(ctx.vr[instr.rc()]);
+            let mut r = [0i32; 4]; let mut sat = false;
+            for i in 0..4 {
+                // Running-sum saturation: accumulate in i64, clamp once at end.
+                let s = (a[2*i] as i64 * b[2*i] as i64)
+                    + (a[2*i+1] as i64 * b[2*i+1] as i64)
+                    + c[i] as i64;
+                let (v, o) = crate::vmx::sat_i64_to_i32(s);
+                r[i] = v; sat |= o;
+            }
+            if sat { ctx.set_vscr_sat(true); }
+            ctx.vr[instr.rd()] = crate::vmx::from_i32x4(r);
+            ctx.pc += 4;
+        }
+
+        // -------- VMX sum-across --------
+        PpcOpcode::vsumsws => {
+            // vD[3] = sat_i32(vC[3] + sum over i in 0..4 of vA[i])
+            let a = crate::vmx::as_i32x4(ctx.vr[instr.ra()]);
+            let c = crate::vmx::as_i32x4(ctx.vr[instr.rb()]);
+            let s = a.iter().map(|&x| x as i64).sum::<i64>() + c[3] as i64;
+            let (v, sat) = crate::vmx::sat_i64_to_i32(s);
+            if sat { ctx.set_vscr_sat(true); }
+            ctx.vr[instr.rd()] = crate::vmx::from_i32x4([0, 0, 0, v]);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vsum2sws => {
+            // Two 2-word partial sums at lanes 1 and 3.
+            let a = crate::vmx::as_i32x4(ctx.vr[instr.ra()]);
+            let c = crate::vmx::as_i32x4(ctx.vr[instr.rb()]);
+            let s0 = a[0] as i64 + a[1] as i64 + c[1] as i64;
+            let s1 = a[2] as i64 + a[3] as i64 + c[3] as i64;
+            let (v0, sat0) = crate::vmx::sat_i64_to_i32(s0);
+            let (v1, sat1) = crate::vmx::sat_i64_to_i32(s1);
+            if sat0 | sat1 { ctx.set_vscr_sat(true); }
+            ctx.vr[instr.rd()] = crate::vmx::from_i32x4([0, v0, 0, v1]);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vsum4sbs => {
+            let a = crate::vmx::as_i8x16(ctx.vr[instr.ra()]);
+            let c = crate::vmx::as_i32x4(ctx.vr[instr.rb()]);
+            let mut r = [0i32; 4]; let mut sat = false;
+            for i in 0..4 {
+                let s = a[4*i] as i64 + a[4*i+1] as i64 + a[4*i+2] as i64 + a[4*i+3] as i64 + c[i] as i64;
+                let (v, o) = crate::vmx::sat_i64_to_i32(s);
+                r[i] = v; sat |= o;
+            }
+            if sat { ctx.set_vscr_sat(true); }
+            ctx.vr[instr.rd()] = crate::vmx::from_i32x4(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vsum4ubs => {
+            let a = ctx.vr[instr.ra()].as_bytes();
+            let c = ctx.vr[instr.rb()].as_u32x4();
+            let mut r = [0u32; 4]; let mut sat = false;
+            for i in 0..4 {
+                let s = a[4*i] as u64 + a[4*i+1] as u64 + a[4*i+2] as u64 + a[4*i+3] as u64 + c[i] as u64;
+                let (v, o) = if s > u32::MAX as u64 { (u32::MAX, true) } else { (s as u32, false) };
+                r[i] = v; sat |= o;
+            }
+            if sat { ctx.set_vscr_sat(true); }
+            ctx.vr[instr.rd()] = xenia_types::Vec128::from_u32x4_array(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vsum4shs => {
+            let a = crate::vmx::as_i16x8(ctx.vr[instr.ra()]);
+            let c = crate::vmx::as_i32x4(ctx.vr[instr.rb()]);
+            let mut r = [0i32; 4]; let mut sat = false;
+            for i in 0..4 {
+                let s = a[2*i] as i64 + a[2*i+1] as i64 + c[i] as i64;
+                let (v, o) = crate::vmx::sat_i64_to_i32(s);
+                r[i] = v; sat |= o;
+            }
+            if sat { ctx.set_vscr_sat(true); }
+            ctx.vr[instr.rd()] = crate::vmx::from_i32x4(r);
+            ctx.pc += 4;
+        }
+
+        // ═════════════════════════════════════════════════════════════════
+        // §4c — VMX integer compares (all set 0xFF/0xFFFF/0xFFFFFFFF per lane)
+        // ═════════════════════════════════════════════════════════════════
+        PpcOpcode::vcmpequb => {
+            let a = ctx.vr[instr.ra()].as_bytes();
+            let b = ctx.vr[instr.rb()].as_bytes();
+            let mut r = [0u8; 16];
+            for i in 0..16 { r[i] = if a[i] == b[i] { 0xFF } else { 0 }; }
+            let v = xenia_types::Vec128::from_bytes(r);
+            if instr.rc_bit() {
+                let (t, f) = crate::vmx::cr6_flags_from_mask(v);
+                ctx.cr[6] = crate::context::CrField { lt: t, gt: false, eq: f, so: false };
+            }
+            ctx.vr[instr.rd()] = v;
+            ctx.pc += 4;
+        }
+        PpcOpcode::vcmpequh => {
+            let a = ctx.vr[instr.ra()].as_u16x8();
+            let b = ctx.vr[instr.rb()].as_u16x8();
+            let mut r = [0u16; 8];
+            for i in 0..8 { r[i] = if a[i] == b[i] { 0xFFFF } else { 0 }; }
+            let v = xenia_types::Vec128::from_u16x8_array(r);
+            if instr.rc_bit() {
+                let (t, f) = crate::vmx::cr6_flags_from_mask(v);
+                ctx.cr[6] = crate::context::CrField { lt: t, gt: false, eq: f, so: false };
+            }
+            ctx.vr[instr.rd()] = v;
+            ctx.pc += 4;
+        }
+        PpcOpcode::vcmpgtub => {
+            let a = ctx.vr[instr.ra()].as_bytes();
+            let b = ctx.vr[instr.rb()].as_bytes();
+            let mut r = [0u8; 16];
+            for i in 0..16 { r[i] = if a[i] > b[i] { 0xFF } else { 0 }; }
+            let v = xenia_types::Vec128::from_bytes(r);
+            if instr.rc_bit() {
+                let (t, f) = crate::vmx::cr6_flags_from_mask(v);
+                ctx.cr[6] = crate::context::CrField { lt: t, gt: false, eq: f, so: false };
+            }
+            ctx.vr[instr.rd()] = v;
+            ctx.pc += 4;
+        }
+        PpcOpcode::vcmpgtsb => {
+            let a = crate::vmx::as_i8x16(ctx.vr[instr.ra()]);
+            let b = crate::vmx::as_i8x16(ctx.vr[instr.rb()]);
+            let mut r = [0u8; 16];
+            for i in 0..16 { r[i] = if a[i] > b[i] { 0xFF } else { 0 }; }
+            let v = xenia_types::Vec128::from_bytes(r);
+            if instr.rc_bit() {
+                let (t, f) = crate::vmx::cr6_flags_from_mask(v);
+                ctx.cr[6] = crate::context::CrField { lt: t, gt: false, eq: f, so: false };
+            }
+            ctx.vr[instr.rd()] = v;
+            ctx.pc += 4;
+        }
+        PpcOpcode::vcmpgtuh => {
+            let a = ctx.vr[instr.ra()].as_u16x8();
+            let b = ctx.vr[instr.rb()].as_u16x8();
+            let mut r = [0u16; 8];
+            for i in 0..8 { r[i] = if a[i] > b[i] { 0xFFFF } else { 0 }; }
+            let v = xenia_types::Vec128::from_u16x8_array(r);
+            if instr.rc_bit() {
+                let (t, f) = crate::vmx::cr6_flags_from_mask(v);
+                ctx.cr[6] = crate::context::CrField { lt: t, gt: false, eq: f, so: false };
+            }
+            ctx.vr[instr.rd()] = v;
+            ctx.pc += 4;
+        }
+        PpcOpcode::vcmpgtsh => {
+            let a = crate::vmx::as_i16x8(ctx.vr[instr.ra()]);
+            let b = crate::vmx::as_i16x8(ctx.vr[instr.rb()]);
+            let mut r = [0u16; 8];
+            for i in 0..8 { r[i] = if a[i] > b[i] { 0xFFFF } else { 0 }; }
+            let v = xenia_types::Vec128::from_u16x8_array(r);
+            if instr.rc_bit() {
+                let (t, f) = crate::vmx::cr6_flags_from_mask(v);
+                ctx.cr[6] = crate::context::CrField { lt: t, gt: false, eq: f, so: false };
+            }
+            ctx.vr[instr.rd()] = v;
+            ctx.pc += 4;
+        }
+        PpcOpcode::vcmpgtuw => {
+            let a = ctx.vr[instr.ra()].as_u32x4();
+            let b = ctx.vr[instr.rb()].as_u32x4();
+            let mut r = [0u32; 4];
+            for i in 0..4 { r[i] = if a[i] > b[i] { 0xFFFFFFFF } else { 0 }; }
+            let v = xenia_types::Vec128::from_u32x4_array(r);
+            if instr.rc_bit() { update_cr6_from_vmask(&r, ctx); }
+            ctx.vr[instr.rd()] = v;
+            ctx.pc += 4;
+        }
+        PpcOpcode::vcmpgtsw => {
+            let a = crate::vmx::as_i32x4(ctx.vr[instr.ra()]);
+            let b = crate::vmx::as_i32x4(ctx.vr[instr.rb()]);
+            let mut r = [0u32; 4];
+            for i in 0..4 { r[i] = if a[i] > b[i] { 0xFFFFFFFF } else { 0 }; }
+            let v = xenia_types::Vec128::from_u32x4_array(r);
+            if instr.rc_bit() { update_cr6_from_vmask(&r, ctx); }
+            ctx.vr[instr.rd()] = v;
+            ctx.pc += 4;
+        }
+        // vcmpbfp(128): set upper/lower nibbles per lane based on bounds test.
+        PpcOpcode::vcmpbfp | PpcOpcode::vcmpbfp128 => {
+            let is_128 = matches!(instr.opcode, PpcOpcode::vcmpbfp128);
+            let (ra, rb, rd) = if is_128 {
+                (instr.va128(), instr.vb128(), instr.vd128())
+            } else {
+                (instr.ra(), instr.rb(), instr.rd())
+            };
+            let a = ctx.vr[ra].as_f32x4();
+            let b = ctx.vr[rb].as_f32x4();
+            let mut r = [0u32; 4];
+            let mut any_out = false;
+            for i in 0..4 {
+                let mut lane: u32 = 0;
+                if a[i].is_nan() || b[i].is_nan() || a[i] > b[i] { lane |= 0x8000_0000; any_out = true; }
+                if a[i].is_nan() || b[i].is_nan() || a[i] < -b[i] { lane |= 0x4000_0000; any_out = true; }
+                r[i] = lane;
+            }
+            if instr.rc_bit() {
+                ctx.cr[6] = crate::context::CrField {
+                    lt: false, gt: false, eq: !any_out, so: false,
+                };
+            }
+            ctx.vr[rd] = xenia_types::Vec128::from_u32x4_array(r);
+            ctx.pc += 4;
+        }
+
+        // ═════════════════════════════════════════════════════════════════
+        // §4d — VMX shifts and rotates
+        // ═════════════════════════════════════════════════════════════════
+        PpcOpcode::vslb => {
+            let a = ctx.vr[instr.ra()].as_bytes();
+            let b = ctx.vr[instr.rb()].as_bytes();
+            let mut r = [0u8; 16];
+            for i in 0..16 { r[i] = a[i] << (b[i] & 7); }
+            ctx.vr[instr.rd()] = xenia_types::Vec128::from_bytes(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vsrb => {
+            let a = ctx.vr[instr.ra()].as_bytes();
+            let b = ctx.vr[instr.rb()].as_bytes();
+            let mut r = [0u8; 16];
+            for i in 0..16 { r[i] = a[i] >> (b[i] & 7); }
+            ctx.vr[instr.rd()] = xenia_types::Vec128::from_bytes(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vsrab => {
+            let a = crate::vmx::as_i8x16(ctx.vr[instr.ra()]);
+            let b = ctx.vr[instr.rb()].as_bytes();
+            let mut r = [0i8; 16];
+            for i in 0..16 { r[i] = a[i] >> (b[i] & 7); }
+            ctx.vr[instr.rd()] = crate::vmx::from_i8x16(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vrlb => {
+            let a = ctx.vr[instr.ra()].as_bytes();
+            let b = ctx.vr[instr.rb()].as_bytes();
+            let mut r = [0u8; 16];
+            for i in 0..16 { r[i] = a[i].rotate_left((b[i] & 7) as u32); }
+            ctx.vr[instr.rd()] = xenia_types::Vec128::from_bytes(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vslh => {
+            let a = ctx.vr[instr.ra()].as_u16x8();
+            let b = ctx.vr[instr.rb()].as_u16x8();
+            let mut r = [0u16; 8];
+            for i in 0..8 { r[i] = a[i] << (b[i] & 0xF); }
+            ctx.vr[instr.rd()] = xenia_types::Vec128::from_u16x8_array(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vsrh => {
+            let a = ctx.vr[instr.ra()].as_u16x8();
+            let b = ctx.vr[instr.rb()].as_u16x8();
+            let mut r = [0u16; 8];
+            for i in 0..8 { r[i] = a[i] >> (b[i] & 0xF); }
+            ctx.vr[instr.rd()] = xenia_types::Vec128::from_u16x8_array(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vsrah => {
+            let a = crate::vmx::as_i16x8(ctx.vr[instr.ra()]);
+            let b = ctx.vr[instr.rb()].as_u16x8();
+            let mut r = [0i16; 8];
+            for i in 0..8 { r[i] = a[i] >> (b[i] & 0xF); }
+            ctx.vr[instr.rd()] = crate::vmx::from_i16x8(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vrlh => {
+            let a = ctx.vr[instr.ra()].as_u16x8();
+            let b = ctx.vr[instr.rb()].as_u16x8();
+            let mut r = [0u16; 8];
+            for i in 0..8 { r[i] = a[i].rotate_left((b[i] & 0xF) as u32); }
+            ctx.vr[instr.rd()] = xenia_types::Vec128::from_u16x8_array(r);
+            ctx.pc += 4;
+        }
+        // vslw / vsrw / vsraw / vrlw (word shifts) are implemented above via
+        // vmx_reg_triple — skip here.
+
+        // Full 128-bit bit shifts (vsl/vsr): shift by the low 3 bits of vB[15].
+        PpcOpcode::vsl => {
+            let a = u128::from_be_bytes(ctx.vr[instr.ra()].as_bytes());
+            let shift = (ctx.vr[instr.rb()].as_bytes()[15] & 7) as u32;
+            let r = if shift == 0 { a } else { a << shift };
+            ctx.vr[instr.rd()] = xenia_types::Vec128::from_bytes(r.to_be_bytes());
+            ctx.pc += 4;
+        }
+        PpcOpcode::vsr => {
+            let a = u128::from_be_bytes(ctx.vr[instr.ra()].as_bytes());
+            let shift = (ctx.vr[instr.rb()].as_bytes()[15] & 7) as u32;
+            let r = if shift == 0 { a } else { a >> shift };
+            ctx.vr[instr.rd()] = xenia_types::Vec128::from_bytes(r.to_be_bytes());
+            ctx.pc += 4;
+        }
+        // vslo/vsro: 128-bit octet (byte) shift. vB[15] & 0x78 gives bit count / 8 * 8.
+        PpcOpcode::vslo | PpcOpcode::vslo128 => {
+            let is_128 = matches!(instr.opcode, PpcOpcode::vslo128);
+            let (ra, rb, rd) = if is_128 { (instr.va128(), instr.vb128(), instr.vd128()) }
+                               else { (instr.ra(), instr.rb(), instr.rd()) };
+            let a = u128::from_be_bytes(ctx.vr[ra].as_bytes());
+            let nbytes = ((ctx.vr[rb].as_bytes()[15] >> 3) & 0xF) as u32;
+            let r = if nbytes == 0 { a } else { a << (nbytes * 8) };
+            ctx.vr[rd] = xenia_types::Vec128::from_bytes(r.to_be_bytes());
+            ctx.pc += 4;
+        }
+        PpcOpcode::vsro | PpcOpcode::vsro128 => {
+            let is_128 = matches!(instr.opcode, PpcOpcode::vsro128);
+            let (ra, rb, rd) = if is_128 { (instr.va128(), instr.vb128(), instr.vd128()) }
+                               else { (instr.ra(), instr.rb(), instr.rd()) };
+            let a = u128::from_be_bytes(ctx.vr[ra].as_bytes());
+            let nbytes = ((ctx.vr[rb].as_bytes()[15] >> 3) & 0xF) as u32;
+            let r = if nbytes == 0 { a } else { a >> (nbytes * 8) };
+            ctx.vr[rd] = xenia_types::Vec128::from_bytes(r.to_be_bytes());
+            ctx.pc += 4;
+        }
+        // vrlimi128: rotate-left-immediate then partial-merge into vD.
+        // Field layout (from canary ppc_decode_data.cc VX128_4):
+        //   imm = bits(22..=23,28..=29) for shift, mask = bits(24..=27)
+        // Simplified semantics: r = vB rotated left by `shift` words, merged
+        // into vD using a per-word `mask` (mask bit N == 1 ⇒ use vD[N], else
+        // use rotated[N]). Titles generally use mask=0xF (copy-all) which
+        // makes this behave like a plain word rotate.
+        PpcOpcode::vrlimi128 => {
+            let shift = ((instr.raw >> 16) & 0x3) as usize;
+            let mask = (instr.raw >> 2) & 0xF; // VX128_4 "fmask"
+            let b = ctx.vr[instr.vb128()].as_u32x4();
+            let d = ctx.vr[instr.vd128()].as_u32x4();
+            let rot = [b[shift % 4], b[(shift + 1) % 4], b[(shift + 2) % 4], b[(shift + 3) % 4]];
+            let mut r = [0u32; 4];
+            for i in 0..4 {
+                // mask bit 3 corresponds to word 0 (BE-first). Use rot when
+                // the corresponding mask bit is set.
+                let use_rot = (mask >> (3 - i)) & 1 == 1;
+                r[i] = if use_rot { rot[i] } else { d[i] };
+            }
+            ctx.vr[instr.vd128()] = xenia_types::Vec128::from_u32x4_array(r);
+            ctx.pc += 4;
+        }
+
+        // ═════════════════════════════════════════════════════════════════
+        // §4e — VMX merge (interleave high / low halves)
+        // ═════════════════════════════════════════════════════════════════
+        PpcOpcode::vmrghb => {
+            let a = ctx.vr[instr.ra()].as_bytes();
+            let b = ctx.vr[instr.rb()].as_bytes();
+            let mut r = [0u8; 16];
+            for i in 0..8 { r[2*i] = a[i]; r[2*i+1] = b[i]; }
+            ctx.vr[instr.rd()] = xenia_types::Vec128::from_bytes(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vmrglb => {
+            let a = ctx.vr[instr.ra()].as_bytes();
+            let b = ctx.vr[instr.rb()].as_bytes();
+            let mut r = [0u8; 16];
+            for i in 0..8 { r[2*i] = a[8+i]; r[2*i+1] = b[8+i]; }
+            ctx.vr[instr.rd()] = xenia_types::Vec128::from_bytes(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vmrghh => {
+            let a = ctx.vr[instr.ra()].as_u16x8();
+            let b = ctx.vr[instr.rb()].as_u16x8();
+            let mut r = [0u16; 8];
+            for i in 0..4 { r[2*i] = a[i]; r[2*i+1] = b[i]; }
+            ctx.vr[instr.rd()] = xenia_types::Vec128::from_u16x8_array(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vmrglh => {
+            let a = ctx.vr[instr.ra()].as_u16x8();
+            let b = ctx.vr[instr.rb()].as_u16x8();
+            let mut r = [0u16; 8];
+            for i in 0..4 { r[2*i] = a[4+i]; r[2*i+1] = b[4+i]; }
+            ctx.vr[instr.rd()] = xenia_types::Vec128::from_u16x8_array(r);
+            ctx.pc += 4;
+        }
+
+        // ═════════════════════════════════════════════════════════════════
+        // §4f — VMX pack / unpack (saturating and modulo + D3D + 5-6-5)
+        // ═════════════════════════════════════════════════════════════════
+        // ---- Pack modulo (truncate) ----
+        PpcOpcode::vpkuhum | PpcOpcode::vpkuhum128 => {
+            let is_128 = matches!(instr.opcode, PpcOpcode::vpkuhum128);
+            let (ra, rb, rd) = if is_128 { (instr.va128(), instr.vb128(), instr.vd128()) }
+                               else { (instr.ra(), instr.rb(), instr.rd()) };
+            let a = ctx.vr[ra].as_u16x8();
+            let b = ctx.vr[rb].as_u16x8();
+            let mut r = [0u8; 16];
+            for i in 0..8 { r[i] = a[i] as u8; }
+            for i in 0..8 { r[8 + i] = b[i] as u8; }
+            ctx.vr[rd] = xenia_types::Vec128::from_bytes(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vpkuwum | PpcOpcode::vpkuwum128 => {
+            let is_128 = matches!(instr.opcode, PpcOpcode::vpkuwum128);
+            let (ra, rb, rd) = if is_128 { (instr.va128(), instr.vb128(), instr.vd128()) }
+                               else { (instr.ra(), instr.rb(), instr.rd()) };
+            let a = ctx.vr[ra].as_u32x4();
+            let b = ctx.vr[rb].as_u32x4();
+            let mut r = [0u16; 8];
+            for i in 0..4 { r[i] = a[i] as u16; }
+            for i in 0..4 { r[4 + i] = b[i] as u16; }
+            ctx.vr[rd] = xenia_types::Vec128::from_u16x8_array(r);
+            ctx.pc += 4;
+        }
+        // ---- Pack with saturation ----
+        PpcOpcode::vpkuhus | PpcOpcode::vpkuhus128 => {
+            let is_128 = matches!(instr.opcode, PpcOpcode::vpkuhus128);
+            let (ra, rb, rd) = if is_128 { (instr.va128(), instr.vb128(), instr.vd128()) }
+                               else { (instr.ra(), instr.rb(), instr.rd()) };
+            let a = ctx.vr[ra].as_u16x8();
+            let b = ctx.vr[rb].as_u16x8();
+            let mut r = [0u8; 16]; let mut sat = false;
+            for i in 0..8 { let (v, s) = crate::vmx::sat_u16_to_u8(a[i]); r[i] = v; sat |= s; }
+            for i in 0..8 { let (v, s) = crate::vmx::sat_u16_to_u8(b[i]); r[8 + i] = v; sat |= s; }
+            if sat { ctx.set_vscr_sat(true); }
+            ctx.vr[rd] = xenia_types::Vec128::from_bytes(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vpkshus | PpcOpcode::vpkshus128 => {
+            let is_128 = matches!(instr.opcode, PpcOpcode::vpkshus128);
+            let (ra, rb, rd) = if is_128 { (instr.va128(), instr.vb128(), instr.vd128()) }
+                               else { (instr.ra(), instr.rb(), instr.rd()) };
+            let a = crate::vmx::as_i16x8(ctx.vr[ra]);
+            let b = crate::vmx::as_i16x8(ctx.vr[rb]);
+            let mut r = [0u8; 16]; let mut sat = false;
+            for i in 0..8 { let (v, s) = crate::vmx::sat_i16_to_u8(a[i]); r[i] = v; sat |= s; }
+            for i in 0..8 { let (v, s) = crate::vmx::sat_i16_to_u8(b[i]); r[8 + i] = v; sat |= s; }
+            if sat { ctx.set_vscr_sat(true); }
+            ctx.vr[rd] = xenia_types::Vec128::from_bytes(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vpkshss | PpcOpcode::vpkshss128 => {
+            let is_128 = matches!(instr.opcode, PpcOpcode::vpkshss128);
+            let (ra, rb, rd) = if is_128 { (instr.va128(), instr.vb128(), instr.vd128()) }
+                               else { (instr.ra(), instr.rb(), instr.rd()) };
+            let a = crate::vmx::as_i16x8(ctx.vr[ra]);
+            let b = crate::vmx::as_i16x8(ctx.vr[rb]);
+            let mut r = [0i8; 16]; let mut sat = false;
+            for i in 0..8 { let (v, s) = crate::vmx::sat_i16_to_i8(a[i]); r[i] = v; sat |= s; }
+            for i in 0..8 { let (v, s) = crate::vmx::sat_i16_to_i8(b[i]); r[8 + i] = v; sat |= s; }
+            if sat { ctx.set_vscr_sat(true); }
+            ctx.vr[rd] = crate::vmx::from_i8x16(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vpkuwus | PpcOpcode::vpkuwus128 => {
+            let is_128 = matches!(instr.opcode, PpcOpcode::vpkuwus128);
+            let (ra, rb, rd) = if is_128 { (instr.va128(), instr.vb128(), instr.vd128()) }
+                               else { (instr.ra(), instr.rb(), instr.rd()) };
+            let a = ctx.vr[ra].as_u32x4();
+            let b = ctx.vr[rb].as_u32x4();
+            let mut r = [0u16; 8]; let mut sat = false;
+            for i in 0..4 { let (v, s) = crate::vmx::sat_u32_to_u16(a[i]); r[i] = v; sat |= s; }
+            for i in 0..4 { let (v, s) = crate::vmx::sat_u32_to_u16(b[i]); r[4 + i] = v; sat |= s; }
+            if sat { ctx.set_vscr_sat(true); }
+            ctx.vr[rd] = xenia_types::Vec128::from_u16x8_array(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vpkswus | PpcOpcode::vpkswus128 => {
+            let is_128 = matches!(instr.opcode, PpcOpcode::vpkswus128);
+            let (ra, rb, rd) = if is_128 { (instr.va128(), instr.vb128(), instr.vd128()) }
+                               else { (instr.ra(), instr.rb(), instr.rd()) };
+            let a = crate::vmx::as_i32x4(ctx.vr[ra]);
+            let b = crate::vmx::as_i32x4(ctx.vr[rb]);
+            let mut r = [0u16; 8]; let mut sat = false;
+            for i in 0..4 { let (v, s) = crate::vmx::sat_i32_to_u16(a[i]); r[i] = v; sat |= s; }
+            for i in 0..4 { let (v, s) = crate::vmx::sat_i32_to_u16(b[i]); r[4 + i] = v; sat |= s; }
+            if sat { ctx.set_vscr_sat(true); }
+            ctx.vr[rd] = xenia_types::Vec128::from_u16x8_array(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vpkswss | PpcOpcode::vpkswss128 => {
+            let is_128 = matches!(instr.opcode, PpcOpcode::vpkswss128);
+            let (ra, rb, rd) = if is_128 { (instr.va128(), instr.vb128(), instr.vd128()) }
+                               else { (instr.ra(), instr.rb(), instr.rd()) };
+            let a = crate::vmx::as_i32x4(ctx.vr[ra]);
+            let b = crate::vmx::as_i32x4(ctx.vr[rb]);
+            let mut r = [0i16; 8]; let mut sat = false;
+            for i in 0..4 { let (v, s) = crate::vmx::sat_i32_to_i16(a[i]); r[i] = v; sat |= s; }
+            for i in 0..4 { let (v, s) = crate::vmx::sat_i32_to_i16(b[i]); r[4 + i] = v; sat |= s; }
+            if sat { ctx.set_vscr_sat(true); }
+            ctx.vr[rd] = crate::vmx::from_i16x8(r);
+            ctx.pc += 4;
+        }
+        // vpkpx: pack two u32 vectors into one u16 (5-5-5 pixel) vector.
+        PpcOpcode::vpkpx => {
+            let a = ctx.vr[instr.ra()].as_u32x4();
+            let b = ctx.vr[instr.rb()].as_u32x4();
+            let mut r = [0u16; 8];
+            for i in 0..4 { r[i] = crate::vmx::pack_pixel_555(a[i]); }
+            for i in 0..4 { r[4 + i] = crate::vmx::pack_pixel_555(b[i]); }
+            ctx.vr[instr.rd()] = xenia_types::Vec128::from_u16x8_array(r);
+            ctx.pc += 4;
+        }
+
+        // ---- Unpack (sign-extend) ----
+        PpcOpcode::vupkhsb | PpcOpcode::vupkhsb128 => {
+            let is_128 = matches!(instr.opcode, PpcOpcode::vupkhsb128);
+            let (rb, rd) = if is_128 { (instr.vb128(), instr.vd128()) }
+                           else { (instr.rb(), instr.rd()) };
+            let b = crate::vmx::as_i8x16(ctx.vr[rb]);
+            let mut r = [0i16; 8];
+            for i in 0..8 { r[i] = b[i] as i16; }
+            ctx.vr[rd] = crate::vmx::from_i16x8(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vupklsb | PpcOpcode::vupklsb128 => {
+            let is_128 = matches!(instr.opcode, PpcOpcode::vupklsb128);
+            let (rb, rd) = if is_128 { (instr.vb128(), instr.vd128()) }
+                           else { (instr.rb(), instr.rd()) };
+            let b = crate::vmx::as_i8x16(ctx.vr[rb]);
+            let mut r = [0i16; 8];
+            for i in 0..8 { r[i] = b[8 + i] as i16; }
+            ctx.vr[rd] = crate::vmx::from_i16x8(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vupkhsh => {
+            let b = crate::vmx::as_i16x8(ctx.vr[instr.rb()]);
+            let mut r = [0i32; 4];
+            for i in 0..4 { r[i] = b[i] as i32; }
+            ctx.vr[instr.rd()] = crate::vmx::from_i32x4(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vupklsh => {
+            let b = crate::vmx::as_i16x8(ctx.vr[instr.rb()]);
+            let mut r = [0i32; 4];
+            for i in 0..4 { r[i] = b[4 + i] as i32; }
+            ctx.vr[instr.rd()] = crate::vmx::from_i32x4(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vupkhpx => {
+            let b = ctx.vr[instr.rb()].as_u16x8();
+            let mut r = [0u32; 4];
+            for i in 0..4 { r[i] = crate::vmx::unpack_pixel_555(b[i]); }
+            ctx.vr[instr.rd()] = xenia_types::Vec128::from_u32x4_array(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vupklpx => {
+            let b = ctx.vr[instr.rb()].as_u16x8();
+            let mut r = [0u32; 4];
+            for i in 0..4 { r[i] = crate::vmx::unpack_pixel_555(b[4 + i]); }
+            ctx.vr[instr.rd()] = xenia_types::Vec128::from_u32x4_array(r);
+            ctx.pc += 4;
+        }
+
+        // ---- D3D pack / unpack (VMX128-only) ----
+        //
+        // First-Pixels M3: fixed immediate extraction + added pack types
+        // 1-6. The prior `(instr.raw >> 6) & 0x7` was LSB-numbered (wrong
+        // position) and masked to only 3 bits. Canary extracts from the
+        // VX128_3/4 `IMM` field at PPC bits 16-22 (MSB) and does
+        // `type = IMM >> 2` to pick up the 5-bit type selector — the low
+        // 2 bits (`pack`) select output-slot layout for `vpkd3d128` and
+        // are ignored by `vupkd3d128`. Extracting the low 2 bits as
+        // `pack` (unused here — we hand back the codec output in its
+        // canonical lane position, the subsequent permute instruction
+        // handles placement) for completeness.
+        PpcOpcode::vpkd3d128 => {
+            use crate::vmx::D3dPackType;
+            let uimm = crate::decoder::extract_vx128_uimm5(instr.raw);
+            let ty = D3dPackType::from_immediate(uimm >> 2);
+            let src = ctx.vr[instr.vb128()];
+            let out = match ty {
+                D3dPackType::D3dColor     => crate::vmx::pack_d3dcolor(src),
+                D3dPackType::NormShort2   => crate::vmx::pack_normshort2(src),
+                D3dPackType::NormPacked32 => crate::vmx::pack_normpacked32(src),
+                D3dPackType::Float16_2    => crate::vmx::pack_float16_2(src),
+                D3dPackType::NormShort4   => crate::vmx::pack_normshort4(src),
+                D3dPackType::Float16_4    => crate::vmx::pack_float16_4(src),
+                D3dPackType::NormPacked64 => crate::vmx::pack_normpacked64(src),
+                D3dPackType::Other(t)     => {
+                    tracing::warn!(
+                        raw = format_args!("{:#010x}", instr.raw),
+                        uimm,
+                        ty = t,
+                        "vpkd3d128: unhandled pack type at {:#010x}",
+                        ctx.pc,
+                    );
+                    src
+                }
+            };
+            ctx.vr[instr.vd128()] = out;
+            ctx.pc += 4;
+        }
+        PpcOpcode::vupkd3d128 => {
+            use crate::vmx::D3dPackType;
+            let uimm = crate::decoder::extract_vx128_uimm5(instr.raw);
+            let ty = D3dPackType::from_immediate(uimm >> 2);
+            let src = ctx.vr[instr.vb128()];
+            let out = match ty {
+                D3dPackType::D3dColor     => crate::vmx::unpack_d3dcolor(src),
+                D3dPackType::NormShort2   => crate::vmx::unpack_normshort2(src),
+                D3dPackType::NormPacked32 => crate::vmx::unpack_normpacked32(src),
+                D3dPackType::Float16_2    => crate::vmx::unpack_float16_2(src),
+                D3dPackType::NormShort4   => crate::vmx::unpack_normshort4(src),
+                D3dPackType::Float16_4    => crate::vmx::unpack_float16_4(src),
+                D3dPackType::NormPacked64 => crate::vmx::unpack_normpacked64(src),
+                D3dPackType::Other(t)     => {
+                    tracing::warn!(
+                        raw = format_args!("{:#010x}", instr.raw),
+                        uimm,
+                        ty = t,
+                        "vupkd3d128: unhandled pack type at {:#010x}",
+                        ctx.pc,
+                    );
+                    src
+                }
+            };
+            ctx.vr[instr.vd128()] = out;
+            ctx.pc += 4;
+        }
+
+        // ═════════════════════════════════════════════════════════════════
+        // §4g — VMX convert (float ↔ fixed-point)
+        // ═════════════════════════════════════════════════════════════════
+        // vctsxs / vctuxs: f32 → i32/u32, scaled by 2^uimm, saturating.
+        PpcOpcode::vctsxs => {
+            let uimm = (instr.raw >> 16) & 0x1F;
+            let b = ctx.vr[instr.rb()].as_f32x4();
+            let mut r = [0i32; 4]; let mut sat = false;
+            for i in 0..4 {
+                let (v, s) = crate::vmx::cvt_f32_to_i32_sat(b[i], uimm);
+                r[i] = v; sat |= s;
+            }
+            if sat { ctx.set_vscr_sat(true); }
+            ctx.vr[instr.rd()] = crate::vmx::from_i32x4(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vctuxs => {
+            let uimm = (instr.raw >> 16) & 0x1F;
+            let b = ctx.vr[instr.rb()].as_f32x4();
+            let mut r = [0u32; 4]; let mut sat = false;
+            for i in 0..4 {
+                let (v, s) = crate::vmx::cvt_f32_to_u32_sat(b[i], uimm);
+                r[i] = v; sat |= s;
+            }
+            if sat { ctx.set_vscr_sat(true); }
+            ctx.vr[instr.rd()] = xenia_types::Vec128::from_u32x4_array(r);
+            ctx.pc += 4;
+        }
+        // vcfsx / vcfux: i32/u32 → f32, scaled by 2^-uimm.
+        PpcOpcode::vcfsx => {
+            let uimm = (instr.raw >> 16) & 0x1F;
+            let b = crate::vmx::as_i32x4(ctx.vr[instr.rb()]);
+            let mut r = [0f32; 4];
+            for i in 0..4 { r[i] = crate::vmx::cvt_i32_to_f32(b[i], uimm); }
+            ctx.vr[instr.rd()] = xenia_types::Vec128::from_f32x4_array(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vcfux => {
+            let uimm = (instr.raw >> 16) & 0x1F;
+            let b = ctx.vr[instr.rb()].as_u32x4();
+            let mut r = [0f32; 4];
+            for i in 0..4 { r[i] = crate::vmx::cvt_u32_to_f32(b[i], uimm); }
+            ctx.vr[instr.rd()] = xenia_types::Vec128::from_f32x4_array(r);
+            ctx.pc += 4;
+        }
+        // VMX128 convert variants. uimm lives in bits 16-20 of the encoded form.
+        PpcOpcode::vcfpsxws128 => {
+            let uimm = (instr.raw >> 16) & 0x1F;
+            let b = ctx.vr[instr.vb128()].as_f32x4();
+            let mut r = [0i32; 4]; let mut sat = false;
+            for i in 0..4 {
+                let (v, s) = crate::vmx::cvt_f32_to_i32_sat(b[i], uimm);
+                r[i] = v; sat |= s;
+            }
+            if sat { ctx.set_vscr_sat(true); }
+            ctx.vr[instr.vd128()] = crate::vmx::from_i32x4(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vcfpuxws128 => {
+            let uimm = (instr.raw >> 16) & 0x1F;
+            let b = ctx.vr[instr.vb128()].as_f32x4();
+            let mut r = [0u32; 4]; let mut sat = false;
+            for i in 0..4 {
+                let (v, s) = crate::vmx::cvt_f32_to_u32_sat(b[i], uimm);
+                r[i] = v; sat |= s;
+            }
+            if sat { ctx.set_vscr_sat(true); }
+            ctx.vr[instr.vd128()] = xenia_types::Vec128::from_u32x4_array(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vcsxwfp128 => {
+            let uimm = (instr.raw >> 16) & 0x1F;
+            let b = crate::vmx::as_i32x4(ctx.vr[instr.vb128()]);
+            let mut r = [0f32; 4];
+            for i in 0..4 { r[i] = crate::vmx::cvt_i32_to_f32(b[i], uimm); }
+            ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vcuxwfp128 => {
+            let uimm = (instr.raw >> 16) & 0x1F;
+            let b = ctx.vr[instr.vb128()].as_u32x4();
+            let mut r = [0f32; 4];
+            for i in 0..4 { r[i] = crate::vmx::cvt_u32_to_f32(b[i], uimm); }
+            ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r);
+            ctx.pc += 4;
+        }
+
+        // ═════════════════════════════════════════════════════════════════
+        // §4h — VMX vector FPU (exp / log)
+        // ═════════════════════════════════════════════════════════════════
+        PpcOpcode::vexptefp | PpcOpcode::vexptefp128 => {
+            let is_128 = matches!(instr.opcode, PpcOpcode::vexptefp128);
+            let (rb, rd) = if is_128 { (instr.vb128(), instr.vd128()) }
+                           else { (instr.rb(), instr.rd()) };
+            let b = ctx.vr[rb].as_f32x4();
+            let mut r = [0f32; 4];
+            for i in 0..4 { r[i] = b[i].exp2(); }
+            ctx.vr[rd] = xenia_types::Vec128::from_f32x4_array(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vlogefp | PpcOpcode::vlogefp128 => {
+            let is_128 = matches!(instr.opcode, PpcOpcode::vlogefp128);
+            let (rb, rd) = if is_128 { (instr.vb128(), instr.vd128()) }
+                           else { (instr.rb(), instr.rd()) };
+            let b = ctx.vr[rb].as_f32x4();
+            let mut r = [0f32; 4];
+            for i in 0..4 { r[i] = b[i].log2(); }
+            ctx.vr[rd] = xenia_types::Vec128::from_f32x4_array(r);
+            ctx.pc += 4;
+        }
+
+        // ═════════════════════════════════════════════════════════════════
+        // §4i — VMX integer max / min
+        // ═════════════════════════════════════════════════════════════════
+        PpcOpcode::vmaxub => {
+            let a = ctx.vr[instr.ra()].as_bytes();
+            let b = ctx.vr[instr.rb()].as_bytes();
+            let mut r = [0u8; 16];
+            for i in 0..16 { r[i] = a[i].max(b[i]); }
+            ctx.vr[instr.rd()] = xenia_types::Vec128::from_bytes(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vminub => {
+            let a = ctx.vr[instr.ra()].as_bytes();
+            let b = ctx.vr[instr.rb()].as_bytes();
+            let mut r = [0u8; 16];
+            for i in 0..16 { r[i] = a[i].min(b[i]); }
+            ctx.vr[instr.rd()] = xenia_types::Vec128::from_bytes(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vmaxsb => {
+            let a = crate::vmx::as_i8x16(ctx.vr[instr.ra()]);
+            let b = crate::vmx::as_i8x16(ctx.vr[instr.rb()]);
+            let mut r = [0i8; 16];
+            for i in 0..16 { r[i] = a[i].max(b[i]); }
+            ctx.vr[instr.rd()] = crate::vmx::from_i8x16(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vminsb => {
+            let a = crate::vmx::as_i8x16(ctx.vr[instr.ra()]);
+            let b = crate::vmx::as_i8x16(ctx.vr[instr.rb()]);
+            let mut r = [0i8; 16];
+            for i in 0..16 { r[i] = a[i].min(b[i]); }
+            ctx.vr[instr.rd()] = crate::vmx::from_i8x16(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vmaxuh => {
+            let a = ctx.vr[instr.ra()].as_u16x8();
+            let b = ctx.vr[instr.rb()].as_u16x8();
+            let mut r = [0u16; 8];
+            for i in 0..8 { r[i] = a[i].max(b[i]); }
+            ctx.vr[instr.rd()] = xenia_types::Vec128::from_u16x8_array(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vminuh => {
+            let a = ctx.vr[instr.ra()].as_u16x8();
+            let b = ctx.vr[instr.rb()].as_u16x8();
+            let mut r = [0u16; 8];
+            for i in 0..8 { r[i] = a[i].min(b[i]); }
+            ctx.vr[instr.rd()] = xenia_types::Vec128::from_u16x8_array(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vmaxsh => {
+            let a = crate::vmx::as_i16x8(ctx.vr[instr.ra()]);
+            let b = crate::vmx::as_i16x8(ctx.vr[instr.rb()]);
+            let mut r = [0i16; 8];
+            for i in 0..8 { r[i] = a[i].max(b[i]); }
+            ctx.vr[instr.rd()] = crate::vmx::from_i16x8(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vminsh => {
+            let a = crate::vmx::as_i16x8(ctx.vr[instr.ra()]);
+            let b = crate::vmx::as_i16x8(ctx.vr[instr.rb()]);
+            let mut r = [0i16; 8];
+            for i in 0..8 { r[i] = a[i].min(b[i]); }
+            ctx.vr[instr.rd()] = crate::vmx::from_i16x8(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vmaxuw => {
+            let a = ctx.vr[instr.ra()].as_u32x4();
+            let b = ctx.vr[instr.rb()].as_u32x4();
+            let mut r = [0u32; 4];
+            for i in 0..4 { r[i] = a[i].max(b[i]); }
+            ctx.vr[instr.rd()] = xenia_types::Vec128::from_u32x4_array(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vminuw => {
+            let a = ctx.vr[instr.ra()].as_u32x4();
+            let b = ctx.vr[instr.rb()].as_u32x4();
+            let mut r = [0u32; 4];
+            for i in 0..4 { r[i] = a[i].min(b[i]); }
+            ctx.vr[instr.rd()] = xenia_types::Vec128::from_u32x4_array(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vmaxsw => {
+            let a = crate::vmx::as_i32x4(ctx.vr[instr.ra()]);
+            let b = crate::vmx::as_i32x4(ctx.vr[instr.rb()]);
+            let mut r = [0i32; 4];
+            for i in 0..4 { r[i] = a[i].max(b[i]); }
+            ctx.vr[instr.rd()] = crate::vmx::from_i32x4(r);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vminsw => {
+            let a = crate::vmx::as_i32x4(ctx.vr[instr.ra()]);
+            let b = crate::vmx::as_i32x4(ctx.vr[instr.rb()]);
+            let mut r = [0i32; 4];
+            for i in 0..4 { r[i] = a[i].min(b[i]); }
+            ctx.vr[instr.rd()] = crate::vmx::from_i32x4(r);
+            ctx.pc += 4;
+        }
+
+        // ═════════════════════════════════════════════════════════════════
+        // §4j — VMX128 FMA / permute
+        // ═════════════════════════════════════════════════════════════════
+        // vmaddcfp128: vD = vD * vB + vA (using vD's current value as accumulator)
+        PpcOpcode::vmaddcfp128 => {
+            // Xbox-360-specific: vD = (vD * vB) + vA. Note the VD-reuse: VD is both
+            // a source operand (as multiplicand) and the destination. Canary &
+            // POWER8 hardware confirm denormal inputs are flushed regardless of NJ.
+            let a = ctx.vr[instr.va128()].as_f32x4();
+            let b = ctx.vr[instr.vb128()].as_f32x4();
+            let d = ctx.vr[instr.vd128()].as_f32x4();
+            let mut r = [0f32; 4];
+            for i in 0..4 {
+                let ai = vmx::flush_denorm(a[i]);
+                let bi = vmx::flush_denorm(b[i]);
+                let di = vmx::flush_denorm(d[i]);
+                r[i] = di.mul_add(bi, ai);
+            }
+            ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r);
+            ctx.pc += 4;
+        }
+        // vmsum3fp128: horizontal sum of (vA * vB) over lanes 0..3, broadcast to all 4 output lanes.
+        // Canary `InstrEmit_vmsum3fp128` flushes the *output* denormal
+        // unconditionally (not the inputs) — see ppc_emit_altivec.cc:1067-1075.
+        PpcOpcode::vmsum3fp128 => {
+            let a = ctx.vr[instr.va128()].as_f32x4();
+            let b = ctx.vr[instr.vb128()].as_f32x4();
+            let s = vmx::flush_denorm(a[0] * b[0] + a[1] * b[1] + a[2] * b[2]);
+            ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4(s, s, s, s);
+            ctx.pc += 4;
+        }
+        PpcOpcode::vmsum4fp128 => {
+            let a = ctx.vr[instr.va128()].as_f32x4();
+            let b = ctx.vr[instr.vb128()].as_f32x4();
+            let s = vmx::flush_denorm(a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3]);
+            ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4(s, s, s, s);
+            ctx.pc += 4;
+        }
+        // vpermwi128: permute words of vB using an 8-bit immediate (2 bits per output lane).
+        PpcOpcode::vpermwi128 => {
+            let imm = (instr.raw >> 16) & 0xFF;
+            let b = ctx.vr[instr.vb128()].as_u32x4();
+            let mut r = [0u32; 4];
+            // Output lane i ← b[(imm >> (2 * (3-i))) & 3]
+            for i in 0..4 {
+                let sel = ((imm >> (2 * (3 - i))) & 3) as usize;
+                r[i] = b[sel];
+            }
+            ctx.vr[instr.vd128()] = xenia_types::Vec128::from_u32x4_array(r);
+            ctx.pc += 4;
+        }
+
+        // ═════════════════════════════════════════════════════════════════
+        // §4k — Scalar reservation / byte-reverse (doubleword)
+        // ═════════════════════════════════════════════════════════════════
+        // M3.7 — same table-vs-legacy split as lwarx/stwcx.
+        PpcOpcode::ldarx => {
+            let ea = ea_indexed(ctx, instr);
+            let val = mem.read_u64(ea);
+            ctx.gpr[instr.rd()] = val;
+            ctx.reserved_line = ea & !RESERVATION_MASK;
+            ctx.reserved_val = val;
+            ctx.has_reservation = true;
+            if let Some(t) = &ctx.reservation_table {
+                if t.is_enabled() {
+                    ctx.reserved_generation = t.reserve(ea, ctx.hw_id);
+                }
+            }
+            ctx.pc += 4;
+        }
+        PpcOpcode::stdcx => {
+            let ea = ea_indexed(ctx, instr);
+            let line = ea & !RESERVATION_MASK;
+            let table_route = ctx
+                .reservation_table
+                .as_ref()
+                .filter(|t| t.is_enabled())
+                .cloned();
+            let success = if let Some(t) = &table_route {
+                ctx.has_reservation
+                    && ctx.reserved_line == line
+                    && t.try_commit(ea, ctx.reserved_generation, ctx.hw_id)
+            } else {
+                ctx.has_reservation && ctx.reserved_line == line
+            };
+            if success {
+                mem.write_u64(ea, ctx.gpr[instr.rs()]);
+                ctx.cr[0] = crate::context::CrField {
+                    lt: false,
+                    gt: false,
+                    eq: true,
+                    so: ctx.xer_so != 0,
+                };
+            } else {
+                ctx.cr[0] = crate::context::CrField {
+                    lt: false,
+                    gt: false,
+                    eq: false,
+                    so: ctx.xer_so != 0,
+                };
+                if let Some(t) = &table_route {
+                    t.release(ea, ctx.reserved_generation, ctx.hw_id);
+                }
+            }
+            ctx.has_reservation = false;
+            ctx.pc += 4;
+        }
+        PpcOpcode::ldbrx => {
+            let ea = ea_indexed(ctx, instr);
+            ctx.gpr[instr.rd()] = mem.read_u64(ea).swap_bytes();
+            ctx.pc += 4;
+        }
+        PpcOpcode::stdbrx => {
+            let ea = ea_indexed(ctx, instr);
+            mem.write_u64(ea, ctx.gpr[instr.rs()].swap_bytes());
+            ctx.pc += 4;
+        }
+
+        // ═════════════════════════════════════════════════════════════════
+        // §4l — Scalar string load / store (register-length)
+        // ═════════════════════════════════════════════════════════════════
+        PpcOpcode::lswx => {
+            let mut ea = ea_indexed(ctx, instr);
+            let nb = ctx.xer() & 0x7F; // XER[25..31]
+            let mut rd = instr.rd();
+            let mut bytes_left = nb;
+            while bytes_left > 0 {
+                let mut val = 0u32;
+                for byte_idx in 0..4 {
+                    if bytes_left == 0 { break; }
+                    let b = mem.read_u8(ea) as u32;
+                    val |= b << (24 - byte_idx * 8);
+                    ea = ea.wrapping_add(1);
+                    bytes_left -= 1;
+                }
+                ctx.gpr[rd] = val as u64;
+                rd = (rd + 1) % 32;
+            }
+            ctx.pc += 4;
+        }
+        PpcOpcode::stswx => {
+            let mut ea = ea_indexed(ctx, instr);
+            let nb = ctx.xer() & 0x7F;
+            let mut rs = instr.rs();
+            let mut bytes_left = nb;
+            while bytes_left > 0 {
+                let val = ctx.gpr[rs] as u32;
+                for byte_idx in 0..4 {
+                    if bytes_left == 0 { break; }
+                    mem.write_u8(ea, (val >> (24 - byte_idx * 8)) as u8);
+                    ea = ea.wrapping_add(1);
+                    bytes_left -= 1;
+                }
+                rs = (rs + 1) % 32;
+            }
+            ctx.pc += 4;
+        }
+
+        // ═════════════════════════════════════════════════════════════════
+        // §4m — mcrxr: move XER condition bits to CR field, clear XER[SO/OV/CA]
+        // ═════════════════════════════════════════════════════════════════
+        PpcOpcode::mcrxr => {
+            let crfd = instr.crfd();
+            ctx.cr[crfd] = crate::context::CrField {
+                lt: ctx.xer_so != 0,
+                gt: ctx.xer_ov != 0,
+                eq: ctx.xer_ca != 0,
+                so: false,
+            };
+            ctx.xer_so = 0;
+            ctx.xer_ov = 0;
+            ctx.xer_ca = 0;
+            ctx.pc += 4;
+        }
+
+        // ═════════════════════════════════════════════════════════════════
+        // mcrfs — move FPSCR field to CR field and clear corresponding
+        // FPSCR exception bits. CR field crfD ← FPSCR[(crfS*4)..(crfS*4+3)]
+        // and then FPSCR bits in that nibble that are exception bits are
+        // cleared (FX, OX, UX, ZX, XX, VXSNAN, VXISI, VXIDI, VXZDZ, VXIMZ,
+        // VXVC, VXSOFT, VXSQRT, VXCVI are cleared; FEX/VX are read-only
+        // summaries and are recomputed later).
+        // ═════════════════════════════════════════════════════════════════
+        PpcOpcode::mcrfs => {
+            let crfd = instr.crfd();
+            let crfs = instr.crfs();
+            let shift = 28 - (crfs as u32 * 4);
+            let nibble = ((ctx.fpscr >> shift) & 0xF) as u8;
+            ctx.cr[crfd] = crate::context::CrField::from_u8(nibble);
+            // Clearable exception bits: 0 (FX), 3 (OX), 4 (UX), 5 (ZX),
+            // 6 (XX), 7 (VXSNAN), 8 (VXISI), 9 (VXIDI), 10 (VXZDZ),
+            // 11 (VXIMZ), 12 (VXVC), 21 (VXSOFT), 22 (VXSQRT), 23 (VXCVI).
+            // (Bit positions are PowerISA MSB-0; here 'FPSCR bit n' means
+            // the bit at (31-n) in our little-endian u32.)
+            const CLEARABLE_MASK: u32 =
+                (1 << 31) | (1 << (31 - 3))  | (1 << (31 - 4))  |
+                (1 << (31 - 5)) | (1 << (31 - 6))  | (1 << (31 - 7))  |
+                (1 << (31 - 8)) | (1 << (31 - 9))  | (1 << (31 - 10)) |
+                (1 << (31 - 11)) | (1 << (31 - 12)) |
+                (1 << (31 - 21)) | (1 << (31 - 22)) | (1 << (31 - 23));
+            let nibble_mask = 0xFu32 << shift;
+            ctx.fpscr &= !(nibble_mask & CLEARABLE_MASK);
+            ctx.pc += 4;
+        }
+
         // Anything not yet implemented
         _ => {
             tracing::warn!("Unimplemented opcode at {:#010x}: {:?} [{:08X}]", ctx.pc, instr.opcode, instr.raw);
@@ -2212,6 +4261,13 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst
     StepResult::Continue
 }
 
+/// Compute an X-form indexed effective address: EA = (rA==0 ? 0 : GPR[rA]) + GPR[rB].
+#[inline]
+fn ea_indexed(ctx: &PpcContext, instr: &DecodedInstr) -> u32 {
+    let a = if instr.ra() == 0 { 0u64 } else { ctx.gpr[instr.ra()] };
+    a.wrapping_add(ctx.gpr[instr.rb()]) as u32
+}
+
 /// Helper for CR logical operations.
 fn cr_logical(ctx: &mut PpcContext, instr: &DecodedInstr, op: fn(bool, bool) -> bool) {
     let a = ctx.get_cr_bit(instr.crba());
@@ -2274,70 +4330,84 @@ fn update_cr6_from_vmask(r: &[u32; 4], ctx: &mut PpcContext) {
 
 /// Round a double to single precision and back (matches xenia's ToSingle).
 #[inline]
-fn to_single(val: f64) -> f64 {
-    val as f32 as f64
+/// Round an f64 to single precision, honouring FPSCR[RN].
+fn to_single(ctx: &PpcContext, val: f64) -> f64 {
+    fpscr::round_to_single(ctx, val)
 }
 
 /// Update CR1 from FPSCR (used when Rc=1 on FPU instructions).
 /// CR1 = FPSCR[FX, FEX, VX, OX] (bits 0-3).
 #[inline]
 fn update_cr1_from_fpscr(ctx: &mut PpcContext) {
-    ctx.cr[1].lt = (ctx.fpscr >> 31) & 1 != 0; // FX
-    ctx.cr[1].gt = (ctx.fpscr >> 30) & 1 != 0; // FEX
-    ctx.cr[1].eq = (ctx.fpscr >> 29) & 1 != 0; // VX
-    ctx.cr[1].so = (ctx.fpscr >> 28) & 1 != 0; // OX
+    fpscr::update_cr1(ctx);
 }
 
 #[cfg(test)]
 mod tests {
     use super::*;
 
-    /// Simple test memory (64KB)
+    /// Simple test memory (64KB). Backed by `Box<[Cell<u8>]>` so the
+    /// MemoryAccess writes can take `&self`.
     struct TestMem {
-        data: Vec<u8>,
+        data: Box<[std::cell::Cell<u8>]>,
     }
 
     impl TestMem {
         fn new() -> Self {
-            Self { data: vec![0; 65536] }
+            Self {
+                data: (0..65536u32).map(|_| std::cell::Cell::new(0)).collect(),
+            }
         }
     }
 
     impl MemoryAccess for TestMem {
-        fn read_u8(&self, addr: u32) -> u8 { self.data[addr as usize] }
+        fn read_u8(&self, addr: u32) -> u8 { self.data[addr as usize].get() }
         fn read_u16(&self, addr: u32) -> u16 {
             let a = addr as usize;
-            u16::from_be_bytes([self.data[a], self.data[a+1]])
+            u16::from_be_bytes([self.data[a].get(), self.data[a+1].get()])
         }
         fn read_u32(&self, addr: u32) -> u32 {
             let a = addr as usize;
-            u32::from_be_bytes([self.data[a], self.data[a+1], self.data[a+2], self.data[a+3]])
+            u32::from_be_bytes([
+                self.data[a].get(), self.data[a+1].get(),
+                self.data[a+2].get(), self.data[a+3].get(),
+            ])
         }
         fn read_u64(&self, addr: u32) -> u64 {
             let a = addr as usize;
             u64::from_be_bytes([
-                self.data[a], self.data[a+1], self.data[a+2], self.data[a+3],
-                self.data[a+4], self.data[a+5], self.data[a+6], self.data[a+7],
+                self.data[a].get(), self.data[a+1].get(),
+                self.data[a+2].get(), self.data[a+3].get(),
+                self.data[a+4].get(), self.data[a+5].get(),
+                self.data[a+6].get(), self.data[a+7].get(),
             ])
         }
-        fn write_u8(&mut self, addr: u32, val: u8) { self.data[addr as usize] = val; }
-        fn write_u16(&mut self, addr: u32, val: u16) {
+        fn write_u8(&self, addr: u32, val: u8) { self.data[addr as usize].set(val); }
+        fn write_u16(&self, addr: u32, val: u16) {
             let a = addr as usize;
-            self.data[a..a+2].copy_from_slice(&val.to_be_bytes());
+            let bytes = val.to_be_bytes();
+            self.data[a].set(bytes[0]);
+            self.data[a+1].set(bytes[1]);
         }
-        fn write_u32(&mut self, addr: u32, val: u32) {
+        fn write_u32(&self, addr: u32, val: u32) {
             let a = addr as usize;
-            self.data[a..a+4].copy_from_slice(&val.to_be_bytes());
+            let bytes = val.to_be_bytes();
+            for (i, b) in bytes.iter().enumerate() {
+                self.data[a+i].set(*b);
+            }
         }
-        fn write_u64(&mut self, addr: u32, val: u64) {
+        fn write_u64(&self, addr: u32, val: u64) {
             let a = addr as usize;
-            self.data[a..a+8].copy_from_slice(&val.to_be_bytes());
+            let bytes = val.to_be_bytes();
+            for (i, b) in bytes.iter().enumerate() {
+                self.data[a+i].set(*b);
+            }
         }
         fn translate(&self, _addr: u32) -> Option<*const u8> { None }
-        fn translate_mut(&mut self, _addr: u32) -> Option<*mut u8> { None }
+        fn translate_mut(&self, _addr: u32) -> Option<*mut u8> { None }
     }
 
-    fn write_instr(mem: &mut TestMem, addr: u32, raw: u32) {
+    fn write_instr(mem: &TestMem, addr: u32, raw: u32) {
         mem.write_u32(addr, raw);
     }
 
@@ -2526,4 +4596,957 @@ mod tests {
         // (2.0 * 5.0) + 3.0 = 13.0
         assert!((ctx.fpr[4] - 13.0).abs() < 1e-10);
     }
+
+    #[test]
+    fn test_ctx_default_state_matches_canary() {
+        let ctx = PpcContext::new();
+        // LR initialized to halt sentinel so a top-level blr drops out cleanly.
+        assert_eq!(ctx.lr, crate::context::LR_HALT_SENTINEL);
+        // VSCR starts with NJ bit set (denormals flush to zero).
+        assert!(ctx.vscr_nj());
+        assert!(!ctx.vscr_sat());
+        // VRSAVE defaults to "save all" per canary.
+        assert_eq!(ctx.vrsave, 0xFFFF_FFFF);
+    }
+
+    #[test]
+    fn test_vaddubs_saturates_and_sets_vscr_sat() {
+        let mut ctx = PpcContext::new();
+        let mut mem = TestMem::new();
+        // Fill vA with 0xF0, vB with 0x20 → 0x110, saturates to 0xFF per lane.
+        ctx.vr[2] = xenia_types::Vec128::from_bytes([0xF0; 16]);
+        ctx.vr[3] = xenia_types::Vec128::from_bytes([0x20; 16]);
+        // vaddubs vD=4, vA=2, vB=3. XO=512 (PPC: opcode 4, VA-form).
+        let raw: u32 = (4u32 << 26) | (4u32 << 21) | (2u32 << 16) | (3u32 << 11) | 512u32;
+        write_instr(&mut mem, 0, raw);
+        ctx.pc = 0;
+        let r = step(&mut ctx, &mut mem);
+        assert_eq!(r, StepResult::Continue);
+        assert_eq!(ctx.vr[4].as_bytes(), [0xFFu8; 16]);
+        assert!(ctx.vscr_sat(), "SAT should be set after saturation");
+    }
+
+    #[test]
+    fn test_ldarx_stdcx_pair() {
+        let mut ctx = PpcContext::new();
+        let mut mem = TestMem::new();
+        mem.write_u64(0x1000, 0xDEADBEEF_CAFEBABE);
+        ctx.gpr[4] = 0x1000;
+        ctx.gpr[5] = 0;
+        // ldarx r3, r4, r5:  (31 << 26) | (3<<21) | (4<<16) | (5<<11) | (84<<1)
+        let raw_ld: u32 = (31u32 << 26) | (3u32 << 21) | (4u32 << 16) | (5u32 << 11) | (84u32 << 1);
+        write_instr(&mut mem, 0, raw_ld);
+        ctx.pc = 0;
+        step(&mut ctx, &mut mem);
+        assert_eq!(ctx.gpr[3], 0xDEADBEEF_CAFEBABE);
+        assert!(ctx.has_reservation);
+        // stdcx. r6, r4, r5: (31 << 26) | (6<<21) | (4<<16) | (5<<11) | (214<<1) | 1
+        ctx.gpr[6] = 0x1111_1111_2222_2222;
+        let raw_st: u32 = (31u32 << 26) | (6u32 << 21) | (4u32 << 16) | (5u32 << 11) | (214u32 << 1) | 1;
+        write_instr(&mut mem, 4, raw_st);
+        step(&mut ctx, &mut mem);
+        assert!(ctx.cr[0].eq, "stdcx. should succeed and set CR0.EQ");
+        assert_eq!(mem.read_u64(0x1000), 0x1111_1111_2222_2222);
+        assert!(!ctx.has_reservation);
+    }
+
+    #[test]
+    fn test_mcrxr_moves_xer_condition_bits_and_clears_them() {
+        let mut ctx = PpcContext::new();
+        let mut mem = TestMem::new();
+        ctx.xer_so = 1;
+        ctx.xer_ov = 0;
+        ctx.xer_ca = 1;
+        // mcrxr crfD=3: (31 << 26) | (3<<23) | (512<<1)
+        let raw: u32 = (31u32 << 26) | (3u32 << 23) | (512u32 << 1);
+        write_instr(&mut mem, 0, raw);
+        ctx.pc = 0;
+        step(&mut ctx, &mut mem);
+        assert!(ctx.cr[3].lt, "LT should mirror old XER[SO]");
+        assert!(!ctx.cr[3].gt, "GT should mirror old XER[OV]");
+        assert!(ctx.cr[3].eq, "EQ should mirror old XER[CA]");
+        assert_eq!(ctx.xer_so, 0);
+        assert_eq!(ctx.xer_ov, 0);
+        assert_eq!(ctx.xer_ca, 0);
+    }
+
+    // ---------- Phase 2 fixes: OE / overflow ----------
+
+    fn addx_raw(rd: u32, ra: u32, rb: u32, oe: bool, rc: bool) -> u32 {
+        (31 << 26) | (rd << 21) | (ra << 16) | (rb << 11)
+            | ((oe as u32) << 10) | (266 << 1) | (rc as u32)
+    }
+
+    #[test]
+    fn addo_sets_xer_ov_on_signed_overflow_and_stickies_so() {
+        let mut ctx = PpcContext::new();
+        let mut mem = TestMem::new();
+        ctx.gpr[3] = i64::MAX as u64;
+        ctx.gpr[4] = 1;
+        write_instr(&mut mem, 0, addx_raw(5, 3, 4, true, false));
+        ctx.pc = 0;
+        step(&mut ctx, &mut mem);
+        assert_eq!(ctx.gpr[5], i64::MIN as u64);
+        assert_eq!(ctx.xer_ov, 1, "OV must be set on signed overflow");
+        assert_eq!(ctx.xer_so, 1, "SO must be stickied from OV");
+    }
+
+    #[test]
+    fn addo_clears_xer_ov_when_no_overflow_but_keeps_sticky_so() {
+        let mut ctx = PpcContext::new();
+        let mut mem = TestMem::new();
+        ctx.xer_ov = 1;  // stale from a previous overflow
+        ctx.xer_so = 1;
+        ctx.gpr[3] = 1;
+        ctx.gpr[4] = 2;
+        write_instr(&mut mem, 0, addx_raw(5, 3, 4, true, false));
+        ctx.pc = 0;
+        step(&mut ctx, &mut mem);
+        assert_eq!(ctx.gpr[5], 3);
+        assert_eq!(ctx.xer_ov, 0, "OV must clear when no overflow");
+        assert_eq!(ctx.xer_so, 1, "SO is sticky; stays set");
+    }
+
+    #[test]
+    fn add_without_oe_does_not_touch_xer() {
+        let mut ctx = PpcContext::new();
+        let mut mem = TestMem::new();
+        ctx.gpr[3] = i64::MAX as u64;
+        ctx.gpr[4] = 1;
+        write_instr(&mut mem, 0, addx_raw(5, 3, 4, false, false));
+        ctx.pc = 0;
+        step(&mut ctx, &mut mem);
+        assert_eq!(ctx.xer_ov, 0);
+        assert_eq!(ctx.xer_so, 0);
+    }
+
+    #[test]
+    fn addx_rc_uses_64bit_compare_not_32bit() {
+        // r3 = 0x0000_0000_FFFF_FFFF, r4 = 0 → result = 0x0000_0000_FFFF_FFFF.
+        // As i32 this is -1 (lt). As i64 this is positive (gt). Spec says 64-bit.
+        let mut ctx = PpcContext::new();
+        let mut mem = TestMem::new();
+        ctx.gpr[3] = 0x0000_0000_FFFF_FFFF;
+        ctx.gpr[4] = 0;
+        write_instr(&mut mem, 0, addx_raw(5, 3, 4, false, true));
+        ctx.pc = 0;
+        step(&mut ctx, &mut mem);
+        assert_eq!(ctx.gpr[5], 0x0000_0000_FFFF_FFFF);
+        assert!(!ctx.cr[0].lt, "64-bit compare: value is positive, not negative");
+        assert!(ctx.cr[0].gt);
+        assert!(!ctx.cr[0].eq);
+    }
+
+    #[test]
+    fn subfo_sets_xer_ov_on_min_minus_one() {
+        let mut ctx = PpcContext::new();
+        let mut mem = TestMem::new();
+        // subfo r5, r3, r4  ->  r5 = r4 - r3
+        // r4 = INT64_MIN, r3 = 1 -> result overflows
+        ctx.gpr[3] = 1;
+        ctx.gpr[4] = i64::MIN as u64;
+        let raw = (31 << 26) | (5 << 21) | (3 << 16) | (4 << 11) | (1 << 10) | (40 << 1);
+        write_instr(&mut mem, 0, raw);
+        ctx.pc = 0;
+        step(&mut ctx, &mut mem);
+        assert_eq!(ctx.xer_ov, 1);
+        assert_eq!(ctx.xer_so, 1);
+    }
+
+    #[test]
+    fn mullwo_sets_xer_ov_when_product_overflows_32_bits() {
+        let mut ctx = PpcContext::new();
+        let mut mem = TestMem::new();
+        // mullwo r5, r3, r4  (XO=235, OE=1)
+        ctx.gpr[3] = i32::MAX as u64;
+        ctx.gpr[4] = 2u64;
+        let raw = (31 << 26) | (5 << 21) | (3 << 16) | (4 << 11) | (1 << 10) | (235 << 1);
+        write_instr(&mut mem, 0, raw);
+        ctx.pc = 0;
+        step(&mut ctx, &mut mem);
+        assert_eq!(ctx.xer_ov, 1);
+        assert_eq!(ctx.xer_so, 1);
+    }
+
+    #[test]
+    fn divwo_sets_xer_ov_on_divide_by_zero() {
+        let mut ctx = PpcContext::new();
+        let mut mem = TestMem::new();
+        // divwo r5, r3, r4  (XO=491, OE=1)
+        ctx.gpr[3] = 10;
+        ctx.gpr[4] = 0;
+        let raw = (31 << 26) | (5 << 21) | (3 << 16) | (4 << 11) | (1 << 10) | (491 << 1);
+        write_instr(&mut mem, 0, raw);
+        ctx.pc = 0;
+        step(&mut ctx, &mut mem);
+        assert_eq!(ctx.xer_ov, 1);
+        assert_eq!(ctx.gpr[5], 0);  // undefined in spec; canary uses 0
+    }
+
+    #[test]
+    fn nego_sets_ov_only_on_int_min() {
+        let mut ctx = PpcContext::new();
+        let mut mem = TestMem::new();
+        // nego r5, r3  (XO=104, OE=1)
+        ctx.gpr[3] = i64::MIN as u64;
+        let raw = (31 << 26) | (5 << 21) | (3 << 16) | (1 << 10) | (104 << 1);
+        write_instr(&mut mem, 0, raw);
+        ctx.pc = 0;
+        step(&mut ctx, &mut mem);
+        assert_eq!(ctx.xer_ov, 1);
+        // -INT_MIN wraps to INT_MIN
+        assert_eq!(ctx.gpr[5], i64::MIN as u64);
+    }
+
+    // ---------- Phase 2 fixes: trap TO-field ----------
+
+    #[test]
+    fn tw_with_to_zero_never_fires() {
+        let mut ctx = PpcContext::new();
+        let mut mem = TestMem::new();
+        ctx.gpr[3] = 5;
+        ctx.gpr[4] = 5;
+        // tw 0, r3, r4  (XO=4). TO in bits 6-10.
+        let raw = (31 << 26) | (0 << 21) | (3 << 16) | (4 << 11) | (4 << 1);
+        write_instr(&mut mem, 0, raw);
+        ctx.pc = 0;
+        let r = step(&mut ctx, &mut mem);
+        assert_eq!(r, StepResult::Continue, "TO=0 must never trap");
+        assert_eq!(ctx.pc, 4);
+    }
+
+    #[test]
+    fn tw_eq_fires_on_equal() {
+        let mut ctx = PpcContext::new();
+        let mut mem = TestMem::new();
+        ctx.gpr[3] = 5;
+        ctx.gpr[4] = 5;
+        // TO=4 (EQ only)
+        let raw = (31 << 26) | (4 << 21) | (3 << 16) | (4 << 11) | (4 << 1);
+        write_instr(&mut mem, 0, raw);
+        ctx.pc = 0;
+        let r = step(&mut ctx, &mut mem);
+        assert_eq!(r, StepResult::Trap);
+    }
+
+    #[test]
+    fn tw_eq_does_not_fire_on_unequal() {
+        let mut ctx = PpcContext::new();
+        let mut mem = TestMem::new();
+        ctx.gpr[3] = 5;
+        ctx.gpr[4] = 7;
+        // TO=4 (EQ only)
+        let raw = (31 << 26) | (4 << 21) | (3 << 16) | (4 << 11) | (4 << 1);
+        write_instr(&mut mem, 0, raw);
+        ctx.pc = 0;
+        let r = step(&mut ctx, &mut mem);
+        assert_eq!(r, StepResult::Continue);
+    }
+
+    #[test]
+    fn twi_compares_low_32_bits_only() {
+        let mut ctx = PpcContext::new();
+        let mut mem = TestMem::new();
+        ctx.gpr[3] = 0xDEAD_BEEF_0000_0005;  // low 32 = 5
+        // twi 4, r3, 5: primary=3, TO=4, RA=3, SI=5
+        let raw = (3 << 26) | (4 << 21) | (3 << 16) | (5u32 & 0xFFFF);
+        write_instr(&mut mem, 0, raw);
+        ctx.pc = 0;
+        let r = step(&mut ctx, &mut mem);
+        assert_eq!(r, StepResult::Trap, "word-width compare matches low 32");
+    }
+
+    // ---------- Phase 2 fixes: mcrfs ----------
+
+    // ---------- Phase 2h: FPU / FPSCR ----------
+
+    #[test]
+    fn fadd_inf_minus_inf_sets_vxisi() {
+        let mut ctx = PpcContext::new();
+        let mut mem = TestMem::new();
+        ctx.fpr[1] = f64::INFINITY;
+        ctx.fpr[2] = f64::NEG_INFINITY;
+        // fadd f3, f1, f2  → inf + (-inf) = VXISI
+        let raw = (63u32 << 26) | (3 << 21) | (1 << 16) | (2 << 11) | (21 << 1);
+        write_instr(&mut mem, 0, raw);
+        ctx.pc = 0;
+        step(&mut ctx, &mut mem);
+        // VXISI bit is PPC bit 8 → mask 1<<23
+        assert_ne!(ctx.fpscr & fpscr::VXISI, 0);
+        // FX sticky is set on any new exception → mask 1<<31
+        assert_ne!(ctx.fpscr & fpscr::FX, 0);
+        // VX summary set → 1<<29
+        assert_ne!(ctx.fpscr & fpscr::VX, 0);
+    }
+
+    #[test]
+    fn fdiv_zero_over_zero_sets_vxzdz() {
+        let mut ctx = PpcContext::new();
+        let mut mem = TestMem::new();
+        ctx.fpr[1] = 0.0;
+        ctx.fpr[2] = 0.0;
+        // fdiv f3, f1, f2  (opcode 63, subop 18)
+        let raw = (63u32 << 26) | (3 << 21) | (1 << 16) | (2 << 11) | (18 << 1);
+        write_instr(&mut mem, 0, raw);
+        ctx.pc = 0;
+        step(&mut ctx, &mut mem);
+        assert_ne!(ctx.fpscr & fpscr::VXZDZ, 0);
+    }
+
+    #[test]
+    fn fdiv_finite_over_zero_sets_zx() {
+        let mut ctx = PpcContext::new();
+        let mut mem = TestMem::new();
+        ctx.fpr[1] = 1.0;
+        ctx.fpr[2] = 0.0;
+        // fdiv f3, f1, f2
+        let raw = (63u32 << 26) | (3 << 21) | (1 << 16) | (2 << 11) | (18 << 1);
+        write_instr(&mut mem, 0, raw);
+        ctx.pc = 0;
+        step(&mut ctx, &mut mem);
+        assert_ne!(ctx.fpscr & fpscr::ZX, 0);
+    }
+
+    #[test]
+    fn fadd_sets_fprf_from_result() {
+        let mut ctx = PpcContext::new();
+        let mut mem = TestMem::new();
+        ctx.fpr[1] = 2.5;
+        ctx.fpr[2] = 3.5;
+        let raw = (63u32 << 26) | (3 << 21) | (1 << 16) | (2 << 11) | (21 << 1);
+        write_instr(&mut mem, 0, raw);
+        ctx.pc = 0;
+        step(&mut ctx, &mut mem);
+        // Result = +6.0 → FPRF = POS_NORMAL = 0b0_0100
+        let fprf = ((ctx.fpscr & fpscr::FPRF_MASK) >> 12) as u8;
+        assert_eq!(fprf, fpscr::fprf::POS_NORMAL);
+    }
+
+    #[test]
+    fn frsp_honours_fpscr_rn_toward_zero() {
+        let mut ctx = PpcContext::new();
+        let mut mem = TestMem::new();
+        // A value whose nearest-even rounding would go up but trunc goes down.
+        // Choose v = 1 + 0x1.00_0001_ * 2^-24-ish so low bit is 1 in the f32 mantissa.
+        let v = f64::from_bits(0x3FF0_0000_0000_0001); // 1.0 + ULP at double
+        ctx.fpr[1] = v;
+        ctx.fpscr = 0x1;  // RN = 01 → toward zero
+        // frsp f3, f1 (opcode 63, subop 12)
+        let raw = (63u32 << 26) | (3 << 21) | (1 << 11) | (12 << 1);
+        write_instr(&mut mem, 0, raw);
+        ctx.pc = 0;
+        step(&mut ctx, &mut mem);
+        // Result rounded toward zero should be exactly 1.0_f64
+        assert_eq!(ctx.fpr[3], 1.0_f64);
+    }
+
+    #[test]
+    fn fcmpu_sets_so_on_nan_and_fprf_unordered() {
+        let mut ctx = PpcContext::new();
+        let mut mem = TestMem::new();
+        ctx.fpr[1] = f64::NAN;
+        ctx.fpr[2] = 1.0;
+        // fcmpu crfD=4, f1, f2 : (63<<26) | (crfd<<23) | (ra<<16) | (rb<<11) | (0<<1)
+        let raw = (63u32 << 26) | (4 << 23) | (1 << 16) | (2 << 11);
+        write_instr(&mut mem, 0, raw);
+        ctx.pc = 0;
+        step(&mut ctx, &mut mem);
+        assert!(ctx.cr[4].so, "unordered → SO set");
+        assert!(!ctx.cr[4].lt && !ctx.cr[4].gt && !ctx.cr[4].eq);
+        // FPRF unordered = 0b0_0001
+        let fprf = ((ctx.fpscr & fpscr::FPRF_MASK) >> 12) as u8;
+        assert_eq!(fprf, 0b0_0001);
+    }
+
+    #[test]
+    fn fcmpo_on_qnan_sets_vxvc() {
+        let mut ctx = PpcContext::new();
+        let mut mem = TestMem::new();
+        ctx.fpr[1] = f64::NAN;  // QNaN (Rust's NAN)
+        ctx.fpr[2] = 1.0;
+        // fcmpo (opcode 63, subop 32)
+        let raw = (63u32 << 26) | (4 << 23) | (1 << 16) | (2 << 11) | (32 << 1);
+        write_instr(&mut mem, 0, raw);
+        ctx.pc = 0;
+        step(&mut ctx, &mut mem);
+        assert_ne!(ctx.fpscr & fpscr::VXVC, 0);
+    }
+
+    // ---------- Phase 2i: VMX NaN propagation ----------
+
+    #[test]
+    fn vmaxfp_propagates_nan() {
+        let mut ctx = PpcContext::new();
+        let mut mem = TestMem::new();
+        let mut a = [1.0f32, 2.0, 3.0, 4.0];
+        let b = [5.0f32, 6.0, 7.0, 8.0];
+        a[1] = f32::NAN;
+        ctx.vr[2] = xenia_types::Vec128::from_f32x4_array(a);
+        ctx.vr[3] = xenia_types::Vec128::from_f32x4_array(b);
+        // vmaxfp vD=4, vA=2, vB=3  (opcode 4, XO=1034)
+        let raw = (4u32 << 26) | (4 << 21) | (2 << 16) | (3 << 11) | 1034;
+        write_instr(&mut mem, 0, raw);
+        ctx.pc = 0;
+        step(&mut ctx, &mut mem);
+        let r = ctx.vr[4].as_f32x4();
+        // lane 1 must be NaN, not 6.0 (the b side)
+        assert!(r[1].is_nan());
+        // Other lanes should pick the max correctly
+        assert_eq!(r[0], 5.0);
+        assert_eq!(r[2], 7.0);
+        assert_eq!(r[3], 8.0);
+    }
+
+    #[test]
+    fn vminfp_propagates_nan() {
+        let mut ctx = PpcContext::new();
+        let mut mem = TestMem::new();
+        let a = [1.0f32, 2.0, 3.0, 4.0];
+        let mut b = [5.0f32, 6.0, 7.0, 8.0];
+        b[2] = f32::NAN;
+        ctx.vr[2] = xenia_types::Vec128::from_f32x4_array(a);
+        ctx.vr[3] = xenia_types::Vec128::from_f32x4_array(b);
+        // vminfp XO=1098
+        let raw = (4u32 << 26) | (4 << 21) | (2 << 16) | (3 << 11) | 1098;
+        write_instr(&mut mem, 0, raw);
+        ctx.pc = 0;
+        step(&mut ctx, &mut mem);
+        let r = ctx.vr[4].as_f32x4();
+        assert!(r[2].is_nan());
+    }
+
+    // ---------- Phase 2j: VMX denorm flush ----------
+
+    #[test]
+    fn vmaddfp_flushes_denormal_inputs() {
+        let mut ctx = PpcContext::new();
+        let mut mem = TestMem::new();
+        // Smallest positive denormal f32 = f32::from_bits(1)
+        let denorm = f32::from_bits(1);
+        let a = [denorm; 4];
+        let b = [0.0f32; 4];
+        let c = [1.0f32; 4];
+        ctx.vr[2] = xenia_types::Vec128::from_f32x4_array(a);
+        ctx.vr[3] = xenia_types::Vec128::from_f32x4_array(b);
+        ctx.vr[4] = xenia_types::Vec128::from_f32x4_array(c);
+        // vmaddfp vD=5, vA=2, vB=3, vC=4 (A-form: opcode 4, XO=46, vC at rc field)
+        // layout: (4<<26) | (5<<21) | (2<<16) | (3<<11) | (4<<6) | 46
+        let raw = (4u32 << 26) | (5 << 21) | (2 << 16) | (3 << 11) | (4 << 6) | 46;
+        write_instr(&mut mem, 0, raw);
+        ctx.pc = 0;
+        step(&mut ctx, &mut mem);
+        let r = ctx.vr[5].as_f32x4();
+        // denorm*1 + 0 should be flushed: denorm→0, so result is 0.
+        assert_eq!(r, [0.0f32; 4]);
+    }
+
+    /// VMX128 variant `vmaddfp128 vD, vA, vB` (primary op 5, key2 = 0b001101)
+    /// reuses vD as the accumulator: `vD <- (vA * vB) + vD`. Canary
+    /// `ppc_emit_altivec.cc:786-810` flushes *all three* inputs
+    /// unconditionally before the fused multiply-add — the 128-bit form
+    /// must match the scalar `vmaddfp` behaviour. Prior to this fix the
+    /// interpreter skipped the flush, leaving subnormal noise in math-
+    /// heavy game code.
+    #[test]
+    fn vmaddfp128_flushes_denormal_inputs() {
+        let mut ctx = PpcContext::new();
+        let mut mem = TestMem::new();
+        let denorm = f32::from_bits(1);
+        // vA=v2 carries denorms, which is also vD's accumulator input.
+        ctx.vr[2] = xenia_types::Vec128::from_f32x4_array([denorm; 4]);
+        // vB=v3 = 1.0 — denormal input survives only if not flushed.
+        ctx.vr[3] = xenia_types::Vec128::from_f32x4_array([1.0f32; 4]);
+        // vmaddfp128 vD=v2, vA=v2, vB=v3: low 5 bits 00010 shared
+        // between vA and vD, vB=3 at PPC bits 16-20, key2=0b001101.
+        let raw: u32 = 0x1440_18D0;
+        write_instr(&mut mem, 0, raw);
+        ctx.pc = 0;
+        step(&mut ctx, &mut mem);
+        // Without flush: denorm*1.0 + denorm = 2*denorm ≠ 0.
+        // With flush:    0*0 + 0 = 0.
+        assert_eq!(ctx.vr[2].as_f32x4(), [0.0f32; 4]);
+    }
+
+    /// VMX128 `vnmsubfp128 vD, vA, vB` (key2 = 0b010101). Canary
+    /// `ppc_emit_altivec.cc:1133-1160` flushes all three inputs in the
+    /// helper. Semantics: `vD <- -((vA * vB) - vD) = vD - vA*vB`.
+    #[test]
+    fn vnmsubfp128_flushes_denormal_inputs() {
+        let mut ctx = PpcContext::new();
+        let mut mem = TestMem::new();
+        let denorm = f32::from_bits(1);
+        ctx.vr[2] = xenia_types::Vec128::from_f32x4_array([denorm; 4]);
+        ctx.vr[3] = xenia_types::Vec128::from_f32x4_array([1.0f32; 4]);
+        // vnmsubfp128 vD=v2, vA=v2, vB=v3: key2 = 0b010101 (21) encoded
+        // via bits 22-25 = 0101 and bit 27 = 1.
+        let raw: u32 = 0x1440_1950;
+        write_instr(&mut mem, 0, raw);
+        ctx.pc = 0;
+        step(&mut ctx, &mut mem);
+        // Without flush: denorm - denorm*1.0 = 0 (but the intermediate
+        // values propagate subnormals through the compute); with flush
+        // everything is 0 cleanly.
+        assert_eq!(ctx.vr[2].as_f32x4(), [0.0f32; 4]);
+    }
+
+    /// VMX128 `vmsum4fp128 vD, vA, vB` computes the 4-lane dot product
+    /// and broadcasts the result. Canary
+    /// `ppc_emit_altivec.cc:1077-1084` flushes the *output* denormal
+    /// (not the inputs). A dot product that sums to a subnormal must
+    /// read back as 0.
+    #[test]
+    fn vmsum4fp128_flushes_denormal_output() {
+        let mut ctx = PpcContext::new();
+        let mut mem = TestMem::new();
+        let denorm = f32::from_bits(1);
+        // Dot product = denorm * 1.0 + 0 + 0 + 0 = denorm.
+        ctx.vr[2] = xenia_types::Vec128::from_f32x4_array([denorm, 0.0, 0.0, 0.0]);
+        ctx.vr[3] = xenia_types::Vec128::from_f32x4_array([1.0f32, 0.0, 0.0, 0.0]);
+        // vmsum4fp128 vD=v2, vA=v2, vB=v3: key2 = 0b011101 (29).
+        let raw: u32 = 0x1440_19D0;
+        write_instr(&mut mem, 0, raw);
+        ctx.pc = 0;
+        step(&mut ctx, &mut mem);
+        // Subnormal output must flush to 0 and broadcast across all lanes.
+        assert_eq!(ctx.vr[2].as_f32x4(), [0.0f32; 4]);
+    }
+
+    // ---------- Phase 2k: lve*x / stve*x element masking ----------
+
+    #[test]
+    fn lvebx_loads_byte_into_ea_slot() {
+        let mut ctx = PpcContext::new();
+        let mut mem = TestMem::new();
+        mem.write_u8(0x1003, 0xAB);
+        ctx.gpr[4] = 0x1003;
+        ctx.gpr[5] = 0;
+        // lvebx v1, r4, r5 : (31<<26) | (1<<21) | (4<<16) | (5<<11) | (7<<1)
+        let raw = (31u32 << 26) | (1 << 21) | (4 << 16) | (5 << 11) | (7 << 1);
+        write_instr(&mut mem, 0, raw);
+        ctx.pc = 0;
+        step(&mut ctx, &mut mem);
+        let bytes = ctx.vr[1].as_bytes();
+        // Byte at slot 3 (EA & 0xF = 3)
+        assert_eq!(bytes[3], 0xAB);
+        // Other bytes zero
+        for i in 0..16 {
+            if i != 3 { assert_eq!(bytes[i], 0, "byte {} should be zero", i); }
+        }
+    }
+
+    #[test]
+    fn stvewx_stores_only_word_slot() {
+        let mut ctx = PpcContext::new();
+        let mut mem = TestMem::new();
+        // Prepare surrounding memory with a sentinel value so we can check non-overwrite.
+        for i in 0..16 {
+            mem.write_u8(0x1000 + i, 0x55);
+        }
+        // vS lanes: (big-endian view) word0=0xDEADBEEF, word1..3=0.
+        let mut src = [0u8; 16];
+        src[0] = 0xDE; src[1] = 0xAD; src[2] = 0xBE; src[3] = 0xEF;
+        ctx.vr[1] = xenia_types::Vec128::from_bytes(src);
+        // EA = 0x1000 (slot 0): store word0 at 0x1000.
+        ctx.gpr[4] = 0x1000;
+        ctx.gpr[5] = 0;
+        // stvewx v1, r4, r5 : (31<<26) | (1<<21) | (4<<16) | (5<<11) | (199<<1)
+        let raw = (31u32 << 26) | (1 << 21) | (4 << 16) | (5 << 11) | (199 << 1);
+        write_instr(&mut mem, 0, raw);
+        ctx.pc = 0;
+        step(&mut ctx, &mut mem);
+        assert_eq!(mem.read_u32(0x1000), 0xDEAD_BEEF);
+        // Adjacent bytes untouched (still 0x55).
+        for i in 4..16 {
+            assert_eq!(mem.data[0x1000 + i as usize].get(), 0x55, "byte {} was overwritten", 0x1000+i);
+        }
+    }
+
+    // ---------- Phase 2l: reservation cache-line granule ----------
+
+    #[test]
+    fn stwcx_succeeds_within_same_cache_line() {
+        let mut ctx = PpcContext::new();
+        let mut mem = TestMem::new();
+        mem.write_u32(0x1004, 0xAAAA_AAAA);
+        ctx.gpr[4] = 0x1004;
+        ctx.gpr[5] = 0;
+        // lwarx r3, r4, r5 : (31<<26)|(3<<21)|(4<<16)|(5<<11)|(20<<1)
+        let ld = (31u32 << 26) | (3 << 21) | (4 << 16) | (5 << 11) | (20 << 1);
+        write_instr(&mut mem, 0, ld);
+        ctx.pc = 0;
+        step(&mut ctx, &mut mem);
+        assert!(ctx.has_reservation);
+        // Now stwcx. to a DIFFERENT address in the SAME cache line (offset within 128 bytes).
+        ctx.gpr[4] = 0x1008; // 4 bytes over; same line.
+        ctx.gpr[6] = 0xBBBB_BBBB;
+        let st = (31u32 << 26) | (6 << 21) | (4 << 16) | (5 << 11) | (150 << 1) | 1;
+        write_instr(&mut mem, 4, st);
+        step(&mut ctx, &mut mem);
+        // Matches cache line → succeeds.
+        assert!(ctx.cr[0].eq);
+        assert_eq!(mem.read_u32(0x1008), 0xBBBB_BBBB);
+    }
+
+    #[test]
+    fn stwcx_fails_across_cache_lines() {
+        let mut ctx = PpcContext::new();
+        let mut mem = TestMem::new();
+        ctx.gpr[4] = 0x1000;
+        ctx.gpr[5] = 0;
+        let ld = (31u32 << 26) | (3 << 21) | (4 << 16) | (5 << 11) | (20 << 1);
+        write_instr(&mut mem, 0, ld);
+        ctx.pc = 0;
+        step(&mut ctx, &mut mem);
+        // Different cache line (0x1080).
+        ctx.gpr[4] = 0x1080;
+        ctx.gpr[6] = 0xCCCC_CCCC;
+        let st = (31u32 << 26) | (6 << 21) | (4 << 16) | (5 << 11) | (150 << 1) | 1;
+        write_instr(&mut mem, 4, st);
+        step(&mut ctx, &mut mem);
+        assert!(!ctx.cr[0].eq, "should fail across cache line");
+        assert_eq!(mem.read_u32(0x1080), 0, "memory not written on failure");
+    }
+
+    // ---------- Phase 2m: SPR DEC + TBL/TBU write ----------
+
+    #[test]
+    fn mfspr_dec_returns_dec_field() {
+        let mut ctx = PpcContext::new();
+        let mut mem = TestMem::new();
+        ctx.dec = 0x1234_5678;
+        // mfspr r3, DEC (22). SPR encoded with halves swapped: (22 & 0x1F)<<5 | (22>>5)&0x1F = 0x2C0 in bits 11..20.
+        // The decoder does the un-swap, so the raw SPR field stores the swapped form.
+        let spr_swapped = ((22u32 & 0x1F) << 5) | ((22u32 >> 5) & 0x1F);
+        let raw = (31u32 << 26) | (3 << 21) | (spr_swapped << 11) | (339 << 1);
+        write_instr(&mut mem, 0, raw);
+        ctx.pc = 0;
+        step(&mut ctx, &mut mem);
+        assert_eq!(ctx.gpr[3], 0x1234_5678);
+    }
+
+    #[test]
+    fn mtspr_tbl_write_updates_low_half() {
+        let mut ctx = PpcContext::new();
+        let mut mem = TestMem::new();
+        ctx.timebase = 0xAAAA_BBBB_CCCC_DDDD;
+        ctx.gpr[3] = 0x1111_2222;
+        // mtspr TBL_WRITE (284), r3
+        let spr_swapped = ((284u32 & 0x1F) << 5) | ((284u32 >> 5) & 0x1F);
+        let raw = (31u32 << 26) | (3 << 21) | (spr_swapped << 11) | (467 << 1);
+        write_instr(&mut mem, 0, raw);
+        ctx.pc = 0;
+        step(&mut ctx, &mut mem);
+        // step() post-increments timebase by 1, so the observable low half is value+1.
+        assert_eq!(ctx.timebase & 0xFFFF_FFFF, 0x1111_2222u64 + 1);
+        assert_eq!(ctx.timebase >> 32, 0xAAAA_BBBB);
+    }
+
+    // ---------- Block-cache parity tests ----------
+    //
+    // These confirm that running a program through the basic-block
+    // cache (crate::block_cache::BlockCache + step_block) produces a
+    // bit-identical PpcContext to running it through step_cached
+    // (per-instruction). If this ever fails the block cache is not
+    // safe to engage in production.
+
+    fn enc_addi_t(rd: u32, ra: u32, simm: i16) -> u32 {
+        (14 << 26) | (rd << 21) | (ra << 16) | (simm as u16 as u32)
+    }
+    fn enc_lwz_t(rd: u32, ra: u32, d: i16) -> u32 {
+        (32 << 26) | (rd << 21) | (ra << 16) | (d as u16 as u32)
+    }
+    fn enc_stw_t(rs: u32, ra: u32, d: i16) -> u32 {
+        (36 << 26) | (rs << 21) | (ra << 16) | (d as u16 as u32)
+    }
+    fn enc_b_t(li_words: i32) -> u32 {
+        // Branch: opcode 18, AA=0, LK=0, LI = li_words << 2 (signed).
+        let li = (li_words as u32) & 0x00FF_FFFF;
+        (18u32 << 26) | (li << 2)
+    }
+
+    /// Snapshot of the parts of `PpcContext` that block_matches_per_instr
+    /// is asked to keep identical between dispatch paths. Comparing the
+    /// whole struct is impractical (vector regs, fp regs, large arrays);
+    /// the GPR file + pc + lr + cr + cycle counters cover everything the
+    /// interpreter touches in the test programs below.
+    #[derive(Debug, PartialEq, Eq)]
+    struct CtxSnap {
+        gpr: [u64; 32],
+        pc: u32,
+        lr: u64,
+        ctr: u64,
+        cycle_count: u64,
+        cr_packed: u32,
+    }
+    impl CtxSnap {
+        fn from(ctx: &PpcContext) -> Self {
+            Self {
+                gpr: ctx.gpr,
+                pc: ctx.pc,
+                lr: ctx.lr,
+                ctr: ctx.ctr,
+                cycle_count: ctx.cycle_count,
+                cr_packed: ctx.cr(),
+            }
+        }
+    }
+
+    fn run_per_instruction(prog: &[u32], iters: u32, init_gpr: &[(usize, u64)]) -> CtxSnap {
+        let mut ctx = PpcContext::new();
+        for &(i, v) in init_gpr {
+            ctx.gpr[i] = v;
+        }
+        let mut mem = TestMem::new();
+        for (i, &raw) in prog.iter().enumerate() {
+            write_instr(&mut mem, (i as u32) * 4, raw);
+        }
+        let mut cache = crate::decoder::DecodeCache::new();
+        ctx.pc = 0;
+        for _ in 0..iters {
+            // Run one instruction at a time. Memory has constant
+            // page_version (default trait impl returns 1) so the cache
+            // entries stay valid forever.
+            let r = step_cached(&mut ctx, &mut mem, &mut cache, 1);
+            assert!(matches!(r, StepResult::Continue));
+        }
+        CtxSnap::from(&ctx)
+    }
+
+    fn run_block(prog: &[u32], iters: u32, init_gpr: &[(usize, u64)]) -> CtxSnap {
+        let mut ctx = PpcContext::new();
+        for &(i, v) in init_gpr {
+            ctx.gpr[i] = v;
+        }
+        let mut mem = TestMem::new();
+        for (i, &raw) in prog.iter().enumerate() {
+            write_instr(&mut mem, (i as u32) * 4, raw);
+        }
+        let mut bc = crate::block_cache::BlockCache::new();
+        ctx.pc = 0;
+        let mut total_steps = 0u32;
+        // Iterate by *blocks* until we've covered at least `iters`
+        // instructions. The block path runs N instructions per call
+        // where N is the block length; we still want to compare on a
+        // per-instruction footing, so accumulate cycle_count.
+        while total_steps < iters {
+            // Borrow bc only long enough to copy the slice we need —
+            // step_block needs &mut MemoryAccess so we can't hold a
+            // shared borrow on bc across the call.
+            let block_ptr: *const crate::block_cache::DecodedBlock = {
+                let b: &crate::block_cache::DecodedBlock = bc.lookup_or_build(ctx.pc, &mem);
+                b
+            };
+            // Safety: the BlockCache::lookup_or_build contract is that
+            // the returned reference stays valid until the next
+            // lookup_or_build on the same cache. We don't call
+            // lookup_or_build inside step_block and we drop the raw
+            // pointer at the end of the iteration, so no aliasing.
+            let block: &crate::block_cache::DecodedBlock = unsafe { &*block_ptr };
+            let n_before = ctx.cycle_count;
+            let r = step_block(&mut ctx, &mut mem, block);
+            assert!(matches!(r, StepResult::Continue));
+            let stepped = (ctx.cycle_count - n_before) as u32;
+            total_steps += stepped;
+        }
+        CtxSnap::from(&ctx)
+    }
+
+    #[test]
+    fn block_dispatch_matches_per_instruction_alu_loop() {
+        // 4-instruction loop: r3 += 1, r3 += 2, r3 += 3, b -12 (back to start).
+        let prog = [
+            enc_addi_t(3, 3, 1),
+            enc_addi_t(3, 3, 2),
+            enc_addi_t(3, 3, 3),
+            enc_b_t(-3), // -3 words → back to instr 0
+        ];
+        let init = [(3usize, 0u64)];
+        let snap_a = run_per_instruction(&prog, 100, &init);
+        let snap_b = run_block(&prog, 100, &init);
+        assert_eq!(snap_a, snap_b);
+    }
+
+    #[test]
+    fn block_dispatch_matches_per_instruction_loadstore_loop() {
+        // r4 = 0x800 (data pointer), r3 = 1
+        // loop:
+        //   stw  r3, 0(r4)
+        //   lwz  r5, 0(r4)
+        //   addi r3, r5, 1
+        //   b -12
+        let prog = [
+            enc_stw_t(3, 4, 0),
+            enc_lwz_t(5, 4, 0),
+            enc_addi_t(3, 5, 1),
+            enc_b_t(-3),
+        ];
+        let init = [(3usize, 1u64), (4usize, 0x800u64)];
+        let snap_a = run_per_instruction(&prog, 200, &init);
+        let snap_b = run_block(&prog, 200, &init);
+        assert_eq!(snap_a, snap_b);
+    }
+
+    #[test]
+    fn mcrfs_moves_fpscr_nibble_and_clears_exception_bits() {
+        let mut ctx = PpcContext::new();
+        let mut mem = TestMem::new();
+        // Set FPSCR bit 0 (FX) = 1 and bit 3 (OX) = 1. In our layout:
+        //   FX at (31-0) = 31
+        //   OX at (31-3) = 28
+        ctx.fpscr = (1u32 << 31) | (1u32 << 28);
+        // mcrfs crfD=2, crfS=0:  (63 << 26) | (crfD<<23) | (crfS<<18) | (64<<1)
+        let raw = (63 << 26) | (2 << 23) | (0 << 18) | (64 << 1);
+        write_instr(&mut mem, 0, raw);
+        ctx.pc = 0;
+        step(&mut ctx, &mut mem);
+        // FPSCR bits 0..3 of crfS=0 → ctx.cr[2] should have FX(lt)=1 and OX(so)=0
+        //                                 and the FEX/VX nibble positions are bits 1,2
+        // Nibble contents: FX=1, FEX=0, VX=0, OX=1 → 0b1001 = 9
+        assert_eq!(ctx.cr[2].as_u8(), 0b1001);
+        // FX and OX are clearable → FPSCR now has those nibble bits cleared
+        assert_eq!(ctx.fpscr & (1 << 31), 0, "FX cleared");
+        assert_eq!(ctx.fpscr & (1 << 28), 0, "OX cleared");
+    }
+
+    /// Regression: `subfze` is `RT ← !RA + CA` (no -1 term), so 64-bit
+    /// carry-out only happens when `RA == 0 && CA == 1`. The previous
+    /// predicate (`!ra != 0 || ca != 0`) was copy-pasted from `subfme`
+    /// and reported CA=1 in nearly every case.
+    #[test]
+    fn test_subfze_carry_only_when_ra_zero_and_ca_one() {
+        // subfze rD, rA: opcode 31, XO=200 (bits 22-30), OE=0, Rc=0.
+        // Encoding: (31<<26) | (rd<<21) | (ra<<16) | (200<<1)
+        let raw = (31u32 << 26) | (3 << 21) | (4 << 16) | (200 << 1);
+
+        // Case 1: ra=0, ca=1 → CA=1 (the only carry case)
+        {
+            let mut ctx = PpcContext::new();
+            let mem = TestMem::new();
+            write_instr(&mem, 0, raw);
+            ctx.pc = 0;
+            ctx.gpr[4] = 0;
+            ctx.xer_ca = 1;
+            step(&mut ctx, &mem);
+            assert_eq!(ctx.xer_ca, 1, "ra=0, ca=1 should produce CA=1");
+            assert_eq!(ctx.gpr[3], 0, "result = !0 + 1 = 0 (wraps)");
+        }
+        // Case 2: ra=0, ca=0 → CA=0  (old buggy code reported CA=1)
+        {
+            let mut ctx = PpcContext::new();
+            let mem = TestMem::new();
+            write_instr(&mem, 0, raw);
+            ctx.pc = 0;
+            ctx.gpr[4] = 0;
+            ctx.xer_ca = 0;
+            step(&mut ctx, &mem);
+            assert_eq!(ctx.xer_ca, 0, "ra=0, ca=0 should produce CA=0");
+            assert_eq!(ctx.gpr[3], u64::MAX, "result = !0 + 0 = u64::MAX");
+        }
+        // Case 3: ra=1, ca=0 → CA=0  (old buggy code reported CA=1)
+        {
+            let mut ctx = PpcContext::new();
+            let mem = TestMem::new();
+            write_instr(&mem, 0, raw);
+            ctx.pc = 0;
+            ctx.gpr[4] = 1;
+            ctx.xer_ca = 0;
+            step(&mut ctx, &mem);
+            assert_eq!(ctx.xer_ca, 0, "ra=1, ca=0 should produce CA=0");
+            assert_eq!(ctx.gpr[3], u64::MAX - 1, "result = !1 + 0 = u64::MAX - 1");
+        }
+        // Case 4: ra=u64::MAX, ca=0 → CA=0  (old buggy code reported CA=1
+        // because !ra == 0 only here, which the buggy `!ra != 0` predicate
+        // happened to handle right; flip ca=1 to exercise the other arm)
+        {
+            let mut ctx = PpcContext::new();
+            let mem = TestMem::new();
+            write_instr(&mem, 0, raw);
+            ctx.pc = 0;
+            ctx.gpr[4] = u64::MAX;
+            ctx.xer_ca = 1;
+            step(&mut ctx, &mem);
+            assert_eq!(ctx.xer_ca, 0, "ra=u64::MAX, ca=1 should produce CA=0");
+            assert_eq!(ctx.gpr[3], 1, "result = !u64::MAX + 1 = 1");
+        }
+    }
+
+    /// Regression: `cmp` (L=1) must not derive LT/GT from the sign of a
+    /// (potentially overflowing) 64-bit subtract. The old code used
+    /// `update_cr_signed(bf, ra.wrapping_sub(rb))` which mis-signed the
+    /// result for boundary i64 values like `ra=i64::MIN, rb=1`.
+    #[test]
+    fn test_cmp_signed_at_i64_boundaries() {
+        // cmp BF=0, L=1, RA, RB: (31<<26) | (1<<21) | (ra<<16) | (rb<<11)
+        // (XO=0; Rc field is reserved on cmp, leave 0)
+        let raw = |ra: u32, rb: u32| (31u32 << 26) | (1 << 21) | (ra << 16) | (rb << 11);
+
+        // i64::MIN < 1 → LT must be set
+        {
+            let mut ctx = PpcContext::new();
+            let mem = TestMem::new();
+            write_instr(&mem, 0, raw(3, 4));
+            ctx.pc = 0;
+            ctx.gpr[3] = i64::MIN as u64;
+            ctx.gpr[4] = 1;
+            step(&mut ctx, &mem);
+            assert!(ctx.cr[0].lt, "i64::MIN < 1 must be LT");
+            assert!(!ctx.cr[0].gt);
+            assert!(!ctx.cr[0].eq);
+        }
+        // i64::MAX > -1 → GT must be set (the symmetric overflow corner)
+        {
+            let mut ctx = PpcContext::new();
+            let mem = TestMem::new();
+            write_instr(&mem, 0, raw(3, 4));
+            ctx.pc = 0;
+            ctx.gpr[3] = i64::MAX as u64;
+            ctx.gpr[4] = (-1i64) as u64;
+            step(&mut ctx, &mem);
+            assert!(!ctx.cr[0].lt);
+            assert!(ctx.cr[0].gt, "i64::MAX > -1 must be GT");
+            assert!(!ctx.cr[0].eq);
+        }
+        // Equal at the extreme is still EQ
+        {
+            let mut ctx = PpcContext::new();
+            let mem = TestMem::new();
+            write_instr(&mem, 0, raw(3, 4));
+            ctx.pc = 0;
+            ctx.gpr[3] = i64::MIN as u64;
+            ctx.gpr[4] = i64::MIN as u64;
+            step(&mut ctx, &mem);
+            assert!(!ctx.cr[0].lt);
+            assert!(!ctx.cr[0].gt);
+            assert!(ctx.cr[0].eq, "i64::MIN == i64::MIN must be EQ");
+        }
+    }
+
+    /// Regression: `lvebx` must preserve the prior contents of the
+    /// destination VR for lanes other than the loaded byte. Previously
+    /// the handler started from a zeroed buffer.
+    #[test]
+    fn test_lvebx_preserves_other_lanes() {
+        let mut ctx = PpcContext::new();
+        let mem = TestMem::new();
+        // Pre-seed vr[3] with a recognizable pattern.
+        let pattern: [u8; 16] = [
+            0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+            0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
+        ];
+        ctx.vr[3] = xenia_types::Vec128::from_bytes(pattern);
+        // Place a byte at memory address 0x1004; EA & 0xF == 4 → slot 4.
+        mem.write_u8(0x1004, 0xAB);
+        // r4 = 0x1000, r5 = 4 → EA = 0x1004
+        ctx.gpr[4] = 0x1000;
+        ctx.gpr[5] = 4;
+        // lvebx vD=3, rA=4, rB=5: opcode 31, XO=7 → (31<<26)|(3<<21)|(4<<16)|(5<<11)|(7<<1)
+        let raw = (31u32 << 26) | (3 << 21) | (4 << 16) | (5 << 11) | (7 << 1);
+        write_instr(&mem, 0, raw);
+        ctx.pc = 0;
+        step(&mut ctx, &mem);
+        // Expected: lane 4 holds 0xAB, every other lane unchanged.
+        let mut expected = pattern;
+        expected[4] = 0xAB;
+        assert_eq!(ctx.vr[3].as_bytes(), expected);
+    }
 }
diff --git a/crates/xenia-cpu/src/lib.rs b/crates/xenia-cpu/src/lib.rs
index b84cb73..576a79e 100644
--- a/crates/xenia-cpu/src/lib.rs
+++ b/crates/xenia-cpu/src/lib.rs
@@ -1,9 +1,25 @@
+pub mod block_cache;
 pub mod context;
 pub mod decoder;
 pub mod disasm;
+pub mod fpscr;
 pub mod interpreter;
 pub mod opcode;
+pub mod overflow;
+pub mod phaser;
+pub mod reservation;
+pub mod scheduler;
+pub mod trap;
+pub mod vmx;
 
 pub use context::PpcContext;
 pub use decoder::decode;
+pub use disasm::{DisasmItem, DisasmText, disassemble, format as disasm_format, iter_disasm};
 pub use opcode::PpcOpcode;
+pub use phaser::{Phaser, PhaserOutcome};
+pub use reservation::ReservationTable;
+pub use scheduler::{
+    BlockReason, GuestThread, HwSlot, HwState, MigrationFixup, OrderMode, PcrWriter, RoundOutcome,
+    Scheduler, SpawnError, SpawnParams, ThreadRef, HW_THREAD_COUNT, INITIAL_GUEST_TID,
+    QUANTUM_DEFAULT,
+};
diff --git a/crates/xenia-cpu/src/opcode.rs b/crates/xenia-cpu/src/opcode.rs
index 01fb77c..e3361ec 100644
--- a/crates/xenia-cpu/src/opcode.rs
+++ b/crates/xenia-cpu/src/opcode.rs
@@ -145,6 +145,33 @@ impl PpcOpcode {
         matches!(self, Self::sc)
     }
 
+    /// Returns true if this opcode unconditionally ends a basic block:
+    /// any branch, system call, trap, or `Invalid` (decoder couldn't
+    /// recognize the instruction — execution will hit the
+    /// `Unimplemented` arm and we don't want to swallow the boundary
+    /// inside a cached block).
+    ///
+    /// Notably *not* terminating: `mtmsr`/`mtmsrd`/`isync`/`mfmsr`.
+    /// On real hardware these have synchronization semantics (a context
+    /// synchronizing event for `isync`, MSR rewrite for the `mt*`s) but
+    /// our interpreter has no asynchronous-exception model and no
+    /// out-of-order execution — they execute as plain ALU/move ops and
+    /// don't change control flow synchronously. Block-cache replay is
+    /// still bit-for-bit identical to per-instruction dispatch for
+    /// those.
+    ///
+    /// Used by the basic-block cache (`block_cache.rs`) to know when to
+    /// stop accumulating instructions during a forward decode walk.
+    pub fn terminates_block(&self) -> bool {
+        matches!(
+            self,
+            Self::bx | Self::bcx | Self::bclrx | Self::bcctrx
+                | Self::sc
+                | Self::td | Self::tdi | Self::tw | Self::twi
+                | Self::Invalid
+        )
+    }
+
     /// Returns true if this is a load instruction.
     pub fn is_load(&self) -> bool {
         matches!(self,
@@ -194,3 +221,60 @@ impl std::fmt::Display for PpcOpcode {
         std::fmt::Debug::fmt(self, f)
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn terminates_block_includes_all_branches() {
+        assert!(PpcOpcode::bx.terminates_block());
+        assert!(PpcOpcode::bcx.terminates_block());
+        assert!(PpcOpcode::bclrx.terminates_block());
+        assert!(PpcOpcode::bcctrx.terminates_block());
+    }
+
+    #[test]
+    fn terminates_block_includes_sc_and_traps() {
+        assert!(PpcOpcode::sc.terminates_block());
+        assert!(PpcOpcode::td.terminates_block());
+        assert!(PpcOpcode::tdi.terminates_block());
+        assert!(PpcOpcode::tw.terminates_block());
+        assert!(PpcOpcode::twi.terminates_block());
+    }
+
+    #[test]
+    fn terminates_block_includes_invalid() {
+        // Decoder failure must end the block — otherwise an unknown
+        // opcode would be replayed inside a cached block without going
+        // through the per-instruction Unimplemented path.
+        assert!(PpcOpcode::Invalid.terminates_block());
+    }
+
+    #[test]
+    fn terminates_block_excludes_straight_line_ops() {
+        // Common ALU and load/store ops must NOT terminate a block.
+        assert!(!PpcOpcode::addi.terminates_block());
+        assert!(!PpcOpcode::addis.terminates_block());
+        assert!(!PpcOpcode::addx.terminates_block());
+        assert!(!PpcOpcode::cmpi.terminates_block());
+        assert!(!PpcOpcode::cmp.terminates_block());
+        assert!(!PpcOpcode::lwz.terminates_block());
+        assert!(!PpcOpcode::stw.terminates_block());
+        assert!(!PpcOpcode::lbzx.terminates_block());
+        assert!(!PpcOpcode::ori.terminates_block());
+        assert!(!PpcOpcode::oris.terminates_block());
+        assert!(!PpcOpcode::rlwinmx.terminates_block());
+    }
+
+    #[test]
+    fn terminates_block_excludes_msr_and_sync_ops() {
+        // Documented decision: synchronizing ops execute as ALU within
+        // a block since the interpreter has no async-exception model.
+        assert!(!PpcOpcode::mtmsr.terminates_block());
+        assert!(!PpcOpcode::mtmsrd.terminates_block());
+        assert!(!PpcOpcode::isync.terminates_block());
+        assert!(!PpcOpcode::sync.terminates_block());
+        assert!(!PpcOpcode::mfmsr.terminates_block());
+    }
+}
diff --git a/crates/xenia-cpu/src/overflow.rs b/crates/xenia-cpu/src/overflow.rs
new file mode 100644
index 0000000..a55f505
--- /dev/null
+++ b/crates/xenia-cpu/src/overflow.rs
@@ -0,0 +1,173 @@
+//! OE / XER[OV] / XER[SO] handling for integer arithmetic.
+//!
+//! PPC integer ops with the OE bit set update XER[OV] (overflow) and sticky-set
+//! XER[SO]. When OE is clear the instruction leaves XER untouched. Signed
+//! overflow is predicated on the operation width and operand signs per the
+//! PowerISA pseudocode. For 32-bit-word operations (`addw`, `mullw`, `divw`,
+//! `neg`, etc. — on PPC these all have `w` in the mnemonic in spec
+//! descriptions even when the assembler spells them without) the predicate
+//! uses the low 32 bits. For 64-bit operations (`add`, `mulld`, `divd`) the
+//! predicate uses the full 64 bits.
+
+use crate::context::PpcContext;
+
+#[inline]
+pub fn apply(ctx: &mut PpcContext, overflowed: bool) {
+    if overflowed {
+        ctx.xer_ov = 1;
+        ctx.xer_so = 1;
+    } else {
+        ctx.xer_ov = 0;
+    }
+}
+
+/// Signed addition overflow at width-64 (plain `add`, `addc`, `subf`, `subfc`).
+///
+/// Predicate: same-sign inputs with opposite-sign result.
+/// For sub callers, rewrite as `a + b'` first (see `_sub`).
+#[inline]
+pub fn add_ov_64(a: u64, b: u64, result: u64) -> bool {
+    ((!(a ^ b)) & (a ^ result)) >> 63 != 0
+}
+
+/// Universal signed-overflow predicate for 64-bit arithmetic.
+///
+/// Caller computes the mathematical (infinite-precision) signed sum as i128,
+/// plus the stored 64-bit result. Overflow iff the two disagree — i.e. the
+/// true value doesn't fit in i64.
+///
+/// Use this for multi-term chains (`adde`, `addme`, `addze`, `subfe`, `subfme`,
+/// `subfze`) where the carry-in makes the bit-predicate above awkward.
+#[inline]
+pub fn sum_overflow_64(true_sum: i128, result: u64) -> bool {
+    true_sum != (result as i64) as i128
+}
+
+/// Signed subtraction: RT = b - a. Overflow iff opposite-sign inputs with
+/// result sign != b's sign. Equivalently, reduce to addition with `!a + 1`.
+#[inline]
+pub fn sub_ov_64(a: u64, b: u64, result: u64) -> bool {
+    ((a ^ b) & (b ^ result)) >> 63 != 0
+}
+
+/// Signed `addc`/`adde` chain overflow. Same rule as `add_ov_64` — the carry
+/// in doesn't alter the sign predicate directly because it's already folded
+/// into the stored result.
+#[inline]
+pub fn adde_ov_64(a: u64, b: u64, result: u64) -> bool {
+    add_ov_64(a, b, result)
+}
+
+/// Signed 32-bit multiply overflow (`mullwo`): result fits in 32 bits signed
+/// iff bit 32 equals bits 33..63 of the 64-bit product.
+#[inline]
+pub fn mullw_ov(product: i64) -> bool {
+    let lo = product as i32 as i64;
+    lo != product
+}
+
+/// Signed 64-bit multiply overflow (`mulldo`). Detected via checked_mul.
+#[inline]
+pub fn mulld_ov(a: i64, b: i64) -> bool {
+    a.checked_mul(b).is_none()
+}
+
+/// `divwo` / `divwuo` / `divdo` / `divduo` raise OV in two cases:
+///   * divisor is zero, or
+///   * signed division of `INT_MIN / -1` (quotient doesn't fit).
+#[inline]
+pub fn divw_ov_signed(ra: i32, rb: i32) -> bool {
+    rb == 0 || (ra == i32::MIN && rb == -1)
+}
+
+#[inline]
+pub fn divw_ov_unsigned(rb: u32) -> bool {
+    rb == 0
+}
+
+#[inline]
+pub fn divd_ov_signed(ra: i64, rb: i64) -> bool {
+    rb == 0 || (ra == i64::MIN && rb == -1)
+}
+
+#[inline]
+pub fn divd_ov_unsigned(rb: u64) -> bool {
+    rb == 0
+}
+
+/// `negx`: RT = -(RA). Overflow only when RA = INT_MIN (the negation doesn't fit).
+#[inline]
+pub fn neg_ov_64(ra: u64) -> bool {
+    ra == 0x8000_0000_0000_0000
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn add_no_overflow() {
+        assert!(!add_ov_64(1, 2, 3));
+        assert!(!add_ov_64(u64::MAX, 0, u64::MAX));
+    }
+
+    #[test]
+    fn add_positive_overflow() {
+        // INT64_MAX + 1 = INT64_MIN — signed overflow
+        let a = i64::MAX as u64;
+        let b = 1u64;
+        let r = a.wrapping_add(b);
+        assert!(add_ov_64(a, b, r));
+    }
+
+    #[test]
+    fn add_negative_overflow() {
+        // INT64_MIN + -1 = INT64_MAX — signed overflow
+        let a = i64::MIN as u64;
+        let b = (-1i64) as u64;
+        let r = a.wrapping_add(b);
+        assert!(add_ov_64(a, b, r));
+    }
+
+    #[test]
+    fn sub_overflow_min_minus_pos() {
+        // INT64_MIN - 1 overflows
+        let b = i64::MIN as u64;
+        let a = 1u64;
+        let r = b.wrapping_sub(a);
+        assert!(sub_ov_64(a, b, r));
+    }
+
+    #[test]
+    fn sub_no_overflow() {
+        let b = 5u64;
+        let a = 2u64;
+        let r = b.wrapping_sub(a);
+        assert!(!sub_ov_64(a, b, r));
+    }
+
+    #[test]
+    fn mullw_fits_32_bits() {
+        assert!(!mullw_ov((i32::MAX as i64) * 1));
+        assert!(!mullw_ov(-1i64));
+    }
+
+    #[test]
+    fn mullw_overflows_32_bits() {
+        let p = (i32::MAX as i64) * 2;
+        assert!(mullw_ov(p));
+    }
+
+    #[test]
+    fn mulld_overflows() {
+        assert!(mulld_ov(i64::MAX, 2));
+        assert!(!mulld_ov(i64::MAX, 1));
+    }
+
+    #[test]
+    fn neg_ov_only_at_min() {
+        assert!(neg_ov_64(i64::MIN as u64));
+        assert!(!neg_ov_64(0));
+        assert!(!neg_ov_64(1));
+    }
+}
diff --git a/crates/xenia-cpu/src/phaser.rs b/crates/xenia-cpu/src/phaser.rs
new file mode 100644
index 0000000..e9d2fcf
--- /dev/null
+++ b/crates/xenia-cpu/src/phaser.rs
@@ -0,0 +1,345 @@
+//! Quantum-boundary phaser for the M3 per-HW-thread parallel scheduler.
+//!
+//! Six [`super::HW_THREAD_COUNT`] host threads run their slots' interpreters
+//! in parallel, then meet at a phaser to advance to the next quantum. This
+//! is **not** [`std::sync::Barrier`]: a Barrier needs a fixed party count,
+//! but our slots can become idle (no runnable thread) and shouldn't block
+//! the phaser arrival.
+//!
+//! ## Semantics
+//!
+//! - Each slot at the end of its quantum either calls
+//!   [`Phaser::arrive_and_wait`] (it has a runnable thread to run next
+//!   quantum) or [`Phaser::skip`] (it's idle this round and will wake on
+//!   `slot_wake[i]`).
+//! - The phase advances when **all 6 slots have either arrived or
+//!   skipped**. Arrived slots block until the advance; skipped slots
+//!   return immediately and re-poll their wake state.
+//! - The phaser uses a generation counter so a slot that arrives "early"
+//!   in the next phase doesn't see the prior phase's "all arrived"
+//!   condition.
+//! - Defensive timeout: [`Phaser::arrive_and_wait_timeout`] returns
+//!   [`PhaserOutcome::Timeout`] if a peer crashes / hangs. Callers
+//!   typically convert this into a graceful shutdown rather than
+//!   panicking, so the rest of the topology can tear down cleanly.
+//!
+//! ## Memory ordering
+//!
+//! - The participant counter (`arrived` + `skipped`) uses `AcqRel` on
+//!   the increment so the last-to-arrive thread sees a consistent
+//!   "everyone is here" snapshot.
+//! - The generation `phase` is read with `Acquire` in arrivers' wait
+//!   loops; the advancing thread stores with `Release` after bumping.
+//! - The condvar's broadcast publishes the phase; the wait loop
+//!   re-checks `phase` against its captured value to defend against
+//!   spurious wakeups.
+
+use std::sync::atomic::{AtomicU32, Ordering};
+use std::sync::{Condvar, Mutex};
+use std::time::{Duration, Instant};
+
+/// Outcome of a phaser arrival.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum PhaserOutcome {
+    /// All participants arrived/skipped — phase advanced. Caller proceeds
+    /// into the next quantum.
+    Advanced,
+    /// Defensive timeout fired before all peers arrived. Caller should
+    /// log + initiate shutdown rather than retry.
+    Timeout,
+    /// Phaser was shut down via [`Phaser::shutdown`]; all waiters are
+    /// woken and return this. Caller exits cleanly.
+    Shutdown,
+}
+
+/// Custom barrier-with-skip primitive. Construct once with the number of
+/// participating slots; share via `Arc` across host threads.
+pub struct Phaser {
+    /// Total participant count (constant after construction). For our
+    /// scheduler this is `HW_THREAD_COUNT = 6`.
+    party_count: u32,
+    /// Monotonic phase counter, incremented every time the phase
+    /// advances. Used as a generation marker so a slot that wakes "into"
+    /// the next phase doesn't observe the old "everyone arrived" state.
+    phase: AtomicU32,
+    /// Inner state guarded by the condvar's mutex.
+    inner: Mutex<Inner>,
+    /// Notified when a phase advances or shutdown fires.
+    cv: Condvar,
+}
+
+#[derive(Debug)]
+struct Inner {
+    arrived_or_skipped: u32,
+    shutdown: bool,
+}
+
+impl Phaser {
+    /// Create a phaser with `party_count` participants. Panics if
+    /// `party_count == 0`.
+    pub fn new(party_count: u32) -> Self {
+        assert!(party_count > 0, "phaser party_count must be > 0");
+        Self {
+            party_count,
+            phase: AtomicU32::new(0),
+            inner: Mutex::new(Inner {
+                arrived_or_skipped: 0,
+                shutdown: false,
+            }),
+            cv: Condvar::new(),
+        }
+    }
+
+    /// Get the current phase number. Useful for tests and observability.
+    pub fn current_phase(&self) -> u32 {
+        self.phase.load(Ordering::Acquire)
+    }
+
+    /// Mark this slot as not participating in the current phase. Counts
+    /// toward the advance threshold but does not block. Used when a slot
+    /// has no runnable thread and is parked waiting on
+    /// `slot_wake[i].unpark()`.
+    ///
+    /// `_slot_id` is informational (not stored); the parameter exists so
+    /// call sites stay greppable.
+    pub fn skip(&self, _slot_id: u8) {
+        self.contribute_advance();
+    }
+
+    /// Block until the phase advances or the defensive 5-second timeout
+    /// fires. Returns [`PhaserOutcome::Advanced`] on a clean phase
+    /// transition; [`Timeout`] if a peer hung; [`Shutdown`] on tear-down.
+    ///
+    /// `_slot_id` is informational (see [`Self::skip`]).
+    pub fn arrive_and_wait(&self, _slot_id: u8) -> PhaserOutcome {
+        self.arrive_and_wait_timeout(Duration::from_secs(5))
+    }
+
+    /// Same as [`Self::arrive_and_wait`] with a caller-supplied timeout.
+    pub fn arrive_and_wait_timeout(&self, timeout: Duration) -> PhaserOutcome {
+        let pre_phase = self.phase.load(Ordering::Acquire);
+        self.contribute_advance();
+        let deadline = Instant::now() + timeout;
+        let mut guard = self.inner.lock().unwrap();
+        loop {
+            if guard.shutdown {
+                return PhaserOutcome::Shutdown;
+            }
+            if self.phase.load(Ordering::Acquire) != pre_phase {
+                return PhaserOutcome::Advanced;
+            }
+            let now = Instant::now();
+            if now >= deadline {
+                return PhaserOutcome::Timeout;
+            }
+            let remaining = deadline - now;
+            let result = self.cv.wait_timeout(guard, remaining).unwrap();
+            guard = result.0;
+            if result.1.timed_out() {
+                // Loop once more to disambiguate "real timeout" vs
+                // "spurious wakeup just before the deadline".
+                if self.phase.load(Ordering::Acquire) != pre_phase {
+                    return PhaserOutcome::Advanced;
+                }
+                if guard.shutdown {
+                    return PhaserOutcome::Shutdown;
+                }
+                return PhaserOutcome::Timeout;
+            }
+        }
+    }
+
+    /// Wake every parked arriver and signal shutdown. After this, all
+    /// future and outstanding `arrive_and_wait_*` calls return
+    /// [`PhaserOutcome::Shutdown`].
+    pub fn shutdown(&self) {
+        let mut guard = self.inner.lock().unwrap();
+        guard.shutdown = true;
+        self.cv.notify_all();
+    }
+
+    /// Common path for both arrive-and-wait and skip: bump the
+    /// participant counter, and if we were the last one in, advance the
+    /// phase + broadcast.
+    fn contribute_advance(&self) {
+        let mut guard = self.inner.lock().unwrap();
+        guard.arrived_or_skipped += 1;
+        if guard.arrived_or_skipped >= self.party_count {
+            // Last one in. Reset the counter, bump the phase, broadcast.
+            guard.arrived_or_skipped = 0;
+            // `Release` on the phase store pairs with `Acquire` reads in
+            // arriving slots' wait-loop predicates.
+            self.phase.fetch_add(1, Ordering::Release);
+            self.cv.notify_all();
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::sync::Arc;
+    use std::sync::atomic::AtomicU32;
+    use std::thread;
+
+    /// All N participants arrive — phase advances, every arriver returns
+    /// `Advanced`.
+    #[test]
+    fn n_arrivers_all_advance() {
+        const N: u32 = 6;
+        let p = Arc::new(Phaser::new(N));
+        let mut handles = Vec::new();
+        for i in 0..N {
+            let p = p.clone();
+            handles.push(
+                thread::Builder::new()
+                    .name(format!("phaser-test-{i}"))
+                    .spawn(move || p.arrive_and_wait(i as u8))
+                    .unwrap(),
+            );
+        }
+        for h in handles {
+            assert_eq!(h.join().unwrap(), PhaserOutcome::Advanced);
+        }
+        assert_eq!(p.current_phase(), 1);
+    }
+
+    /// 5 arrive + 1 skip → phase advances; arrivers see `Advanced`.
+    #[test]
+    fn skip_counts_toward_advance() {
+        const N: u32 = 6;
+        let p = Arc::new(Phaser::new(N));
+        let mut handles = Vec::new();
+        for i in 0..(N - 1) {
+            let p = p.clone();
+            handles.push(
+                thread::Builder::new()
+                    .name(format!("phaser-arrive-{i}"))
+                    .spawn(move || p.arrive_and_wait(i as u8))
+                    .unwrap(),
+            );
+        }
+        // Brief pause to let arrivers park first (exercising the
+        // skip-unblocks-arrivers path).
+        thread::sleep(Duration::from_millis(20));
+        p.skip((N - 1) as u8);
+        for h in handles {
+            assert_eq!(h.join().unwrap(), PhaserOutcome::Advanced);
+        }
+        assert_eq!(p.current_phase(), 1);
+    }
+
+    /// Shutdown wakes parked arrivers; they return `Shutdown`.
+    #[test]
+    fn shutdown_wakes_arrivers() {
+        const N: u32 = 6;
+        let p = Arc::new(Phaser::new(N));
+        let mut handles = Vec::new();
+        // Only N-1 arrive — phase will not advance.
+        for i in 0..(N - 1) {
+            let p = p.clone();
+            handles.push(
+                thread::Builder::new()
+                    .name(format!("phaser-arrive-shutdown-{i}"))
+                    .spawn(move || p.arrive_and_wait(i as u8))
+                    .unwrap(),
+            );
+        }
+        thread::sleep(Duration::from_millis(20));
+        p.shutdown();
+        for h in handles {
+            assert_eq!(h.join().unwrap(), PhaserOutcome::Shutdown);
+        }
+    }
+
+    /// Defensive timeout: if some peers never arrive, others surface
+    /// `Timeout` rather than blocking forever.
+    #[test]
+    fn timeout_fires_when_peer_hangs() {
+        const N: u32 = 4;
+        let p = Arc::new(Phaser::new(N));
+        // Only 2 of 4 arrive — others "hang".
+        let p1 = p.clone();
+        let h1 = thread::spawn(move || {
+            p1.arrive_and_wait_timeout(Duration::from_millis(50))
+        });
+        let p2 = p.clone();
+        let h2 = thread::spawn(move || {
+            p2.arrive_and_wait_timeout(Duration::from_millis(50))
+        });
+        assert_eq!(h1.join().unwrap(), PhaserOutcome::Timeout);
+        assert_eq!(h2.join().unwrap(), PhaserOutcome::Timeout);
+    }
+
+    /// Multi-phase stress: all participants run a tight loop of
+    /// arrive_and_wait calls; after K phases they all observe the same
+    /// `current_phase()` value. Catches generation/counter resync bugs.
+    #[test]
+    fn multi_phase_progress() {
+        const N: u32 = 6;
+        const K: u32 = 1000;
+        let p = Arc::new(Phaser::new(N));
+        let counter = Arc::new(AtomicU32::new(0));
+        let mut handles = Vec::new();
+        for i in 0..N {
+            let p = p.clone();
+            let c = counter.clone();
+            handles.push(
+                thread::Builder::new()
+                    .name(format!("phaser-multi-{i}"))
+                    .spawn(move || {
+                        for _ in 0..K {
+                            assert_eq!(
+                                p.arrive_and_wait(i as u8),
+                                PhaserOutcome::Advanced
+                            );
+                        }
+                        c.fetch_add(1, Ordering::Relaxed);
+                    })
+                    .unwrap(),
+            );
+        }
+        for h in handles {
+            h.join().unwrap();
+        }
+        assert_eq!(counter.load(Ordering::Relaxed), N);
+        assert_eq!(p.current_phase(), K);
+    }
+
+    /// Mixed skip/arrive across phases — emulates the realistic scheduler
+    /// pattern where slots become idle for some quanta.
+    #[test]
+    fn mixed_skip_and_arrive_random() {
+        const N: u32 = 6;
+        const K: u32 = 200;
+        let p = Arc::new(Phaser::new(N));
+        let mut handles = Vec::new();
+        for i in 0..N {
+            let p = p.clone();
+            handles.push(
+                thread::Builder::new()
+                    .name(format!("phaser-mixed-{i}"))
+                    .spawn(move || {
+                        // Pseudo-random skip pattern based on slot+phase
+                        let mut state: u32 = 0x9E37_79B9u32.wrapping_add(i);
+                        for phase in 0..K {
+                            state = state.wrapping_mul(0x6C8E_9CF7).wrapping_add(phase);
+                            if state & 0xF == 0 {
+                                p.skip(i as u8);
+                            } else {
+                                let _ = p.arrive_and_wait(i as u8);
+                            }
+                        }
+                    })
+                    .unwrap(),
+            );
+        }
+        for h in handles {
+            h.join().unwrap();
+        }
+        // After K rounds with all-N participation each phase, the phase
+        // counter equals K. Each iteration contributes exactly N to the
+        // counter (split between arrive and skip).
+        assert_eq!(p.current_phase(), K);
+    }
+}
diff --git a/crates/xenia-cpu/src/reservation.rs b/crates/xenia-cpu/src/reservation.rs
new file mode 100644
index 0000000..247b8a9
--- /dev/null
+++ b/crates/xenia-cpu/src/reservation.rs
@@ -0,0 +1,424 @@
+//! Inter-thread reservation table for `lwarx`/`stwcx.` and
+//! `ldarx`/`stdcx.`.
+//!
+//! On real Xenon, each core's `lwarx` places a reservation on a 128-byte
+//! cache line; any other CPU's store to the line invalidates the
+//! reservation. `stwcx.`'s success depends on the reservation still being
+//! valid. Under M3's per-HW-thread parallelism, we need an inter-thread
+//! mechanism for the same guarantee.
+//!
+//! M2 introduces the table behind a runtime `reservations_enabled` flag
+//! (default `false`). When the flag is `false`, the interpreter's
+//! existing per-`PpcContext` `reserved_line`/`has_reservation` fields are
+//! used as-is — no inter-thread tracking. M3 flips the flag on once the
+//! per-HW-thread host threads are spawning.
+//!
+//! ## Design
+//!
+//! - **Banked AtomicU64 array** of [`NUM_LINES`] entries (4096 × 8 B =
+//!   32 KiB total). Each entry packs `(line_address, generation,
+//!   hw_id)`. A zero value means "no reservation on this bank".
+//! - **Hash function**: `(line >> 7) & (NUM_LINES - 1)`. Different lines
+//!   that map to the same bank conservatively invalidate each other's
+//!   reservations — sound (real Xenon's L2 has finite associativity and
+//!   has the same property), at the cost of slightly more `stwcx.`
+//!   failures than a perfect-mapping table would produce.
+//! - **`active_reservers: AtomicU16`** — a fast-path counter
+//!   incremented by every `lwarx` and decremented when its reservation is
+//!   either committed or invalidated. `write_u32` checks this with a
+//!   single `Relaxed` load; when zero (the common case in code that
+//!   doesn't use atomics), the invalidation hook is a one-instruction
+//!   skip.
+//! - **Generation counter**: monotonic across all reservations,
+//!   incremented atomically. 24 bits of generation packed in the slot
+//!   means 16 M reuses per slot before wraparound; at multi-million
+//!   reservations/sec sustained that's still many seconds, and a
+//!   stale-gen `stwcx.` simply fails (sound, not livelocking).
+//!
+//! ## Invariants
+//!
+//! 1. A `stwcx.(addr)` succeeds only if the line slot still holds the
+//!    same `(line, gen, hw_id)` triple the reserver stamped at `lwarx`.
+//! 2. Any plain store to a reserved line invalidates it (slot CASed to
+//!    zero). Hash-collision side-effect: a store to a different line
+//!    that maps to the same bank also invalidates — guests that observe
+//!    a `stwcx.` failure simply retry, so this is correctness-preserving.
+//! 3. `stwcx.` from a different `hw_id` than the reserver fails even if
+//!    the line and gen would otherwise match — only the originating HW
+//!    thread can commit its own reservation.
+//!
+//! Memory ordering: all CAS / store operations on the line slot use
+//! `AcqRel`; readers use `Acquire`. The store inside `stwcx.`'s payload
+//! itself (the actual data write) is the caller's responsibility — see
+//! [`crate::interpreter`]'s `stwcx.` arm.
+
+use std::sync::atomic::{AtomicU16, AtomicU64, Ordering};
+
+/// Real Xenon L2 cache-line size — the granule a reservation covers.
+pub const LINE_BYTES: u32 = 0x80;
+/// Mask to align an address to a cache-line boundary.
+pub const LINE_MASK: u32 = !(LINE_BYTES - 1);
+/// Number of bank entries in the reservation table. Power of two so the
+/// hash is a single AND. 32 KiB total at 8 B per entry.
+pub const NUM_LINES: usize = 4096;
+const HASH_MASK: u32 = (NUM_LINES as u32) - 1;
+
+/// Pack `(line_addr, generation, hw_id)` into a single u64. The packed
+/// layout is:
+///   bits 63..32: line address (we only need the high bits since the
+///                low 7 are always zero — reserved range is line-aligned)
+///   bits 31..8:  24-bit generation
+///   bits 7..0:   8-bit `hw_id`
+///
+/// A packed value of `0` means "no reservation". Since we never reserve
+/// on guest virtual address `0` (the page is unmapped) and the
+/// generation increments from `1`, zero is a safe sentinel.
+#[inline]
+pub fn pack(line_addr: u32, generation: u32, hw_id: u8) -> u64 {
+    debug_assert!(line_addr & !LINE_MASK == 0, "line_addr must be line-aligned");
+    debug_assert!(generation < (1 << 24), "generation must fit in 24 bits");
+    ((line_addr as u64) << 32)
+        | ((generation as u64 & 0xFF_FFFF) << 8)
+        | (hw_id as u64)
+}
+
+/// Inverse of [`pack`]. Returns `None` if the value is the zero sentinel
+/// (no reservation).
+#[inline]
+pub fn unpack(raw: u64) -> Option<(u32, u32, u8)> {
+    if raw == 0 {
+        return None;
+    }
+    let line = (raw >> 32) as u32;
+    let generation = ((raw >> 8) & 0xFF_FFFF) as u32;
+    let hw_id = (raw & 0xFF) as u8;
+    Some((line, generation, hw_id))
+}
+
+#[inline]
+fn hash(line_addr: u32) -> usize {
+    ((line_addr >> 7) & HASH_MASK) as usize
+}
+
+#[inline]
+fn align_to_line(addr: u32) -> u32 {
+    addr & LINE_MASK
+}
+
+/// Banked reservation table shared across all emulated HW threads. Built
+/// once per emulation instance; lives behind an `Arc` so worker host
+/// threads (M3) can hold their own clones without lifetime gymnastics.
+pub struct ReservationTable {
+    lines: Vec<AtomicU64>,
+    active_reservers: AtomicU16,
+    next_gen: AtomicU64,
+    /// Runtime activation flag. Default `false`. M2.8's
+    /// `--reservations-table` flag (or M3 spawn) flips this to `true`,
+    /// at which point the interpreter's `lwarx`/`stwcx.` arms route
+    /// through the table; otherwise they use the legacy per-`PpcContext`
+    /// reservation fields.
+    enabled: std::sync::atomic::AtomicBool,
+}
+
+impl Default for ReservationTable {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl ReservationTable {
+    /// Construct a fresh table with all banks empty.
+    pub fn new() -> Self {
+        let mut lines = Vec::with_capacity(NUM_LINES);
+        for _ in 0..NUM_LINES {
+            lines.push(AtomicU64::new(0));
+        }
+        Self {
+            lines,
+            active_reservers: AtomicU16::new(0),
+            // Start at 1 so the very first reservation gets a non-zero
+            // gen and the packed slot value is non-zero (zero is the
+            // "no reservation" sentinel).
+            next_gen: AtomicU64::new(1),
+            enabled: std::sync::atomic::AtomicBool::new(false),
+        }
+    }
+
+    /// Activate the table. The interpreter's `lwarx`/`stwcx.` arms will
+    /// route through this table on subsequent dispatches. Idempotent.
+    pub fn enable(&self) {
+        self.enabled
+            .store(true, std::sync::atomic::Ordering::Release);
+    }
+
+    /// Deactivate the table. The interpreter falls back to per-`PpcContext`
+    /// reservation fields. Idempotent.
+    pub fn disable(&self) {
+        self.enabled
+            .store(false, std::sync::atomic::Ordering::Release);
+    }
+
+    /// Whether the table is currently active. The interpreter consults
+    /// this on every `lwarx`/`stwcx.` to decide which path runs.
+    pub fn is_enabled(&self) -> bool {
+        self.enabled.load(std::sync::atomic::Ordering::Acquire)
+    }
+
+    /// True when at least one reservation is currently outstanding.
+    /// Plain `write_u32` consults this to skip the invalidation hook
+    /// when no thread holds a reservation — the common case for
+    /// non-atomic code.
+    #[inline]
+    pub fn has_active_reservers(&self) -> bool {
+        self.active_reservers.load(Ordering::Relaxed) > 0
+    }
+
+    /// `lwarx(addr)` — claim a reservation on the line containing `addr`.
+    /// Returns the generation stamped into the slot; the interpreter
+    /// stores this alongside the per-`PpcContext` `has_reservation` bit
+    /// so a subsequent `stwcx.` can verify the same gen still holds.
+    ///
+    /// If a different reservation already occupied the bank, it's
+    /// silently overwritten — that thread's `stwcx.` will fail because
+    /// the slot no longer matches its stamped gen. Matches Xenon
+    /// behavior (a different core's lwarx on the same line displaces
+    /// any prior reservation).
+    pub fn reserve(&self, addr: u32, hw_id: u8) -> u32 {
+        let line = align_to_line(addr);
+        let generation = (self
+            .next_gen
+            .fetch_add(1, Ordering::Relaxed)
+            & 0xFF_FFFF) as u32;
+        let new_raw = pack(line, generation, hw_id);
+        // Release: prior reads of the reservation target should
+        // happen-before any thread that observes the new slot value.
+        let prev = self.lines[hash(line)].swap(new_raw, Ordering::AcqRel);
+        // If the previous slot was non-zero, the displaced reserver is
+        // implicitly invalidated — decrement the active counter for it.
+        // Else, increment for our new reservation. Net effect: the
+        // counter equals the number of *bank slots* with a non-zero
+        // value, which is an upper bound on actual reservers.
+        if prev == 0 {
+            self.active_reservers.fetch_add(1, Ordering::Relaxed);
+        }
+        generation
+    }
+
+    /// `stwcx.(addr)` — try to commit a reservation. Returns `true` if
+    /// the slot still holds `(line, my_gen, my_hw_id)` (in which case
+    /// it's CAS'd back to zero, releasing the bank), `false` otherwise.
+    /// The data store itself is the caller's responsibility — see
+    /// [`crate::interpreter`]'s `stwcx.` arm.
+    pub fn try_commit(&self, addr: u32, my_gen: u32, my_hw_id: u8) -> bool {
+        let line = align_to_line(addr);
+        let expected = pack(line, my_gen, my_hw_id);
+        match self.lines[hash(line)].compare_exchange(
+            expected,
+            0,
+            Ordering::AcqRel,
+            Ordering::Relaxed,
+        ) {
+            Ok(_) => {
+                // Successfully released the slot; decrement the active
+                // count.
+                self.active_reservers.fetch_sub(1, Ordering::Relaxed);
+                true
+            }
+            Err(_) => false,
+        }
+    }
+
+    /// Hook for plain (non-reserving) stores: invalidate any
+    /// reservation on the containing line. Cheap when the bank is
+    /// already empty (single Acquire load + branch).
+    pub fn invalidate_for_write(&self, addr: u32) {
+        let line = align_to_line(addr);
+        let bank = &self.lines[hash(line)];
+        let prev = bank.load(Ordering::Acquire);
+        if prev == 0 {
+            return;
+        }
+        // Verify the slot still holds a reservation on *this* line
+        // before clearing — hash collisions mean the bank may hold a
+        // reservation on an unrelated line that maps to the same slot.
+        // Real Xenon has the same property (limited L2 associativity);
+        // we mirror it here. A spurious bank match invalidates a
+        // different line's reservation; the affected `stwcx.` retries —
+        // sound, slightly less efficient.
+        if let Some((bank_line, _generation, _hw)) = unpack(prev) {
+            if bank_line != line {
+                // Different line in the same bank — leave it alone (we
+                // chose not to invalidate cross-line collisions to
+                // reduce false-fail noise; real-HW behavior is similar
+                // since L2 associativity sets cross-line constraints).
+                return;
+            }
+        }
+        // CAS-clear the bank if it still holds the value we observed.
+        // If a concurrent `stwcx.` or `reserve` raced with us, the CAS
+        // fails — that's fine; the line slot is now in a different
+        // state and the displaced reservation will be picked up there.
+        if bank
+            .compare_exchange(prev, 0, Ordering::AcqRel, Ordering::Relaxed)
+            .is_ok()
+        {
+            self.active_reservers.fetch_sub(1, Ordering::Relaxed);
+        }
+    }
+
+    /// Drop a per-`PpcContext` reservation without committing. Called
+    /// when the interpreter clears `has_reservation` due to a
+    /// non-`stwcx.` event (context switch, exception, etc.). Safe to
+    /// call when the table doesn't hold our reservation anymore (the
+    /// CAS simply fails).
+    pub fn release(&self, addr: u32, my_gen: u32, my_hw_id: u8) {
+        let _ = self.try_commit(addr, my_gen, my_hw_id);
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::sync::Arc;
+    use std::thread;
+
+    #[test]
+    fn pack_unpack_roundtrip() {
+        let raw = pack(0x1000_0000, 42, 5);
+        let (line, generation, hw) = unpack(raw).unwrap();
+        assert_eq!(line, 0x1000_0000);
+        assert_eq!(generation, 42);
+        assert_eq!(hw, 5);
+    }
+
+    #[test]
+    fn unpack_zero_is_none() {
+        assert!(unpack(0).is_none());
+    }
+
+    #[test]
+    fn reserve_then_commit_succeeds() {
+        let t = ReservationTable::new();
+        let gn = t.reserve(0x1234, 0);
+        assert!(t.try_commit(0x1234, gn, 0));
+        // Already released — second commit fails.
+        assert!(!t.try_commit(0x1234, gn, 0));
+    }
+
+    #[test]
+    fn other_hw_id_cannot_commit() {
+        let t = ReservationTable::new();
+        let gn = t.reserve(0x1234, 0);
+        assert!(
+            !t.try_commit(0x1234, gn, 1),
+            "stwcx. from a different hw_id must fail"
+        );
+        // Original owner can still commit.
+        assert!(t.try_commit(0x1234, gn, 0));
+    }
+
+    #[test]
+    fn lwarx_displaces_prior_reservation() {
+        let t = ReservationTable::new();
+        let g0 = t.reserve(0x1234, 0);
+        // Different HW thread's lwarx on the same line.
+        let g1 = t.reserve(0x1234, 1);
+        // Original reserver's stwcx. fails because the gen changed.
+        assert!(!t.try_commit(0x1234, g0, 0));
+        // New reserver's stwcx. succeeds.
+        assert!(t.try_commit(0x1234, g1, 1));
+    }
+
+    #[test]
+    fn invalidate_clears_matching_reservation() {
+        let t = ReservationTable::new();
+        let gn = t.reserve(0x1234, 0);
+        t.invalidate_for_write(0x1238); // same line as 0x1234
+        assert!(!t.try_commit(0x1234, gn, 0));
+        assert_eq!(t.active_reservers.load(Ordering::Relaxed), 0);
+    }
+
+    #[test]
+    fn invalidate_different_line_in_same_bank_is_noop() {
+        let t = ReservationTable::new();
+        // Force a hash collision: addr A and addr B with same hash but
+        // different line addresses.
+        let line_a = 0x0000_1000;
+        let line_b = line_a + ((NUM_LINES as u32) << 7); // +0x80000 → same hash
+        assert_eq!(hash(line_a), hash(line_b));
+        let gn = t.reserve(line_a, 0);
+        // Invalidating line_b must NOT clear line_a's reservation.
+        t.invalidate_for_write(line_b);
+        assert!(t.try_commit(line_a, gn, 0));
+    }
+
+    #[test]
+    fn has_active_reservers_tracks_count() {
+        let t = ReservationTable::new();
+        assert!(!t.has_active_reservers());
+        let g0 = t.reserve(0x1000, 0);
+        assert!(t.has_active_reservers());
+        let g1 = t.reserve(0x2000, 1);
+        assert!(t.has_active_reservers());
+        t.try_commit(0x1000, g0, 0);
+        assert!(t.has_active_reservers());
+        t.try_commit(0x2000, g1, 1);
+        assert!(!t.has_active_reservers());
+    }
+
+    /// Stress test: 8 host threads each loop reserve+stwcx on the same
+    /// line. Exactly one stwcx per round can win; the others fail and
+    /// retry. The total number of *successful* commits across N
+    /// outer iterations equals N (one winner per round).
+    ///
+    /// This proves the table's mutual-exclusion property: at most one
+    /// thread's stwcx. on a given line can succeed between two events
+    /// that would invalidate the line.
+    #[test]
+    fn concurrent_lwarx_stwcx_serializes() {
+        let t = Arc::new(ReservationTable::new());
+        const ROUNDS: u32 = 1000;
+        const THREADS: u8 = 8;
+        let total_successes = Arc::new(AtomicU64::new(0));
+
+        let mut handles = Vec::new();
+        for hw_id in 0..THREADS {
+            let t_clone = t.clone();
+            let s_clone = total_successes.clone();
+            handles.push(
+                thread::Builder::new()
+                    .name(format!("res-stress-{hw_id}"))
+                    .spawn(move || {
+                        let mut wins = 0u64;
+                        for _ in 0..ROUNDS {
+                            let gn = t_clone.reserve(0x1234_5678, hw_id);
+                            if t_clone.try_commit(0x1234_5678, gn, hw_id) {
+                                wins += 1;
+                            }
+                        }
+                        s_clone.fetch_add(wins, Ordering::Relaxed);
+                    })
+                    .expect("spawn"),
+            );
+        }
+        for h in handles {
+            h.join().expect("join");
+        }
+        let total = total_successes.load(Ordering::Relaxed);
+        // Lower bound: every round had at least one winner — but races
+        // can cause some rounds to have zero (all threads' reservations
+        // got displaced before any could commit). Assert progress: at
+        // least 10% of attempts succeed, and active_reservers is back
+        // to zero.
+        let attempts = ROUNDS as u64 * THREADS as u64;
+        assert!(
+            total > attempts / 10,
+            "expected at least 10% successful commits, got {total}/{attempts}"
+        );
+        assert_eq!(
+            t.active_reservers.load(Ordering::Relaxed),
+            0,
+            "all reservations should have been resolved"
+        );
+    }
+}
diff --git a/crates/xenia-cpu/src/scheduler.rs b/crates/xenia-cpu/src/scheduler.rs
new file mode 100644
index 0000000..1f1d68a
--- /dev/null
+++ b/crates/xenia-cpu/src/scheduler.rs
@@ -0,0 +1,1919 @@
+//! Round-robin scheduler over 6 HW threads with per-slot runqueues.
+//!
+//! Execution is serialized on a single host thread (the interpreter thread;
+//! `GuestMemory` is pinned and deliberately not thread-safe). The scheduler
+//! is a pure data container — kernel code parks, wakes, and mutates state
+//! through its public methods; it knows nothing about kernel objects.
+//!
+//! ## Model (post-Axis-1)
+//!
+//! - `HW_THREAD_COUNT = 6`, matching real Xenon hardware (3 cores × 2 SMT).
+//! - Each `HwSlot` carries a runqueue `Vec<GuestThread>` — any state,
+//!   `pick_runnable` filters Ready/ServicingIrq when choosing the live thread.
+//! - A `GuestThread` owns its own `PpcContext` inline. The live register
+//!   file is always whichever thread the slot has pinned as running — no
+//!   memcpy on context switch.
+//! - `ThreadRef { hw_id, idx }` is the stable identity used in waiter lists
+//!   and anywhere a specific thread needs to be addressed across slot
+//!   boundaries. Positional refs are cheap but **must** be fixed up after
+//!   `swap_remove` (Axis 4 affinity migration does this explicitly).
+//!
+//! Every scheduler round: for each slot with a runnable thread, pick the
+//! highest-priority Ready thread and advance it one guest instruction (or
+//! one import-thunk dispatch). Blocked/Exited threads stay resident in the
+//! runqueue so their `ThreadRef` doesn't shift under kernel waiter lists.
+
+use crate::context::PpcContext;
+
+/// Number of emulated HW threads. Real Xbox 360 Xenon = 3 cores × 2 SMT = 6.
+pub const HW_THREAD_COUNT: usize = 6;
+
+/// Guest thread id assigned to the initial (module-entry) guest thread.
+pub const INITIAL_GUEST_TID: u32 = 1;
+
+/// Default per-thread instruction quantum. Consumed by Axis 3 (`decrement_quantum`);
+/// Axis 1 carries the field on every thread but doesn't decrement yet.
+pub const QUANTUM_DEFAULT: u32 = 50_000;
+
+/// Above this depth, `spawn` prunes `Exited` entries from a slot's runqueue
+/// before pushing the new thread. Keeps peer `ThreadRef`s stable on the
+/// common (low-depth) path — a game that spawns a handful of long-lived
+/// workers never triggers a compaction; a game that rapidly churns threads
+/// gets one when the slot fills up.
+const PRUNE_DEPTH_THRESHOLD: usize = 4;
+
+/// Stable identity for a guest thread across all scheduler tables.
+///
+/// The positional `idx` is only valid while the source slot's runqueue
+/// has not been mutated by a `swap_remove`. All sites that do so (Axis 4
+/// affinity migration, `prune_exited`) must fix up every `ThreadRef` they
+/// invalidate.
+///
+/// **M2.3 generation packing.** Under M3's per-HW-thread parallelism, an
+/// `idx` reused after `swap_remove` could match a stale `ThreadRef` held
+/// in another thread's waiter list (the classic ABA hazard). The
+/// `generation` byte distinguishes such reuses. M2 introduces the field
+/// (set to `0` on fresh spawns) without yet bumping it — no concurrent
+/// remove paths exist before M3. The migration-fixup site at
+/// [`MigrationFixup::apply`] will bump generations once M3 lands.
+///
+/// Layout: 1 + 1 + 2 = 4 bytes (no padding). 256 reuses per slot before
+/// wraparound; with `PRUNE_DEPTH_THRESHOLD = 4` keeping slots shallow,
+/// that is well above any realistic churn rate.
+#[derive(Copy, Clone, Debug, Eq, PartialEq, Hash, Default)]
+pub struct ThreadRef {
+    pub hw_id: u8,
+    pub generation: u8,
+    pub idx: u16,
+}
+
+impl ThreadRef {
+    /// Construct a `ThreadRef` with `generation = 0`. Used by every
+    /// fresh-spawn / re-bind site that doesn't track generations
+    /// directly. Sites that DO track generations (the migration-fixup
+    /// path under M3) construct via struct literals so they're greppable.
+    pub const fn new(hw_id: u8, idx: u16) -> Self {
+        Self {
+            hw_id,
+            idx,
+            generation: 0,
+        }
+    }
+
+    /// Construct a `ThreadRef` with an explicit generation. Used by the
+    /// migration-fixup path at M3.
+    pub const fn with_generation(hw_id: u8, idx: u16, generation: u8) -> Self {
+        Self {
+            hw_id,
+            idx,
+            generation,
+        }
+    }
+}
+
+/// A guest thread and everything needed to schedule, park, and wake it.
+pub struct GuestThread {
+    pub ctx: PpcContext,
+    pub state: HwState,
+    pub tid: u32,
+    pub thread_handle: Option<u32>,
+    pub stack_base: u32,
+    pub stack_size: u32,
+    pub pcr_base: u32,
+    pub tls_base: u32,
+    /// Per-thread TLS slot values; `KeTlsGetValue/SetValue` route through
+    /// `Scheduler::tls_{get,set}` which index this on the currently-running thread.
+    pub tls_values: Vec<u64>,
+    /// Suspend counter — `NtSuspendThread` increments, `NtResumeThread`
+    /// decrements, only unblocks at zero.
+    pub suspend_count: u32,
+    /// NT-style priority, signed. Higher wins within a slot. Default 0.
+    pub priority: i32,
+    /// Set bit i = thread may run on slot i. 0 normalizes to 0xFF (any).
+    pub affinity_mask: u8,
+    /// Hint from `KeSetIdealProcessor`. Axis 5 honors on spawn; Axis 1
+    /// carries the field.
+    pub ideal_processor: Option<u8>,
+    /// Axis 3 instruction budget. Decremented per retired step on this
+    /// thread; on zero, slot rotates within same-priority tier.
+    pub quantum_remaining: u32,
+}
+
+impl GuestThread {
+    fn default_fields() -> Self {
+        Self {
+            ctx: PpcContext::new(),
+            state: HwState::Idle,
+            tid: 0,
+            thread_handle: None,
+            stack_base: 0,
+            stack_size: 0,
+            pcr_base: 0,
+            tls_base: 0,
+            tls_values: Vec::new(),
+            suspend_count: 0,
+            priority: 0,
+            affinity_mask: 0xFF,
+            ideal_processor: None,
+            quantum_remaining: QUANTUM_DEFAULT,
+        }
+    }
+}
+
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum HwState {
+    /// Slot slot has no running thread (used only for `HwSlot::idle_ctx`'s
+    /// conceptual state — live threads never sit in `Idle`).
+    Idle,
+    Ready,
+    Blocked(BlockReason),
+    Exited(u32),
+    /// Graphics-interrupt servicing state. The thread was
+    /// `Blocked(reason)` when an IRQ was injected; we flipped it to
+    /// `ServicingIrq(reason)` so the scheduler will run the callback,
+    /// carrying the prior block reason for the IRQ-return path to consult.
+    /// On return to `LR_HALT_SENTINEL` the main loop restores to
+    /// `Blocked(reason)` — **unless** something during the callback
+    /// (e.g. `KeSetEvent → wake`) flipped this to `Ready`, in which case
+    /// the wait was resolved and we leave it runnable.
+    ServicingIrq(BlockReason),
+}
+
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum BlockReason {
+    Suspended,
+    WaitAny {
+        handles: Vec<u32>,
+        deadline: Option<u64>,
+    },
+    WaitAll {
+        handles: Vec<u32>,
+        deadline: Option<u64>,
+    },
+    DelayUntil(u64),
+    CriticalSection(u32),
+}
+
+/// Sink for PCR+0x2C writes — the scheduler writes the guest-visible
+/// current-processor-id here at spawn and Axis 4 rewrites on affinity
+/// migration. Implemented by `xenia-kernel` for `GuestMemory`; keeping it
+/// an abstract trait avoids pulling `xenia_memory` into `xenia_cpu`.
+pub trait PcrWriter {
+    fn write_pcr_id(&mut self, pcr_base: u32, hw_id: u8);
+}
+
+/// Per-slot runqueue + the index of the thread currently pinned-running.
+pub struct HwSlot {
+    pub runqueue: Vec<GuestThread>,
+    pub running_idx: Option<usize>,
+    /// Sentinel context returned by compat accessors when the slot has no
+    /// running thread. Keeps the `ctx(hw_id)` API safe from diagnostic
+    /// paths that run between scheduling passes.
+    idle_ctx: PpcContext,
+    /// Same-shape sentinel state.
+    idle_state: HwState,
+}
+
+impl Default for HwSlot {
+    fn default() -> Self {
+        Self {
+            runqueue: Vec::new(),
+            running_idx: None,
+            idle_ctx: PpcContext::new(),
+            idle_state: HwState::Idle,
+        }
+    }
+}
+
+impl HwSlot {
+    /// Index of the highest-priority Ready/ServicingIrq thread in this
+    /// slot's runqueue. Tiebreak: prefer lower index (deterministic).
+    pub fn pick_runnable(&self) -> Option<usize> {
+        self.runqueue
+            .iter()
+            .enumerate()
+            .filter(|(_, t)| matches!(t.state, HwState::Ready | HwState::ServicingIrq(_)))
+            .max_by_key(|(i, t)| (t.priority, -(*i as i64)))
+            .map(|(i, _)| i)
+    }
+
+    /// How many non-Exited threads currently live on this slot (used by
+    /// placement policies).
+    pub fn live_depth(&self) -> usize {
+        self.runqueue
+            .iter()
+            .filter(|t| !matches!(t.state, HwState::Exited(_)))
+            .count()
+    }
+
+}
+
+#[derive(Debug, Clone, Copy)]
+pub enum OrderMode {
+    Fixed,
+    Seeded { seed: u64 },
+}
+
+impl OrderMode {
+    pub fn from_env() -> Self {
+        match std::env::var("XENIA_SCHED_ORDER").ok().as_deref() {
+            Some("random") | Some("Random") | Some("RANDOM") => {
+                let seed = std::env::var("XENIA_SCHED_SEED")
+                    .ok()
+                    .and_then(|s| s.parse::<u64>().ok())
+                    .unwrap_or(0xC0FFEE_C0FFEE);
+                OrderMode::Seeded { seed }
+            }
+            _ => OrderMode::Fixed,
+        }
+    }
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum RoundOutcome {
+    Progressed,
+    Slept,
+    Deadlock,
+    MainExited,
+}
+
+/// Parameters for `Scheduler::spawn`. The caller allocates the stack/PCR/
+/// TLS blocks in guest memory first, then hands these addresses here.
+#[derive(Debug)]
+pub struct SpawnParams {
+    pub entry: u32,
+    pub start_context: u32,
+    pub stack_base: u32,
+    pub stack_size: u32,
+    pub pcr_base: u32,
+    pub tls_base: u32,
+    pub thread_handle: u32,
+    pub guest_tid: u32,
+    pub create_suspended: bool,
+    pub is_initial: bool,
+    pub tls_slot_count: u32,
+    /// Set bit i = thread may land on slot i. 0 normalizes to 0xFF.
+    pub affinity_mask: u8,
+    /// NT-style signed priority; default 0.
+    pub priority: i32,
+    /// Preferred slot; Axis 5 spawn honors if allowed by affinity mask.
+    pub ideal_processor: Option<u8>,
+}
+
+impl Default for SpawnParams {
+    fn default() -> Self {
+        Self {
+            entry: 0,
+            start_context: 0,
+            stack_base: 0,
+            stack_size: 0,
+            pcr_base: 0,
+            tls_base: 0,
+            thread_handle: 0,
+            guest_tid: 0,
+            create_suspended: false,
+            is_initial: false,
+            tls_slot_count: 0,
+            affinity_mask: 0xFF,
+            priority: 0,
+            ideal_processor: None,
+        }
+    }
+}
+
+#[derive(Debug)]
+pub enum SpawnError {
+    NoFreeHwThread,
+}
+
+/// Side information returned by `set_affinity_ref` so the kernel layer
+/// can walk its waiter lists and retarget any `ThreadRef`s invalidated
+/// by the `swap_remove` on the source slot.
+#[derive(Debug, Copy, Clone)]
+pub struct MigrationFixup {
+    pub source_hw: u8,
+    pub promoted_old_idx: u16,
+    pub promoted_new_idx: u16,
+    pub migrated_old_ref: ThreadRef,
+    pub migrated_new_ref: ThreadRef,
+}
+
+impl MigrationFixup {
+    /// Apply the fixup to a single `ThreadRef` reference. Idempotent.
+    pub fn apply(&self, r: &mut ThreadRef) {
+        if *r == self.migrated_old_ref {
+            *r = self.migrated_new_ref;
+        } else if r.hw_id == self.source_hw && r.idx == self.promoted_old_idx {
+            r.idx = self.promoted_new_idx;
+        }
+    }
+}
+
+pub struct Scheduler {
+    pub slots: [HwSlot; HW_THREAD_COUNT],
+    pub round_count: u64,
+    /// Currently-stepping thread. Set by `begin_slot_visit`, cleared by
+    /// `end_slot_visit`. Kernel exports reach through this to learn which
+    /// thread they're running on.
+    pub current: Option<ThreadRef>,
+    order: OrderMode,
+    rng_state: u64,
+    /// Sorted by deadline ascending. Scheduler wakes the first entry via
+    /// `advance_to_next_wake` when a round finds nothing runnable.
+    timed_waits: Vec<(u64, ThreadRef)>,
+    /// Global count of TLS slots allocated — `spawn` pre-sizes new threads'
+    /// `tls_values` to this.
+    tls_slot_count: usize,
+    /// Axis 2: bit i set ⇒ slot i has at least one Ready/ServicingIrq
+    /// thread. `round_schedule` uses this to skip empty slots cheaply;
+    /// maintained by state-mutating methods via `recompute_slot_runnable`.
+    non_empty_runnable: u8,
+    /// Axis 2: rolling round-robin cursor. Each `round_schedule` call
+    /// emits slot ids starting at `(rotation_cursor + i) % 6`, then
+    /// advances the cursor by one so the next round begins from the
+    /// following slot. Guarantees every non-empty slot gets an equal
+    /// share of round leads over time.
+    rotation_cursor: u8,
+    /// M3.7 — optional reservation table installed by the kernel after
+    /// scheduler construction. When present, [`Self::spawn`] and
+    /// [`Self::install_initial_thread`] populate each `PpcContext`'s
+    /// `reservation_table` field so the interpreter's `lwarx`/`stwcx.`
+    /// arms can route through the table.
+    reservation_table: Option<std::sync::Arc<crate::ReservationTable>>,
+}
+
+impl Scheduler {
+    /// Build a scheduler with all slots empty. Callers (usually
+    /// `KernelState::install_initial_thread`) push the initial guest
+    /// thread onto slot 0 before stepping.
+    pub fn new() -> Self {
+        let order = OrderMode::from_env();
+        let rng_state = match order {
+            OrderMode::Fixed => 0,
+            OrderMode::Seeded { seed } => seed.max(1),
+        };
+        Scheduler {
+            slots: std::array::from_fn(|_| HwSlot::default()),
+            round_count: 0,
+            current: None,
+            order,
+            rng_state,
+            timed_waits: Vec::new(),
+            tls_slot_count: 0,
+            non_empty_runnable: 0,
+            rotation_cursor: 0,
+            reservation_table: None,
+        }
+    }
+
+    /// M3.7 — install a shared reservation table. Subsequent
+    /// `spawn`/`install_initial_thread` calls will populate each
+    /// `PpcContext::reservation_table` with a clone. Idempotent;
+    /// passing `None` clears the binding (existing threads keep their
+    /// previously-cloned Arcs).
+    pub fn set_reservation_table(
+        &mut self,
+        table: Option<std::sync::Arc<crate::ReservationTable>>,
+    ) {
+        self.reservation_table = table;
+    }
+
+    /// Recompute the runnable bit for one slot. Cheap — scans the slot's
+    /// runqueue once. Call at the tail of any method that may change
+    /// whether the slot has a Ready/ServicingIrq member.
+    fn recompute_slot_runnable(&mut self, hw_id: u8) {
+        let any = self.slots[hw_id as usize]
+            .runqueue
+            .iter()
+            .any(|t| matches!(t.state, HwState::Ready | HwState::ServicingIrq(_)));
+        if any {
+            self.non_empty_runnable |= 1 << hw_id;
+        } else {
+            self.non_empty_runnable &= !(1 << hw_id);
+        }
+    }
+
+    // ----- Compat accessors (preserve the pre-Axis-1 hw_threads[i].ctx pattern) -----
+
+    /// Read-only context of the currently-running thread on `hw_id`.
+    pub fn ctx(&self, hw_id: u8) -> &PpcContext {
+        let slot = &self.slots[hw_id as usize];
+        match slot.running_idx {
+            Some(i) if i < slot.runqueue.len() => &slot.runqueue[i].ctx,
+            _ => &slot.idle_ctx,
+        }
+    }
+
+    /// Mutable context of the currently-running thread on `hw_id`.
+    pub fn ctx_mut(&mut self, hw_id: u8) -> &mut PpcContext {
+        let slot = &mut self.slots[hw_id as usize];
+        match slot.running_idx {
+            Some(i) if i < slot.runqueue.len() => &mut slot.runqueue[i].ctx,
+            _ => &mut slot.idle_ctx,
+        }
+    }
+
+    /// Mutable context addressed by `ThreadRef` — bypasses `running_idx`
+    /// so callers (deadlock-recovery, `call_export` return, Axis 4
+    /// migration) can touch a specific thread even when it isn't the one
+    /// the slot has pinned.
+    pub fn ctx_mut_ref(&mut self, r: ThreadRef) -> &mut PpcContext {
+        &mut self.slots[r.hw_id as usize].runqueue[r.idx as usize].ctx
+    }
+
+    pub fn state(&self, hw_id: u8) -> &HwState {
+        let slot = &self.slots[hw_id as usize];
+        match slot.running_idx {
+            Some(i) if i < slot.runqueue.len() => &slot.runqueue[i].state,
+            _ => &slot.idle_state,
+        }
+    }
+
+    pub fn state_mut(&mut self, hw_id: u8) -> &mut HwState {
+        let slot = &mut self.slots[hw_id as usize];
+        match slot.running_idx {
+            Some(i) if i < slot.runqueue.len() => &mut slot.runqueue[i].state,
+            _ => &mut slot.idle_state,
+        }
+    }
+
+    pub fn tid(&self, hw_id: u8) -> Option<u32> {
+        let slot = &self.slots[hw_id as usize];
+        slot.running_idx.and_then(|i| slot.runqueue.get(i).map(|t| t.tid))
+    }
+
+    pub fn thread_handle(&self, hw_id: u8) -> Option<u32> {
+        let slot = &self.slots[hw_id as usize];
+        slot.running_idx
+            .and_then(|i| slot.runqueue.get(i).and_then(|t| t.thread_handle))
+    }
+
+    pub fn tls_values(&self, hw_id: u8) -> Option<&Vec<u64>> {
+        let slot = &self.slots[hw_id as usize];
+        slot.running_idx.and_then(|i| slot.runqueue.get(i).map(|t| &t.tls_values))
+    }
+
+    pub fn suspend_count_mut(&mut self, hw_id: u8) -> Option<&mut u32> {
+        let slot = &mut self.slots[hw_id as usize];
+        match slot.running_idx {
+            Some(i) if i < slot.runqueue.len() => Some(&mut slot.runqueue[i].suspend_count),
+            _ => None,
+        }
+    }
+
+    /// Compat: most pre-Axis-1 code reaches for `current_hw_id` as an
+    /// `Option<u8>`. We keep it as a method that derives from `current`.
+    #[inline]
+    pub fn current_hw_id(&self) -> Option<u8> {
+        self.current.map(|r| r.hw_id)
+    }
+
+    /// Panics if called outside a step.
+    #[inline]
+    pub fn current(&self) -> u8 {
+        self.current.expect("no current thread").hw_id
+    }
+
+    /// Panics if called outside a step.
+    #[inline]
+    pub fn current_ref(&self) -> ThreadRef {
+        self.current.expect("no current thread")
+    }
+
+    // ----- Guest-thread lookup -----
+
+    /// Find the `ThreadRef` of the (non-Exited) thread with `tid`.
+    pub fn find_by_tid(&self, tid: u32) -> Option<ThreadRef> {
+        for (hw_id, slot) in self.slots.iter().enumerate() {
+            for (idx, t) in slot.runqueue.iter().enumerate() {
+                if t.tid == tid && !matches!(t.state, HwState::Exited(_)) {
+                    return Some(ThreadRef::new(hw_id as u8, idx as u16));
+                }
+            }
+        }
+        None
+    }
+
+    /// Find the `ThreadRef` of the (non-Exited) thread with `thread_handle`.
+    pub fn find_by_handle(&self, handle: u32) -> Option<ThreadRef> {
+        for (hw_id, slot) in self.slots.iter().enumerate() {
+            for (idx, t) in slot.runqueue.iter().enumerate() {
+                if t.thread_handle == Some(handle)
+                    && !matches!(t.state, HwState::Exited(_))
+                {
+                    return Some(ThreadRef::new(hw_id as u8, idx as u16));
+                }
+            }
+        }
+        None
+    }
+
+    /// Thread pointer addressed by ThreadRef. Panics if the ref is out of
+    /// bounds — only call with refs sourced from a live scheduler lookup
+    /// (`find_by_*`, `current`).
+    pub fn thread(&self, r: ThreadRef) -> &GuestThread {
+        &self.slots[r.hw_id as usize].runqueue[r.idx as usize]
+    }
+
+    pub fn thread_mut(&mut self, r: ThreadRef) -> &mut GuestThread {
+        &mut self.slots[r.hw_id as usize].runqueue[r.idx as usize]
+    }
+
+    /// Bounds-checked variant for code paths that accept potentially-stale
+    /// refs from external storage (waiter lists that may survive a slot
+    /// compaction, test fixtures). Returns None on out-of-bounds.
+    pub fn try_thread_mut(&mut self, r: ThreadRef) -> Option<&mut GuestThread> {
+        self.slots
+            .get_mut(r.hw_id as usize)
+            .and_then(|slot| slot.runqueue.get_mut(r.idx as usize))
+    }
+
+    // ----- Spawn -----
+
+    /// Install a new guest thread on an affinity-permitted slot with the
+    /// lowest live depth. Writes `PCR+0x2C = hw_id` via `mem`. Returns
+    /// the assigned `hw_id`.
+    ///
+    /// Initial threads land on slot 0 (hardware convention).
+    pub fn spawn<W: PcrWriter>(
+        &mut self,
+        params: SpawnParams,
+        mem: &mut W,
+    ) -> Result<u8, SpawnError> {
+        let mask = if params.affinity_mask == 0 {
+            0xFF
+        } else {
+            params.affinity_mask
+        };
+
+        // Axis 5: placement order — initial always slot 0; explicit
+        // ideal (if the mask allows it) wins; otherwise least-depth
+        // among mask-allowed slots.
+        let slot_id: u8 = if params.is_initial {
+            0
+        } else if let Some(ideal) = params.ideal_processor
+            && (mask & (1u8 << ideal)) != 0
+        {
+            ideal
+        } else {
+            self.pick_least_depth_slot(mask)
+                .ok_or(SpawnError::NoFreeHwThread)?
+        };
+
+        // Compact Exited entries if this slot is approaching saturation.
+        // Only safe to do when no ThreadRef outside of the scheduler is
+        // currently held to exited entries on this slot — kernel waiter
+        // lists drop refs when wake fires, and Exited threads are never
+        // picked for stepping, so compaction is a no-op for live peers.
+        self.prune_exited_if_needed(slot_id);
+
+        let mut t = GuestThread::default_fields();
+        t.ctx.pc = params.entry;
+        let sp_top = (params.stack_base as u64).saturating_add(params.stack_size as u64);
+        t.ctx.gpr[1] = sp_top.saturating_sub(0x100) & !0xFu64;
+        t.ctx.gpr[2] = 0x2000_0000;
+        t.ctx.gpr[3] = params.start_context as u64;
+        t.ctx.gpr[13] = params.pcr_base as u64;
+        t.ctx.msr = 0x9030;
+        t.ctx.thread_id = params.guest_tid;
+        t.tid = params.guest_tid;
+        t.thread_handle = Some(params.thread_handle);
+        t.state = if params.create_suspended {
+            HwState::Blocked(BlockReason::Suspended)
+        } else {
+            HwState::Ready
+        };
+        t.stack_base = params.stack_base;
+        t.stack_size = params.stack_size;
+        t.pcr_base = params.pcr_base;
+        t.tls_base = params.tls_base;
+        let tls_count = params.tls_slot_count as usize;
+        let tls_count = tls_count.max(self.tls_slot_count);
+        t.tls_values = vec![0; tls_count];
+        t.suspend_count = if params.create_suspended { 1 } else { 0 };
+        t.priority = params.priority;
+        t.affinity_mask = mask;
+        t.ideal_processor = params.ideal_processor;
+        // M3.7 — populate the inter-thread reservation handle + slot id
+        // so the interpreter can route lwarx/stwcx through the table.
+        t.ctx.hw_id = slot_id;
+        t.ctx.reservation_table = self.reservation_table.clone();
+
+        self.slots[slot_id as usize].runqueue.push(t);
+        mem.write_pcr_id(params.pcr_base, slot_id);
+        self.recompute_slot_runnable(slot_id);
+
+        tracing::info!(
+            "spawn: tid={} on hw={} entry={:#010x} start_ctx={:#010x} suspended={} pri={} mask={:#04x}",
+            params.guest_tid,
+            slot_id,
+            params.entry,
+            params.start_context,
+            params.create_suspended,
+            params.priority,
+            mask,
+        );
+        Ok(slot_id)
+    }
+
+    /// Install the initial (module-entry) guest thread on slot 0 with an
+    /// externally-prepared register file. Unlike `spawn`, this does not
+    /// reset ctx — the app has already set up MSR, r1/r13/etc. for the
+    /// XEX bootstrap.
+    pub fn install_initial_thread<W: PcrWriter>(
+        &mut self,
+        ctx: PpcContext,
+        stack_base: u32,
+        stack_size: u32,
+        pcr_base: u32,
+        tls_base: u32,
+        thread_handle: u32,
+        mem: &mut W,
+    ) {
+        let mut t = GuestThread::default_fields();
+        t.ctx = ctx;
+        // M3.7 — initial thread on slot 0; same wiring as `spawn`.
+        t.ctx.hw_id = 0;
+        t.ctx.reservation_table = self.reservation_table.clone();
+        t.state = HwState::Ready;
+        t.tid = INITIAL_GUEST_TID;
+        t.thread_handle = Some(thread_handle);
+        t.stack_base = stack_base;
+        t.stack_size = stack_size;
+        t.pcr_base = pcr_base;
+        t.tls_base = tls_base;
+        t.tls_values = vec![0; self.tls_slot_count];
+        self.slots[0].runqueue.push(t);
+        mem.write_pcr_id(pcr_base, 0);
+        self.recompute_slot_runnable(0);
+    }
+
+    /// Pick the slot with the smallest `live_depth` whose bit is set in
+    /// `mask`. Returns `None` only when `mask == 0` (malformed).
+    pub fn pick_least_depth_slot(&self, mask: u8) -> Option<u8> {
+        if mask == 0 {
+            return None;
+        }
+        (0..HW_THREAD_COUNT as u8)
+            .filter(|i| mask & (1 << i) != 0)
+            .min_by_key(|i| self.slots[*i as usize].live_depth())
+    }
+
+    /// Remove `Exited` entries from `slot_id`'s runqueue, but only when the
+    /// runqueue is deep enough that compaction is worthwhile. Because
+    /// `swap_remove` shifts indices, this is the only legal way to drop
+    /// entries — and it can invalidate outstanding `ThreadRef`s to the
+    /// affected slot. Callers are responsible for ensuring no live waiter
+    /// lists hold refs into exited entries (they don't, because waiter
+    /// wakeup always removes the ref and sets state to Ready before the
+    /// thread can exit again).
+    fn prune_exited_if_needed(&mut self, slot_id: u8) {
+        let slot = &mut self.slots[slot_id as usize];
+        if slot.runqueue.len() < PRUNE_DEPTH_THRESHOLD {
+            return;
+        }
+        slot.runqueue
+            .retain(|t| !matches!(t.state, HwState::Exited(_)));
+        // running_idx may now be stale. Since we only prune at spawn time
+        // (not mid-round), and round boundaries re-pick running_idx via
+        // begin_slot_visit, clearing is safe.
+        slot.running_idx = None;
+        self.recompute_slot_runnable(slot_id);
+    }
+
+    // ----- Round scheduling -----
+
+    /// Axis 2: emit slot ids with at least one runnable thread, starting
+    /// from `rotation_cursor` and cycling forward. `non_empty_runnable` is
+    /// the fast path — zero bits mean no slot has work and the caller
+    /// falls through to `advance_to_next_wake`.
+    pub fn round_schedule(&mut self) -> Vec<u8> {
+        if self.non_empty_runnable == 0 {
+            return Vec::new();
+        }
+        let start = self.rotation_cursor as usize;
+        let mut out: Vec<u8> = Vec::with_capacity(HW_THREAD_COUNT);
+        for off in 0..HW_THREAD_COUNT {
+            let i = (start + off) % HW_THREAD_COUNT;
+            if self.non_empty_runnable & (1 << i) != 0 {
+                out.push(i as u8);
+            }
+        }
+        // Seeded mode layers a deterministic shuffle on top of the
+        // already-filtered list. Same spawn/wake sequence + same seed ⇒
+        // same schedule (invariant preserved from pre-Axis-1).
+        if let OrderMode::Seeded { .. } = self.order {
+            for i in (1..out.len()).rev() {
+                self.rng_state ^= self.rng_state << 13;
+                self.rng_state ^= self.rng_state >> 7;
+                self.rng_state ^= self.rng_state << 17;
+                let j = (self.rng_state as usize) % (i + 1);
+                out.swap(i, j);
+            }
+        }
+        self.rotation_cursor = ((start + 1) % HW_THREAD_COUNT) as u8;
+        out
+    }
+
+    pub fn begin_round(&mut self) {
+        self.round_count += 1;
+    }
+
+    /// Called by the step loop at the top of each per-slot visit. Picks the
+    /// highest-priority Ready thread on the slot, sets `running_idx`, and
+    /// stashes `self.current` so exports can reach it.
+    pub fn begin_slot_visit(&mut self, hw_id: u8) {
+        let slot = &mut self.slots[hw_id as usize];
+        slot.running_idx = slot.pick_runnable();
+        self.current = slot
+            .running_idx
+            .map(|idx| ThreadRef::new(hw_id, idx as u16));
+    }
+
+    /// Clear `current` at the end of each per-slot visit.
+    pub fn end_slot_visit(&mut self) {
+        self.current = None;
+    }
+
+    /// Axis 3: decrement the currently-running thread's instruction
+    /// quantum. On reach-zero, reload to `QUANTUM_DEFAULT` and rotate
+    /// `running_idx` to the next Ready thread on this slot that sits in
+    /// the same priority tier (hand-off preserves priority ordering).
+    /// The flip is observed by the *next* round's `begin_slot_visit` —
+    /// the step that just completed has already returned, so there's no
+    /// mid-instruction preemption hazard.
+    ///
+    /// Returns `true` if a rotation occurred (purely informational;
+    /// callers don't need to act on it).
+    pub fn decrement_quantum(&mut self) -> bool {
+        let Some(r) = self.current else { return false; };
+        let slot = &mut self.slots[r.hw_id as usize];
+        let Some(t) = slot.runqueue.get_mut(r.idx as usize) else {
+            return false;
+        };
+        if t.quantum_remaining > 0 {
+            t.quantum_remaining -= 1;
+        }
+        if t.quantum_remaining != 0 {
+            return false;
+        }
+        let my_pri = t.priority;
+        t.quantum_remaining = QUANTUM_DEFAULT;
+        // Scan the rest of the runqueue for a same-priority Ready peer.
+        // Priority-higher peers are already going to win the next
+        // `pick_runnable` on this slot, so we only need to find an *equal*
+        // priority peer to enforce fair rotation within the tier.
+        let len = slot.runqueue.len();
+        if len < 2 {
+            return false;
+        }
+        let start = (r.idx as usize + 1) % len;
+        for off in 0..len {
+            let i = (start + off) % len;
+            if i == r.idx as usize {
+                continue;
+            }
+            let cand = &slot.runqueue[i];
+            if cand.priority == my_pri && matches!(cand.state, HwState::Ready) {
+                slot.running_idx = Some(i);
+                self.current = Some(ThreadRef::new(r.hw_id, i as u16));
+                return true;
+            }
+        }
+        false
+    }
+
+    // ----- Park / wake / exit -----
+
+    pub fn park_current(&mut self, reason: BlockReason) {
+        let r = self
+            .current
+            .expect("park_current called outside a step");
+        let deadline = match &reason {
+            BlockReason::WaitAny { deadline, .. } | BlockReason::WaitAll { deadline, .. } => {
+                *deadline
+            }
+            BlockReason::DelayUntil(d) => Some(*d),
+            _ => None,
+        };
+        if let Some(d) = deadline {
+            self.timed_waits.push((d, r));
+            self.timed_waits.sort_by_key(|&(d, _)| d);
+        }
+        self.thread_mut(r).state = HwState::Blocked(reason);
+        self.recompute_slot_runnable(r.hw_id);
+    }
+
+    /// Wake a specific thread (must be Blocked or ServicingIrq). Silently
+    /// no-ops on out-of-bounds refs — waiter lists are positional and may
+    /// outlive their target after a slot compaction; in debug builds we
+    /// warn so regressions of this class surface during development.
+    pub fn wake_ref(&mut self, r: ThreadRef) {
+        let Some(slot) = self.slots.get_mut(r.hw_id as usize) else {
+            debug_assert!(false, "wake_ref: hw_id out of bounds: {:?}", r);
+            return;
+        };
+        let Some(t) = slot.runqueue.get_mut(r.idx as usize) else {
+            // Stale waiter ref — expected under normal operation when a
+            // waiter was enqueued from a test fixture or survived a slot
+            // compaction. Warn in debug builds.
+            #[cfg(debug_assertions)]
+            tracing::debug!("wake_ref: idx out of bounds: {:?}", r);
+            return;
+        };
+        match &t.state {
+            HwState::Blocked(_) | HwState::ServicingIrq(_) => {}
+            _ => return,
+        }
+        t.state = HwState::Ready;
+        t.quantum_remaining = QUANTUM_DEFAULT;
+        self.timed_waits.retain(|&(_, tr)| tr != r);
+        self.recompute_slot_runnable(r.hw_id);
+    }
+
+    /// Axis-4-friendly variant: look up the thread holding `handle` and wake it.
+    pub fn wake_by_handle(&mut self, handle: u32) -> Option<ThreadRef> {
+        let r = self.find_by_handle(handle)?;
+        self.wake_ref(r);
+        Some(r)
+    }
+
+    /// Decrement suspend count on target; if it reaches 0, unblock.
+    /// Returns previous count.
+    pub fn resume_ref(&mut self, r: ThreadRef) -> u32 {
+        let t = &mut self.slots[r.hw_id as usize].runqueue[r.idx as usize];
+        let prev = t.suspend_count;
+        if t.suspend_count > 0 {
+            t.suspend_count -= 1;
+        }
+        if t.suspend_count == 0 && matches!(t.state, HwState::Blocked(BlockReason::Suspended)) {
+            t.state = HwState::Ready;
+            t.quantum_remaining = QUANTUM_DEFAULT;
+        }
+        self.recompute_slot_runnable(r.hw_id);
+        prev
+    }
+
+    pub fn suspend_ref(&mut self, r: ThreadRef) -> u32 {
+        let t = &mut self.slots[r.hw_id as usize].runqueue[r.idx as usize];
+        let prev = t.suspend_count;
+        t.suspend_count += 1;
+        if matches!(t.state, HwState::Ready) {
+            t.state = HwState::Blocked(BlockReason::Suspended);
+        }
+        self.recompute_slot_runnable(r.hw_id);
+        prev
+    }
+
+    /// Set base priority; returns prior value.
+    pub fn set_priority_ref(&mut self, r: ThreadRef, priority: i32) -> i32 {
+        let t = self.thread_mut(r);
+        let prev = t.priority;
+        t.priority = priority;
+        prev
+    }
+
+    pub fn priority_ref(&self, r: ThreadRef) -> i32 {
+        self.thread(r).priority
+    }
+
+    /// Axis 5: `KeSetIdealProcessor` — store the hint (does NOT migrate
+    /// a live thread; purely advisory for subsequent wake decisions,
+    /// which our cooperative scheduler doesn't currently consult — and
+    /// as spawn placement for any newly-created sibling threads).
+    /// Returns previous ideal (or 0xFF if unset).
+    pub fn set_ideal_ref(&mut self, r: ThreadRef, ideal: u8) -> u8 {
+        let t = self.thread_mut(r);
+        let prev = t.ideal_processor.unwrap_or(0xFF);
+        t.ideal_processor = Some(ideal);
+        prev
+    }
+
+    pub fn ideal_ref(&self, r: ThreadRef) -> Option<u8> {
+        self.thread(r).ideal_processor
+    }
+
+    /// Axis 4: Set the affinity mask on `r` and migrate between slot
+    /// runqueues if the current slot is no longer allowed by the mask.
+    /// Returns `(old_mask, new_ref, migration_info)`. `migration_info` is
+    /// `None` when no migration happened, `Some((src_promoted_old_idx,
+    /// src_promoted_new_idx))` when `swap_remove` moved a peer into the
+    /// migrated thread's slot — the caller must walk external waiter
+    /// containers and retarget any ref matching the promoted-old slot.
+    ///
+    /// `mask == 0` normalizes to `0xFF` (Canary parity: early Xbox code
+    /// sometimes passes 0 meaning "any").
+    pub fn set_affinity_ref<W: PcrWriter>(
+        &mut self,
+        r: ThreadRef,
+        new_mask: u8,
+        mem: &mut W,
+    ) -> (u8, ThreadRef, Option<MigrationFixup>) {
+        let old_mask = self.thread(r).affinity_mask;
+        let effective = if new_mask == 0 { 0xFF } else { new_mask };
+        self.thread_mut(r).affinity_mask = new_mask;
+        // Current slot still allowed → no migration.
+        if effective & (1 << r.hw_id) != 0 {
+            return (old_mask, r, None);
+        }
+        // Pick target = least-depth allowed slot.
+        let target = self
+            .pick_least_depth_slot(effective)
+            .expect("set_affinity_ref: effective mask must allow some slot");
+        // Physically move the GuestThread struct.
+        let src_len_before = self.slots[r.hw_id as usize].runqueue.len();
+        let promoted_old_idx = (src_len_before - 1) as u16;
+        let mut thread = self.slots[r.hw_id as usize]
+            .runqueue
+            .swap_remove(r.idx as usize);
+        mem.write_pcr_id(thread.pcr_base, target);
+        // M3.7 — keep ctx.hw_id in sync with the thread's new slot so
+        // table-routed lwarx/stwcx use the correct discriminator.
+        thread.ctx.hw_id = target;
+        self.slots[target as usize].runqueue.push(thread);
+        let new_idx = (self.slots[target as usize].runqueue.len() - 1) as u16;
+        let new_ref = ThreadRef::new(target, new_idx);
+        // Timed waits: rewrite r → new_ref if present.
+        for entry in self.timed_waits.iter_mut() {
+            if entry.1 == r {
+                entry.1 = new_ref;
+            } else if entry.1 == ThreadRef::new(r.hw_id, promoted_old_idx) {
+                entry.1 = ThreadRef::new(r.hw_id, r.idx);
+            }
+        }
+        // Running index defense: if src slot's running_idx pointed at the
+        // migrated entry or the promoted peer, clear / retarget.
+        let src_slot = &mut self.slots[r.hw_id as usize];
+        if src_slot.running_idx == Some(r.idx as usize) {
+            src_slot.running_idx = None;
+        } else if src_slot.running_idx == Some(promoted_old_idx as usize) {
+            src_slot.running_idx = Some(r.idx as usize);
+        }
+        self.recompute_slot_runnable(r.hw_id);
+        self.recompute_slot_runnable(target);
+        // If the migrating thread was the currently-running one (self-
+        // migrating export call), update `self.current` so `call_export`'s
+        // stashed ThreadRef still resolves on its swap-back path.
+        if self.current == Some(r) {
+            self.current = Some(new_ref);
+        } else if self.current == Some(ThreadRef::new(r.hw_id, promoted_old_idx))
+        {
+            self.current = Some(ThreadRef::new(r.hw_id, r.idx));
+        }
+        // Emit promotion info only if the last-index of the source
+        // wasn't the migrating thread itself (otherwise swap_remove was
+        // a plain pop and no peer got promoted).
+        let fixup = if promoted_old_idx != r.idx {
+            Some(MigrationFixup {
+                source_hw: r.hw_id,
+                promoted_old_idx,
+                promoted_new_idx: r.idx,
+                migrated_old_ref: r,
+                migrated_new_ref: new_ref,
+            })
+        } else {
+            Some(MigrationFixup {
+                source_hw: r.hw_id,
+                promoted_old_idx: r.idx, // no-op promotion
+                promoted_new_idx: r.idx,
+                migrated_old_ref: r,
+                migrated_new_ref: new_ref,
+            })
+        };
+        (old_mask, new_ref, fixup)
+    }
+
+    /// Mark the current thread exited. Returns (hw_id, tid, handle) of
+    /// the exiting thread so the caller can wake joiners.
+    pub fn exit_current(&mut self, exit_code: u32) -> (u8, Option<u32>, Option<u32>) {
+        let r = self.current.expect("exit_current outside step");
+        let t = &mut self.slots[r.hw_id as usize].runqueue[r.idx as usize];
+        let tid = Some(t.tid);
+        let handle = t.thread_handle;
+        t.state = HwState::Exited(exit_code);
+        self.timed_waits.retain(|&(_, tr)| tr != r);
+        self.recompute_slot_runnable(r.hw_id);
+        (r.hw_id, tid, handle)
+    }
+
+    // ----- TLS -----
+
+    /// Allocate a new global TLS slot index. All live threads' `tls_values`
+    /// vecs grow to match.
+    pub fn tls_alloc(&mut self) -> u32 {
+        let idx = self.tls_slot_count as u32;
+        self.tls_slot_count += 1;
+        for slot in self.slots.iter_mut() {
+            for t in slot.runqueue.iter_mut() {
+                if t.tls_values.len() < self.tls_slot_count {
+                    t.tls_values.resize(self.tls_slot_count, 0);
+                }
+            }
+        }
+        idx
+    }
+
+    /// Compat: caller asks for a specific capacity (e.g. spawn's
+    /// `tls_slot_count`). Grows every thread's tls_values up to `count`.
+    pub fn tls_grow_to(&mut self, count: usize) {
+        if count > self.tls_slot_count {
+            self.tls_slot_count = count;
+        }
+        for slot in self.slots.iter_mut() {
+            for t in slot.runqueue.iter_mut() {
+                if t.tls_values.len() < count {
+                    t.tls_values.resize(count, 0);
+                }
+            }
+        }
+    }
+
+    pub fn tls_get(&self, slot_idx: u32) -> u64 {
+        let r = match self.current {
+            Some(r) => r,
+            None => return 0,
+        };
+        self.slots[r.hw_id as usize].runqueue[r.idx as usize]
+            .tls_values
+            .get(slot_idx as usize)
+            .copied()
+            .unwrap_or(0)
+    }
+
+    pub fn tls_set(&mut self, slot_idx: u32, value: u64) {
+        let Some(r) = self.current else { return; };
+        let t = &mut self.slots[r.hw_id as usize].runqueue[r.idx as usize];
+        let i = slot_idx as usize;
+        if t.tls_values.len() <= i {
+            t.tls_values.resize(i + 1, 0);
+        }
+        t.tls_values[i] = value;
+    }
+
+    // ----- Time advance / deadlock -----
+
+    /// Peek the earliest pending timed-wait deadline without popping. The
+    /// kernel uses this together with `KernelState::earliest_timer_deadline`
+    /// to compute the next global time step in the scheduler round.
+    pub fn earliest_wait_deadline(&self) -> Option<u64> {
+        self.timed_waits.first().map(|&(d, _)| d)
+    }
+
+    /// Move every thread's timebase up to `deadline` (if already past,
+    /// leave it alone). Extracted from the old `advance_to_next_wake`
+    /// body so the kernel can drive time-advance for timer fires in
+    /// addition to thread wakes.
+    pub fn advance_all_timebases_to(&mut self, deadline: u64) {
+        for slot in self.slots.iter_mut() {
+            for t in slot.runqueue.iter_mut() {
+                if t.ctx.timebase < deadline {
+                    t.ctx.timebase = deadline;
+                }
+            }
+        }
+    }
+
+    /// Fast-forward the timebase to the earliest pending timed wait and
+    /// wake that sleeper. Used when a round had no Ready threads and no
+    /// timer fires closer than the earliest wait. Returns the woken
+    /// thread's `ThreadRef` + the `BlockReason` it was parked with, so
+    /// the caller can stamp `STATUS_TIMEOUT` and scrub stale waiter-list
+    /// entries via `KernelState::handle_timeout_wake`. `None` means the
+    /// timed-waits queue was empty.
+    pub fn advance_to_next_wake(&mut self) -> Option<(ThreadRef, BlockReason)> {
+        let (deadline, r) = *self.timed_waits.first()?;
+        self.advance_all_timebases_to(deadline);
+        self.timed_waits.remove(0);
+        let t = &mut self.slots[r.hw_id as usize].runqueue[r.idx as usize];
+        let reason = match std::mem::replace(&mut t.state, HwState::Ready) {
+            HwState::Blocked(reason) | HwState::ServicingIrq(reason) => reason,
+            other => {
+                // Defensive: the timed_waits entry should only ever track a
+                // Blocked or ServicingIrq thread, but if some path already
+                // woke this ref we keep going with a stand-in reason so the
+                // caller can't miss a timeout-wake follow-up.
+                tracing::debug!(
+                    hw_id = r.hw_id,
+                    idx = r.idx,
+                    state = ?other,
+                    "advance_to_next_wake: unexpected prior state (ignored)"
+                );
+                BlockReason::Suspended
+            }
+        };
+        t.quantum_remaining = QUANTUM_DEFAULT;
+        self.recompute_slot_runnable(r.hw_id);
+        tracing::info!(
+            "scheduler: advanced to deadline {} waking hw={} idx={}",
+            deadline,
+            r.hw_id,
+            r.idx
+        );
+        Some((r, reason))
+    }
+
+    /// Pop the earliest timed wait only if its deadline is `<= target`.
+    /// Used by the kernel-driven scheduler loop to consume a just-ripe
+    /// thread wake after timers fired to that same `target`. If the
+    /// earliest entry has a later deadline (some other event drove
+    /// advance), returns `None` and leaves the entry in place.
+    pub fn advance_to_next_wake_if_due(
+        &mut self,
+        target: u64,
+    ) -> Option<(ThreadRef, BlockReason)> {
+        let (d, _) = *self.timed_waits.first()?;
+        if d > target {
+            return None;
+        }
+        self.advance_to_next_wake()
+    }
+
+    /// Does any thread across any slot exist in a state other than
+    /// Exited/Idle?
+    pub fn has_live_thread(&self) -> bool {
+        self.slots.iter().any(|slot| {
+            slot.runqueue.iter().any(|t| {
+                matches!(
+                    t.state,
+                    HwState::Ready | HwState::Blocked(_) | HwState::ServicingIrq(_)
+                )
+            })
+        })
+    }
+
+    /// Snapshot thread states for diagnostic logging. One entry per live
+    /// guest thread (Exited are included so post-mortem can see exit codes).
+    pub fn diagnostic_snapshot(&self) -> Vec<(ThreadRef, Option<u32>, HwState)> {
+        let mut out = Vec::new();
+        for (hw_id, slot) in self.slots.iter().enumerate() {
+            for (idx, t) in slot.runqueue.iter().enumerate() {
+                out.push((
+                    ThreadRef::new(hw_id as u8, idx as u16),
+                    Some(t.tid),
+                    t.state.clone(),
+                ));
+            }
+        }
+        out
+    }
+
+    /// Force-wake every Blocked waiter (WaitAny/WaitAll/CriticalSection)
+    /// with STATUS_TIMEOUT. Caller writes the status code into
+    /// `ctx_mut_ref(r).gpr[3]`. Returns the refs that were woken.
+    pub fn unblock_on_deadlock(&mut self) -> Vec<ThreadRef> {
+        let mut woken = Vec::new();
+        for (hw_id, slot) in self.slots.iter_mut().enumerate() {
+            for (idx, t) in slot.runqueue.iter_mut().enumerate() {
+                if matches!(
+                    t.state,
+                    HwState::Blocked(BlockReason::WaitAny { .. })
+                        | HwState::Blocked(BlockReason::WaitAll { .. })
+                        | HwState::Blocked(BlockReason::CriticalSection(_))
+                ) {
+                    t.state = HwState::Ready;
+                    t.quantum_remaining = QUANTUM_DEFAULT;
+                    woken.push(ThreadRef::new(hw_id as u8, idx as u16));
+                }
+            }
+        }
+        self.timed_waits.clear();
+        for i in 0..HW_THREAD_COUNT as u8 {
+            self.recompute_slot_runnable(i);
+        }
+        woken
+    }
+}
+
+impl Default for Scheduler {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+// ====== Tests ======
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// No-op PcrWriter for unit tests that don't exercise the guest memory write.
+    #[derive(Default)]
+    struct NullPcr;
+    impl PcrWriter for NullPcr {
+        fn write_pcr_id(&mut self, _pcr_base: u32, _hw_id: u8) {}
+    }
+
+    /// PcrWriter that records every write for assertion.
+    #[derive(Default)]
+    struct RecordingPcr {
+        writes: Vec<(u32, u8)>,
+    }
+    impl PcrWriter for RecordingPcr {
+        fn write_pcr_id(&mut self, pcr_base: u32, hw_id: u8) {
+            self.writes.push((pcr_base, hw_id));
+        }
+    }
+
+    fn mk_scheduler_with_initial() -> Scheduler {
+        let mut s = Scheduler::new();
+        let mut ctx = PpcContext::new();
+        ctx.pc = 0x8200_0000;
+        ctx.gpr[1] = 0x7000_0000;
+        s.install_initial_thread(
+            ctx,
+            0x7000_0000,
+            0x10_0000,
+            0x7FFF_0000,
+            0x7FFE_0000,
+            0x1000,
+            &mut NullPcr,
+        );
+        s
+    }
+
+    fn worker_spawn_params(tid: u32, handle: u32) -> SpawnParams {
+        SpawnParams {
+            entry: 0x8200_1000,
+            start_context: 0xDEAD_BEEF,
+            stack_base: 0x7100_0000 + tid * 0x10_0000,
+            stack_size: 0x10_0000,
+            pcr_base: 0x7FEF_0000 + tid * 0x2000,
+            tls_base: 0x7FEE_0000 + tid * 0x2000,
+            thread_handle: handle,
+            guest_tid: tid,
+            create_suspended: false,
+            is_initial: false,
+            tls_slot_count: 0,
+            affinity_mask: 0xFF,
+            priority: 0,
+            ideal_processor: None,
+        }
+    }
+
+    // ---- preserved from pre-Axis-1 (updated names and params) ----
+
+    #[test]
+    fn spawn_lands_on_least_depth_slot() {
+        // With only slot 0 occupied, the next spawn must go to slot 1
+        // (least depth among 1..5, all zero; 0 < 1).
+        let mut s = mk_scheduler_with_initial();
+        let slot = s
+            .spawn(worker_spawn_params(2, 0x2000), &mut NullPcr)
+            .unwrap();
+        assert_eq!(slot, 1);
+        let thread = &s.slots[1].runqueue[0];
+        assert_eq!(thread.state, HwState::Ready);
+        assert_eq!(thread.ctx.pc, 0x8200_1000);
+        assert_eq!(thread.ctx.gpr[3], 0xDEAD_BEEF);
+    }
+
+    #[test]
+    fn suspended_spawn_stays_blocked_until_resume() {
+        let mut s = mk_scheduler_with_initial();
+        let mut params = worker_spawn_params(2, 0x2000);
+        params.create_suspended = true;
+        let slot = s.spawn(params, &mut NullPcr).unwrap();
+        let r = ThreadRef::new(slot, 0);
+        assert_eq!(
+            s.thread(r).state,
+            HwState::Blocked(BlockReason::Suspended)
+        );
+        assert_eq!(s.thread(r).suspend_count, 1);
+        let prev = s.resume_ref(r);
+        assert_eq!(prev, 1);
+        assert_eq!(s.thread(r).state, HwState::Ready);
+    }
+
+    #[test]
+    fn round_schedule_skips_blocked() {
+        let mut s = mk_scheduler_with_initial();
+        let mut params = worker_spawn_params(2, 0x2000);
+        params.create_suspended = true;
+        s.spawn(params, &mut NullPcr).unwrap();
+        // Initial thread (slot 0) is Ready. Spawned thread (slot 1) is
+        // Suspended. round_schedule should only list slot 0.
+        let order = s.round_schedule();
+        assert_eq!(order, vec![0]);
+    }
+
+    #[test]
+    fn seeded_order_is_deterministic() {
+        let order = OrderMode::Seeded { seed: 42 };
+        let mut s1 = mk_scheduler_with_initial();
+        let mut s2 = mk_scheduler_with_initial();
+        s1.order = order;
+        s1.rng_state = 42;
+        s2.order = order;
+        s2.rng_state = 42;
+        for i in 0..5 {
+            let tid = 2 + i as u32;
+            let _ = s1.spawn(worker_spawn_params(tid, 0x2000 + i * 4), &mut NullPcr);
+            let _ = s2.spawn(worker_spawn_params(tid, 0x2000 + i * 4), &mut NullPcr);
+        }
+        let a = s1.round_schedule();
+        let b = s2.round_schedule();
+        assert_eq!(a, b);
+    }
+
+    #[test]
+    fn tls_is_per_thread() {
+        let mut s = mk_scheduler_with_initial();
+        s.spawn(worker_spawn_params(2, 0x2000), &mut NullPcr).unwrap();
+        s.tls_grow_to(4);
+        // Simulate running on slot 0 (initial thread)
+        s.begin_slot_visit(0);
+        s.tls_set(0, 0xAAAA);
+        s.end_slot_visit();
+        // Simulate running on slot 1 (worker)
+        s.begin_slot_visit(1);
+        s.tls_set(0, 0xBBBB);
+        s.end_slot_visit();
+        s.begin_slot_visit(0);
+        assert_eq!(s.tls_get(0), 0xAAAA);
+        s.end_slot_visit();
+        s.begin_slot_visit(1);
+        assert_eq!(s.tls_get(0), 0xBBBB);
+    }
+
+    // ---- new Axis-1 tests ----
+
+    #[test]
+    fn test_two_threads_same_slot_higher_priority_runs_first() {
+        let mut s = mk_scheduler_with_initial();
+        // Force both workers onto slot 0 via affinity.
+        let mut a = worker_spawn_params(2, 0x2000);
+        a.affinity_mask = 0b0000_0001;
+        a.priority = 0;
+        let mut b = worker_spawn_params(3, 0x3000);
+        b.affinity_mask = 0b0000_0001;
+        b.priority = 5;
+        s.spawn(a, &mut NullPcr).unwrap();
+        s.spawn(b, &mut NullPcr).unwrap();
+        // Slot 0 now holds: [main(pri 0), worker2(pri 0), worker3(pri 5)]
+        s.begin_slot_visit(0);
+        let r = s.current.expect("current set");
+        let t = s.thread(r);
+        assert_eq!(t.tid, 3, "worker3 (pri 5) wins the pick");
+        assert_eq!(t.priority, 5);
+        s.end_slot_visit();
+    }
+
+    #[test]
+    fn test_slot_depth_accounting_least_depth_placement() {
+        // Initial thread sits on slot 0 (depth 1, others 0). Spawning 6
+        // more threads with affinity 0xFF should fill slots 1..5 each to
+        // depth 1, then the 7th lands on whichever slot has depth 1
+        // (ties broken by lower index — min_by_key preserves the first
+        // minimum).
+        let mut s = mk_scheduler_with_initial();
+        let mut placements = Vec::new();
+        for i in 0..6 {
+            let tid = 2 + i as u32;
+            let slot = s
+                .spawn(worker_spawn_params(tid, 0x2000 + i * 4), &mut NullPcr)
+                .unwrap();
+            placements.push(slot);
+        }
+        // After 7 total threads (1 initial + 6 workers), one of slots 1..5
+        // carries 2. Since min_by_key picks the *first* minimum at each
+        // step and slot 0 starts at depth 1 (initial), placements should
+        // go: [1, 2, 3, 4, 5, 1] (slot 0 starts at depth 1, others at 0,
+        // so slot 1 wins first with depth 0; once slot 1 has one, slots
+        // 2..5 still have 0, so slot 2 next; etc. On the 6th worker all
+        // slots 1..5 have depth 1, same as slot 0 — min_by_key returns
+        // slot 0? No: we skip the "current depth" comparison... actually
+        // our filter includes slot 0 too since mask=0xFF. Slot 0 has
+        // depth 1, slots 1..5 each have depth 1 after the first 5
+        // workers. The 6th worker sees slots 0..5 all with depth 1 ⇒
+        // min_by_key returns slot 0 (lowest index). So placements =
+        // [1, 2, 3, 4, 5, 0].
+        assert_eq!(placements, vec![1, 2, 3, 4, 5, 0]);
+    }
+
+    #[test]
+    fn test_exited_threads_dont_block_spawn() {
+        let mut s = mk_scheduler_with_initial();
+        // Fill slot 1 to the prune threshold with exited threads.
+        for i in 0..PRUNE_DEPTH_THRESHOLD {
+            let tid = 10 + i as u32;
+            let mut p = worker_spawn_params(tid, 0x4000 + i as u32 * 4);
+            p.affinity_mask = 0b0000_0010; // only slot 1
+            s.spawn(p, &mut NullPcr).unwrap();
+        }
+        assert_eq!(s.slots[1].runqueue.len(), PRUNE_DEPTH_THRESHOLD);
+        // Mark them all Exited.
+        for t in s.slots[1].runqueue.iter_mut() {
+            t.state = HwState::Exited(0);
+        }
+        // Now spawn a fresh thread with affinity = slot 1 only. Should
+        // land successfully (prune kicks in at PRUNE_DEPTH_THRESHOLD).
+        let mut p = worker_spawn_params(99, 0x9000);
+        p.affinity_mask = 0b0000_0010;
+        let slot = s.spawn(p, &mut NullPcr).unwrap();
+        assert_eq!(slot, 1);
+        // Post-prune + push: all-Exited entries gone, fresh thread at idx 0.
+        assert_eq!(s.slots[1].runqueue.len(), 1);
+        assert_eq!(s.slots[1].runqueue[0].tid, 99);
+    }
+
+    #[test]
+    fn test_threadref_survives_spawn() {
+        // Peer spawned into the same slot must not shift an existing
+        // ThreadRef (vec push appends, doesn't reorder).
+        let mut s = mk_scheduler_with_initial();
+        let mut a = worker_spawn_params(2, 0x2000);
+        a.affinity_mask = 0b0000_0010; // slot 1
+        s.spawn(a, &mut NullPcr).unwrap();
+        let r_original = ThreadRef { hw_id: 1, idx: 0, generation: 0 };
+        assert_eq!(s.thread(r_original).tid, 2);
+
+        let mut b = worker_spawn_params(3, 0x3000);
+        b.affinity_mask = 0b0000_0010;
+        s.spawn(b, &mut NullPcr).unwrap();
+        // Original ref still resolves to tid 2.
+        assert_eq!(s.thread(r_original).tid, 2);
+        assert_eq!(s.slots[1].runqueue[1].tid, 3);
+    }
+
+    #[test]
+    fn test_priority_default_zero() {
+        let mut s = mk_scheduler_with_initial();
+        let slot = s
+            .spawn(worker_spawn_params(2, 0x2000), &mut NullPcr)
+            .unwrap();
+        let r = ThreadRef::new(slot, 0);
+        assert_eq!(s.priority_ref(r), 0);
+        let prev = s.set_priority_ref(r, 5);
+        assert_eq!(prev, 0);
+        assert_eq!(s.priority_ref(r), 5);
+    }
+
+    #[test]
+    fn test_spawn_records_pcr_write() {
+        let mut s = mk_scheduler_with_initial();
+        let mut rec = RecordingPcr::default();
+        // install_initial wrote (pcr_base=0x7FFF_0000, hw=0)
+        // spawn will write (pcr_base=0x7FEF_0000 + delta, hw=1)
+        let p = worker_spawn_params(2, 0x2000);
+        let pcr_base = p.pcr_base;
+        let slot = s.spawn(p, &mut rec).unwrap();
+        assert_eq!(rec.writes, vec![(pcr_base, slot)]);
+    }
+
+    #[test]
+    fn test_find_by_tid_returns_threadref() {
+        let mut s = mk_scheduler_with_initial();
+        s.spawn(worker_spawn_params(2, 0x2000), &mut NullPcr).unwrap();
+        let r = s.find_by_tid(2).expect("spawned tid 2");
+        assert_eq!(r, ThreadRef { hw_id: 1, idx: 0, generation: 0 });
+        assert!(s.find_by_tid(99).is_none());
+    }
+
+    #[test]
+    fn test_find_by_handle_returns_threadref() {
+        let mut s = mk_scheduler_with_initial();
+        s.spawn(worker_spawn_params(2, 0x2000), &mut NullPcr).unwrap();
+        let r = s.find_by_handle(0x2000).expect("handle 0x2000");
+        assert_eq!(r, ThreadRef { hw_id: 1, idx: 0, generation: 0 });
+    }
+
+    #[test]
+    fn test_exit_current_marks_state_without_removal() {
+        // Exit must NOT Vec::remove — that would invalidate peer
+        // ThreadRefs. State flip + stable positions is the invariant.
+        let mut s = mk_scheduler_with_initial();
+        s.spawn(worker_spawn_params(2, 0x2000), &mut NullPcr).unwrap();
+        s.begin_slot_visit(0);
+        let r = s.current.expect("current set");
+        let (hw_id, tid, _handle) = s.exit_current(0xABCD);
+        s.end_slot_visit();
+        assert_eq!(hw_id, 0);
+        assert_eq!(tid, Some(INITIAL_GUEST_TID));
+        // Thread still at slot 0 idx 0, now Exited.
+        assert_eq!(s.slots[0].runqueue.len(), 1);
+        assert_eq!(s.slots[0].runqueue[0].state, HwState::Exited(0xABCD));
+        // worker on slot 1 idx 0 is unaffected.
+        assert_eq!(s.slots[1].runqueue[0].tid, 2);
+        let _ = r;
+    }
+
+    // ---- Axis 2: rotation + bitset tests ----
+
+    fn mk_empty_scheduler() -> Scheduler {
+        // For rotation tests we want NO initial thread on slot 0 —
+        // every runnable bit comes from explicit spawns below.
+        Scheduler::new()
+    }
+
+    #[test]
+    fn test_rotation_cursor_advances_per_round() {
+        let mut s = mk_empty_scheduler();
+        // Populate all 6 slots with one Ready thread each.
+        let mut next_tid = 1u32;
+        for hw in 0..6u8 {
+            let mut p = SpawnParams::default();
+            p.guest_tid = next_tid;
+            p.thread_handle = 0x1000 + (next_tid * 4);
+            p.affinity_mask = 1 << hw;
+            p.pcr_base = 0x40000000 + (hw as u32) * 0x1000;
+            s.spawn(p, &mut NullPcr).unwrap();
+            next_tid += 1;
+        }
+        assert_eq!(s.non_empty_runnable, 0b11_1111);
+        let r1 = s.round_schedule();
+        assert_eq!(r1, vec![0, 1, 2, 3, 4, 5]);
+        let r2 = s.round_schedule();
+        assert_eq!(r2, vec![1, 2, 3, 4, 5, 0]);
+        let r3 = s.round_schedule();
+        assert_eq!(r3, vec![2, 3, 4, 5, 0, 1]);
+    }
+
+    #[test]
+    fn test_rotation_skips_empty_slots() {
+        let mut s = mk_empty_scheduler();
+        // Slots [Ready, Ready, empty, Ready, Ready, empty] ⇒ bitset 0b011011.
+        for hw in [0u8, 1, 3, 4] {
+            let mut p = SpawnParams::default();
+            p.guest_tid = (hw + 1) as u32;
+            p.thread_handle = 0x1000 + (hw as u32) * 4;
+            p.affinity_mask = 1 << hw;
+            p.pcr_base = 0x40000000 + (hw as u32) * 0x1000;
+            s.spawn(p, &mut NullPcr).unwrap();
+        }
+        assert_eq!(s.non_empty_runnable, 0b01_1011);
+        let r = s.round_schedule();
+        assert_eq!(r, vec![0, 1, 3, 4], "emits only slots with bits set");
+        let r = s.round_schedule();
+        assert_eq!(r, vec![1, 3, 4, 0], "rotation cursor advances past empties");
+    }
+
+    #[test]
+    fn test_park_toggles_bit_and_wake_restores() {
+        let mut s = mk_empty_scheduler();
+        let mut p = SpawnParams::default();
+        p.guest_tid = 2;
+        p.thread_handle = 0x2000;
+        p.affinity_mask = 0b0010;
+        p.pcr_base = 0x4000_1000;
+        s.spawn(p, &mut NullPcr).unwrap();
+        assert_eq!(s.non_empty_runnable, 0b0010);
+        // Park the thread: bit 1 should clear.
+        s.begin_slot_visit(1);
+        s.park_current(BlockReason::DelayUntil(1_000_000));
+        s.end_slot_visit();
+        assert_eq!(s.non_empty_runnable, 0, "park clears slot 1's runnable bit");
+        // Wake it: bit 1 restores.
+        let r = ThreadRef { hw_id: 1, idx: 0, generation: 0 };
+        s.wake_ref(r);
+        assert_eq!(s.non_empty_runnable, 0b0010);
+    }
+
+    #[test]
+    fn test_round_schedule_empty_fastpath() {
+        let mut s = mk_empty_scheduler();
+        // No spawns ⇒ bitset is 0 ⇒ fast return without allocating.
+        assert_eq!(s.non_empty_runnable, 0);
+        let r = s.round_schedule();
+        assert!(r.is_empty());
+        // Cursor must not advance on empty rounds (nothing happened).
+        assert_eq!(s.rotation_cursor, 0);
+    }
+
+    #[test]
+    fn test_rotation_fairness_three_slots_two_threads_each() {
+        let mut s = mk_empty_scheduler();
+        // Slots 0, 2, 4 each hold two Ready threads; 1/3/5 empty.
+        let mut next_tid = 1u32;
+        for hw in [0u8, 2, 4] {
+            for _slot_peer in 0..2 {
+                let mut p = SpawnParams::default();
+                p.guest_tid = next_tid;
+                p.thread_handle = 0x1000 + (next_tid * 4);
+                p.affinity_mask = 1 << hw;
+                p.pcr_base = 0x40000000 + (next_tid * 0x1000);
+                s.spawn(p, &mut NullPcr).unwrap();
+                next_tid += 1;
+            }
+        }
+        assert_eq!(s.non_empty_runnable, 0b01_0101);
+        let r = s.round_schedule();
+        // Three entries per round (one per non-empty slot).
+        assert_eq!(r.len(), 3);
+        assert!(r.contains(&0) && r.contains(&2) && r.contains(&4));
+    }
+
+    // ---- Axis 5: ideal processor + initial placement tests ----
+
+    #[test]
+    fn test_spawn_with_ideal_processor_lands_on_ideal_slot() {
+        let mut s = mk_empty_scheduler();
+        let mut p = SpawnParams::default();
+        p.guest_tid = 1;
+        p.thread_handle = 0x1000;
+        p.affinity_mask = 0xFF;
+        p.ideal_processor = Some(3);
+        p.pcr_base = 0x4000_0000;
+        let slot = s.spawn(p, &mut NullPcr).unwrap();
+        assert_eq!(slot, 3, "ideal=3 + mask=0xFF lands on slot 3");
+    }
+
+    #[test]
+    fn test_spawn_with_ideal_outside_mask_falls_back_to_least_depth() {
+        let mut s = mk_empty_scheduler();
+        let mut p = SpawnParams::default();
+        p.guest_tid = 1;
+        p.thread_handle = 0x1000;
+        p.affinity_mask = 0b0000_0011; // only slots 0, 1
+        p.ideal_processor = Some(5); // outside mask
+        p.pcr_base = 0x4000_0000;
+        let slot = s.spawn(p, &mut NullPcr).unwrap();
+        assert!(slot == 0 || slot == 1, "falls back to mask-allowed least-depth");
+    }
+
+    #[test]
+    fn test_spawn_without_ideal_uses_least_depth() {
+        let mut s = mk_empty_scheduler();
+        // Pre-fill slots 0..3 with one thread each via explicit affinity.
+        let mut next_tid = 1u32;
+        for hw in 0..4u8 {
+            let mut p = SpawnParams::default();
+            p.guest_tid = next_tid;
+            p.thread_handle = 0x1000 + next_tid * 4;
+            p.affinity_mask = 1 << hw;
+            p.pcr_base = 0x4000_0000 + next_tid * 0x1000;
+            s.spawn(p, &mut NullPcr).unwrap();
+            next_tid += 1;
+        }
+        // Slots 0..3 have depth 1; 4, 5 have depth 0.
+        let mut p = SpawnParams::default();
+        p.guest_tid = next_tid;
+        p.thread_handle = 0x1000 + next_tid * 4;
+        p.affinity_mask = 0xFF;
+        p.ideal_processor = None;
+        p.pcr_base = 0x4000_0000 + next_tid * 0x1000;
+        let slot = s.spawn(p, &mut NullPcr).unwrap();
+        assert!(slot == 4 || slot == 5, "least-depth wins; slot={}", slot);
+    }
+
+    #[test]
+    fn test_set_ideal_ref_roundtrip() {
+        let mut s = mk_empty_scheduler();
+        let mut p = SpawnParams::default();
+        p.guest_tid = 1;
+        p.thread_handle = 0x1000;
+        p.affinity_mask = 0b0000_0001;
+        p.pcr_base = 0x4000_0000;
+        s.spawn(p, &mut NullPcr).unwrap();
+        let r = ThreadRef { hw_id: 0, idx: 0, generation: 0 };
+        assert_eq!(s.ideal_ref(r), None, "no ideal at spawn");
+        let prev = s.set_ideal_ref(r, 4);
+        assert_eq!(prev, 0xFF, "unset previous returns 0xFF sentinel");
+        assert_eq!(s.ideal_ref(r), Some(4));
+        let prev = s.set_ideal_ref(r, 2);
+        assert_eq!(prev, 4);
+        assert_eq!(s.ideal_ref(r), Some(2));
+    }
+
+    // ---- Axis 4: affinity migration tests ----
+
+    #[test]
+    fn test_affinity_change_migrates_to_new_slot() {
+        let mut s = mk_empty_scheduler();
+        let mut p = SpawnParams::default();
+        p.guest_tid = 1;
+        p.thread_handle = 0x1000;
+        p.affinity_mask = 0xFF;
+        p.pcr_base = 0x4000_0000;
+        s.spawn(p, &mut NullPcr).unwrap();
+        // Landed on slot 0 (least-depth + lowest-index tiebreak).
+        assert_eq!(s.slots[0].runqueue.len(), 1);
+        let r = ThreadRef { hw_id: 0, idx: 0, generation: 0 };
+        // Restrict to slot 2 only.
+        let (old, new_ref, _fx) = s.set_affinity_ref(r, 0b0000_0100, &mut NullPcr);
+        assert_eq!(old, 0xFF);
+        assert_eq!(new_ref, ThreadRef { hw_id: 2, idx: 0, generation: 0 });
+        assert!(s.slots[0].runqueue.is_empty());
+        assert_eq!(s.slots[2].runqueue.len(), 1);
+    }
+
+    #[test]
+    fn test_affinity_change_stays_put_when_current_allowed() {
+        let mut s = mk_empty_scheduler();
+        let mut p = SpawnParams::default();
+        p.guest_tid = 1;
+        p.thread_handle = 0x1000;
+        p.affinity_mask = 0b0000_1000;
+        p.pcr_base = 0x4000_0000;
+        s.spawn(p, &mut NullPcr).unwrap();
+        // Landed on slot 3 (only bit set).
+        let r = ThreadRef { hw_id: 3, idx: 0, generation: 0 };
+        assert_eq!(s.thread(r).tid, 1);
+        // Expand mask to 0..3 — slot 3 still allowed, no migration.
+        let (_old, new_ref, _fx) = s.set_affinity_ref(r, 0b0000_1111, &mut NullPcr);
+        assert_eq!(new_ref, r);
+    }
+
+    #[test]
+    fn test_affinity_migration_rewrites_pcr() {
+        let mut s = mk_empty_scheduler();
+        let mut p = SpawnParams::default();
+        p.guest_tid = 1;
+        p.thread_handle = 0x1000;
+        p.affinity_mask = 0b0000_0010;
+        p.pcr_base = 0x4100_0000;
+        s.spawn(p, &mut NullPcr).unwrap();
+        let r = ThreadRef { hw_id: 1, idx: 0, generation: 0 };
+        let mut rec = RecordingPcr::default();
+        let (_old, _new, _fx) = s.set_affinity_ref(r, 0b0001_0000, &mut rec);
+        // Migration target = slot 4 (the only bit set).
+        assert_eq!(rec.writes, vec![(0x4100_0000, 4)]);
+    }
+
+    #[test]
+    fn test_affinity_mask_zero_treated_as_any() {
+        let mut s = mk_empty_scheduler();
+        let mut p = SpawnParams::default();
+        p.guest_tid = 1;
+        p.thread_handle = 0x1000;
+        p.affinity_mask = 0b0000_0100;
+        p.pcr_base = 0x4000_0000;
+        s.spawn(p, &mut NullPcr).unwrap();
+        let r = ThreadRef { hw_id: 2, idx: 0, generation: 0 };
+        // mask=0 normalizes to 0xFF; slot 2 is still allowed → no migration.
+        let (old, new_ref, _fx) = s.set_affinity_ref(r, 0, &mut NullPcr);
+        assert_eq!(old, 0b0000_0100);
+        assert_eq!(new_ref, r);
+        // Verify the stored mask is 0 (we save the raw value) even though
+        // the effective is 0xFF.
+        assert_eq!(s.thread(r).affinity_mask, 0);
+    }
+
+    #[test]
+    fn test_affinity_migration_fixup_retargets_promoted_peer() {
+        // Two threads on slot 0: A (idx 0), B (idx 1). Migrate A to
+        // slot 3 — swap_remove moves B from idx 1 to idx 0. A ref that
+        // previously pointed at B (idx 1) must be retargeted to idx 0.
+        let mut s = mk_empty_scheduler();
+        for tid in [1u32, 2] {
+            let mut p = SpawnParams::default();
+            p.guest_tid = tid;
+            p.thread_handle = 0x1000 + tid * 4;
+            p.affinity_mask = 0b0000_0001;
+            p.pcr_base = 0x4000_0000 + tid * 0x1000;
+            s.spawn(p, &mut NullPcr).unwrap();
+        }
+        let a_ref = ThreadRef { hw_id: 0, idx: 0, generation: 0 };
+        let b_ref_before = ThreadRef { hw_id: 0, idx: 1, generation: 0 };
+        assert_eq!(s.thread(b_ref_before).tid, 2);
+        let (_old, a_new, fx) = s.set_affinity_ref(a_ref, 0b0000_1000, &mut NullPcr);
+        let fx = fx.expect("migration emits fixup");
+        // A now lives on slot 3 idx 0.
+        assert_eq!(a_new, ThreadRef { hw_id: 3, idx: 0, generation: 0 });
+        // Apply fixup to B's old ref → promoted into slot 0 idx 0.
+        let mut stale = b_ref_before;
+        fx.apply(&mut stale);
+        assert_eq!(stale, ThreadRef { hw_id: 0, idx: 0, generation: 0 });
+        assert_eq!(s.thread(stale).tid, 2);
+    }
+    // ---- Axis 3: quantum tests ----
+
+    #[test]
+    fn test_quantum_rotation_within_slot() {
+        let mut s = mk_empty_scheduler();
+        // A and B both on slot 0 at priority 0.
+        for tid in [1u32, 2] {
+            let mut p = SpawnParams::default();
+            p.guest_tid = tid;
+            p.thread_handle = 0x1000 + tid * 4;
+            p.affinity_mask = 0b0001;
+            p.pcr_base = 0x4000_0000 + tid * 0x1000;
+            s.spawn(p, &mut NullPcr).unwrap();
+        }
+        s.begin_slot_visit(0);
+        let first_tid = s.thread(s.current.unwrap()).tid;
+        // Drain the quantum. Each call bar the last returns false.
+        for _ in 0..(QUANTUM_DEFAULT - 1) {
+            assert!(!s.decrement_quantum());
+        }
+        // The final decrement flips running_idx to the peer.
+        assert!(s.decrement_quantum());
+        let second_tid = s.thread(s.current.unwrap()).tid;
+        assert_ne!(first_tid, second_tid, "quantum expiry rotates to peer");
+    }
+
+    #[test]
+    fn test_quantum_does_not_rotate_without_same_priority_peer() {
+        let mut s = mk_empty_scheduler();
+        // A priority 0, B priority 5 — B wins pick_runnable outright, so
+        // quantum expiry on A shouldn't flip to B (priority ordering
+        // handles that next round instead).
+        let mut pa = SpawnParams::default();
+        pa.guest_tid = 1;
+        pa.thread_handle = 0x1000;
+        pa.affinity_mask = 0b0001;
+        pa.pcr_base = 0x4000_0000;
+        pa.priority = 0;
+        s.spawn(pa, &mut NullPcr).unwrap();
+        let mut pb = SpawnParams::default();
+        pb.guest_tid = 2;
+        pb.thread_handle = 0x1004;
+        pb.affinity_mask = 0b0001;
+        pb.pcr_base = 0x4000_1000;
+        pb.priority = 5;
+        s.spawn(pb, &mut NullPcr).unwrap();
+        // Force A to be running (pick_runnable would actually pick B;
+        // drive the test by manually setting current).
+        s.begin_slot_visit(0);
+        // pick_runnable selects the priority-5 thread (tid=2) because max_by_key
+        // returns highest priority. Set running to A (tid=1, idx=0) manually.
+        s.slots[0].running_idx = Some(0);
+        s.current = Some(ThreadRef { hw_id: 0, idx: 0, generation: 0 });
+        // Drain A's quantum; should reload to DEFAULT but not rotate
+        // (B is higher priority, not equal).
+        for _ in 0..QUANTUM_DEFAULT {
+            let _ = s.decrement_quantum();
+        }
+        let t = s.thread(s.current.unwrap());
+        assert_eq!(t.tid, 1, "stays on A; B has higher priority, not equal");
+        assert_eq!(t.quantum_remaining, QUANTUM_DEFAULT, "quantum reloaded");
+    }
+
+    #[test]
+    fn test_cooperative_yield_does_not_need_quantum() {
+        let mut s = mk_empty_scheduler();
+        for tid in [1u32, 2] {
+            let mut p = SpawnParams::default();
+            p.guest_tid = tid;
+            p.thread_handle = 0x1000 + tid * 4;
+            p.affinity_mask = 0b0001;
+            p.pcr_base = 0x4000_0000 + tid * 0x1000;
+            s.spawn(p, &mut NullPcr).unwrap();
+        }
+        s.begin_slot_visit(0);
+        let first_tid = s.thread(s.current.unwrap()).tid;
+        // Park via cooperative yield.
+        s.park_current(BlockReason::DelayUntil(1_000_000));
+        s.end_slot_visit();
+        // Next round: pick_runnable skips the Blocked one, so the other
+        // thread is selected.
+        s.begin_slot_visit(0);
+        let next_tid = s.thread(s.current.unwrap()).tid;
+        assert_ne!(first_tid, next_tid, "cooperative park switches thread");
+    }
+
+    #[test]
+    fn test_wake_ref_resets_quantum() {
+        let mut s = mk_empty_scheduler();
+        let mut p = SpawnParams::default();
+        p.guest_tid = 2;
+        p.thread_handle = 0x2000;
+        p.affinity_mask = 0b0010;
+        p.pcr_base = 0x4000_1000;
+        s.spawn(p, &mut NullPcr).unwrap();
+        let r = ThreadRef { hw_id: 1, idx: 0, generation: 0 };
+        // Park, poke quantum to 1, wake ⇒ quantum back to DEFAULT.
+        s.thread_mut(r).state = HwState::Blocked(BlockReason::WaitAny {
+            handles: vec![0xDEAD],
+            deadline: None,
+        });
+        s.thread_mut(r).quantum_remaining = 1;
+        s.wake_ref(r);
+        assert_eq!(s.thread(r).quantum_remaining, QUANTUM_DEFAULT);
+    }
+
+    #[test]
+    fn test_wake_ref_restores_ready_and_quantum() {
+        let mut s = mk_scheduler_with_initial();
+        s.spawn(worker_spawn_params(2, 0x2000), &mut NullPcr).unwrap();
+        let r = ThreadRef { hw_id: 1, idx: 0, generation: 0 };
+        // Park then wake.
+        s.thread_mut(r).state = HwState::Blocked(BlockReason::WaitAny {
+            handles: vec![0x1234],
+            deadline: None,
+        });
+        s.thread_mut(r).quantum_remaining = 1;
+        s.wake_ref(r);
+        assert_eq!(s.thread(r).state, HwState::Ready);
+        assert_eq!(s.thread(r).quantum_remaining, QUANTUM_DEFAULT);
+    }
+}
diff --git a/crates/xenia-cpu/src/trap.rs b/crates/xenia-cpu/src/trap.rs
new file mode 100644
index 0000000..5889b71
--- /dev/null
+++ b/crates/xenia-cpu/src/trap.rs
@@ -0,0 +1,95 @@
+//! TO-field evaluation for `tw`, `twi`, `td`, `tdi`.
+//!
+//! The TO field (5 bits) encodes which comparison outcomes trigger a trap:
+//!
+//! | bit | condition |
+//! |-----|-----------|
+//! | 0   | a <  b (signed)   |
+//! | 1   | a >  b (signed)   |
+//! | 2   | a == b            |
+//! | 3   | a <  b (unsigned) |
+//! | 4   | a >  b (unsigned) |
+//!
+//! The bit numbering matches PowerISA ("MSB is bit 0"): TO[0] corresponds to
+//! the high bit of the 5-bit field, i.e. (to >> 4) & 1.
+//!
+//! `tw` / `twi` compare the low 32 bits of the operands (sign-extended back to
+//! 64 for the signed comparison); `td` / `tdi` compare the full 64 bits.
+
+#[derive(Clone, Copy, Debug)]
+pub enum TrapWidth {
+    Word,        // tw, twi: 32-bit
+    Doubleword,  // td, tdi: 64-bit
+}
+
+const TO_SLT: u32 = 1 << 4; // a < b  signed
+const TO_SGT: u32 = 1 << 3; // a > b  signed
+const TO_EQ:  u32 = 1 << 2; // a == b
+const TO_ULT: u32 = 1 << 1; // a < b  unsigned
+const TO_UGT: u32 = 1 << 0; // a > b  unsigned
+
+/// Returns true when the trap should fire.
+pub fn evaluate(to: u32, a: u64, b: u64, width: TrapWidth) -> bool {
+    let (sa, sb, ua, ub): (i64, i64, u64, u64) = match width {
+        TrapWidth::Word => (
+            a as i32 as i64,
+            b as i32 as i64,
+            a as u32 as u64,
+            b as u32 as u64,
+        ),
+        TrapWidth::Doubleword => (a as i64, b as i64, a, b),
+    };
+
+    if (to & TO_SLT) != 0 && sa <  sb { return true; }
+    if (to & TO_SGT) != 0 && sa >  sb { return true; }
+    if (to & TO_EQ)  != 0 && ua == ub { return true; }
+    if (to & TO_ULT) != 0 && ua <  ub { return true; }
+    if (to & TO_UGT) != 0 && ua >  ub { return true; }
+    false
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn to_zero_never_traps() {
+        assert!(!evaluate(0, 0, 0, TrapWidth::Doubleword));
+        assert!(!evaluate(0, 5, 3, TrapWidth::Doubleword));
+        assert!(!evaluate(0, !0, 0, TrapWidth::Doubleword));
+    }
+
+    #[test]
+    fn to_31_always_traps_when_any_condition_holds() {
+        // 31 = 0b11111 = all conditions enabled
+        assert!(evaluate(31, 1, 2, TrapWidth::Doubleword)); // slt+ult
+        assert!(evaluate(31, 2, 1, TrapWidth::Doubleword)); // sgt+ugt
+        assert!(evaluate(31, 7, 7, TrapWidth::Doubleword)); // eq
+    }
+
+    #[test]
+    fn to_eq_only() {
+        // TO[2] = 0b00100 = 4
+        assert!(evaluate(4, 5, 5, TrapWidth::Doubleword));
+        assert!(!evaluate(4, 5, 6, TrapWidth::Doubleword));
+    }
+
+    #[test]
+    fn to_signed_vs_unsigned_on_negative() {
+        // a=-1 (as u64 = all-ones). TO[0]=slt enabled = 0b10000 = 16
+        // Signed: -1 < 0  → true
+        let neg1 = (-1i64) as u64;
+        assert!(evaluate(16, neg1, 0, TrapWidth::Doubleword));
+        // TO[3]=ult enabled = 0b00010 = 2 → unsigned: all-ones < 0 is false
+        assert!(!evaluate(2, neg1, 0, TrapWidth::Doubleword));
+    }
+
+    #[test]
+    fn word_width_ignores_high_32_bits() {
+        // a's low 32 = 1, high 32 = different; b = 1. With TO=eq, should trap.
+        let a = 0xDEAD_BEEF_0000_0001u64;
+        assert!(evaluate(4, a, 1, TrapWidth::Word));
+        // In doubleword, different.
+        assert!(!evaluate(4, a, 1, TrapWidth::Doubleword));
+    }
+}
diff --git a/crates/xenia-cpu/src/vmx.rs b/crates/xenia-cpu/src/vmx.rs
new file mode 100644
index 0000000..5650a34
--- /dev/null
+++ b/crates/xenia-cpu/src/vmx.rs
@@ -0,0 +1,918 @@
+//! VMX / AltiVec helper routines shared by the interpreter's 150+ vector
+//! opcode handlers.
+//!
+//! Big-endian lane indexing throughout: `Vec128::bytes[0]` is the most
+//! significant byte, which corresponds to PowerPC lane 0. Operations that
+//! care about "even" vs "odd" lanes follow the PPC convention (lane 0 = most
+//! significant = "even" for multiply-even/odd purposes).
+
+use xenia_memory::MemoryAccess;
+use xenia_types::Vec128;
+
+// ─── Lane accessors ────────────────────────────────────────────────────────
+
+#[inline] pub fn as_i8x16(v: Vec128) -> [i8; 16] {
+    let b = v.as_bytes();
+    let mut r = [0i8; 16];
+    for i in 0..16 { r[i] = b[i] as i8; }
+    r
+}
+
+#[inline] pub fn as_i16x8(v: Vec128) -> [i16; 8] {
+    let u = v.as_u16x8();
+    [u[0] as i16, u[1] as i16, u[2] as i16, u[3] as i16,
+     u[4] as i16, u[5] as i16, u[6] as i16, u[7] as i16]
+}
+
+#[inline] pub fn as_i32x4(v: Vec128) -> [i32; 4] {
+    let u = v.as_u32x4();
+    [u[0] as i32, u[1] as i32, u[2] as i32, u[3] as i32]
+}
+
+#[inline] pub fn from_i8x16(r: [i8; 16]) -> Vec128 {
+    let mut b = [0u8; 16];
+    for i in 0..16 { b[i] = r[i] as u8; }
+    Vec128::from_bytes(b)
+}
+
+#[inline] pub fn from_i16x8(r: [i16; 8]) -> Vec128 {
+    Vec128::from_u16x8_array([
+        r[0] as u16, r[1] as u16, r[2] as u16, r[3] as u16,
+        r[4] as u16, r[5] as u16, r[6] as u16, r[7] as u16,
+    ])
+}
+
+#[inline] pub fn from_i32x4(r: [i32; 4]) -> Vec128 {
+    Vec128::from_u32x4_array([r[0] as u32, r[1] as u32, r[2] as u32, r[3] as u32])
+}
+
+// ─── Saturation helpers ────────────────────────────────────────────────────
+// Each returns (clamped_value, saturated_flag). Handlers OR the flags together
+// and call `ctx.set_vscr_sat(true)` once per instruction.
+
+#[inline] pub fn sat_add_u8(a: u8, b: u8) -> (u8, bool) {
+    let s = a as u16 + b as u16;
+    if s > u8::MAX as u16 { (u8::MAX, true) } else { (s as u8, false) }
+}
+#[inline] pub fn sat_sub_u8(a: u8, b: u8) -> (u8, bool) {
+    if a >= b { (a - b, false) } else { (0, true) }
+}
+#[inline] pub fn sat_add_i8(a: i8, b: i8) -> (i8, bool) {
+    let s = a as i16 + b as i16;
+    if s > i8::MAX as i16 { (i8::MAX, true) }
+    else if s < i8::MIN as i16 { (i8::MIN, true) }
+    else { (s as i8, false) }
+}
+#[inline] pub fn sat_sub_i8(a: i8, b: i8) -> (i8, bool) {
+    let s = a as i16 - b as i16;
+    if s > i8::MAX as i16 { (i8::MAX, true) }
+    else if s < i8::MIN as i16 { (i8::MIN, true) }
+    else { (s as i8, false) }
+}
+
+#[inline] pub fn sat_add_u16(a: u16, b: u16) -> (u16, bool) {
+    let s = a as u32 + b as u32;
+    if s > u16::MAX as u32 { (u16::MAX, true) } else { (s as u16, false) }
+}
+#[inline] pub fn sat_sub_u16(a: u16, b: u16) -> (u16, bool) {
+    if a >= b { (a - b, false) } else { (0, true) }
+}
+#[inline] pub fn sat_add_i16(a: i16, b: i16) -> (i16, bool) {
+    let s = a as i32 + b as i32;
+    if s > i16::MAX as i32 { (i16::MAX, true) }
+    else if s < i16::MIN as i32 { (i16::MIN, true) }
+    else { (s as i16, false) }
+}
+#[inline] pub fn sat_sub_i16(a: i16, b: i16) -> (i16, bool) {
+    let s = a as i32 - b as i32;
+    if s > i16::MAX as i32 { (i16::MAX, true) }
+    else if s < i16::MIN as i32 { (i16::MIN, true) }
+    else { (s as i16, false) }
+}
+
+#[inline] pub fn sat_add_u32(a: u32, b: u32) -> (u32, bool) {
+    let s = a as u64 + b as u64;
+    if s > u32::MAX as u64 { (u32::MAX, true) } else { (s as u32, false) }
+}
+#[inline] pub fn sat_sub_u32(a: u32, b: u32) -> (u32, bool) {
+    if a >= b { (a - b, false) } else { (0, true) }
+}
+#[inline] pub fn sat_add_i32(a: i32, b: i32) -> (i32, bool) {
+    let s = a as i64 + b as i64;
+    if s > i32::MAX as i64 { (i32::MAX, true) }
+    else if s < i32::MIN as i64 { (i32::MIN, true) }
+    else { (s as i32, false) }
+}
+#[inline] pub fn sat_sub_i32(a: i32, b: i32) -> (i32, bool) {
+    let s = a as i64 - b as i64;
+    if s > i32::MAX as i64 { (i32::MAX, true) }
+    else if s < i32::MIN as i64 { (i32::MIN, true) }
+    else { (s as i32, false) }
+}
+
+// Pack-with-saturation helpers — clamp a wider integer to the narrower type.
+#[inline] pub fn sat_i16_to_i8(v: i16) -> (i8, bool) {
+    if v > i8::MAX as i16 { (i8::MAX, true) }
+    else if v < i8::MIN as i16 { (i8::MIN, true) }
+    else { (v as i8, false) }
+}
+#[inline] pub fn sat_i16_to_u8(v: i16) -> (u8, bool) {
+    if v < 0 { (0, true) }
+    else if v > u8::MAX as i16 { (u8::MAX, true) }
+    else { (v as u8, false) }
+}
+#[inline] pub fn sat_u16_to_u8(v: u16) -> (u8, bool) {
+    if v > u8::MAX as u16 { (u8::MAX, true) } else { (v as u8, false) }
+}
+#[inline] pub fn sat_i32_to_i16(v: i32) -> (i16, bool) {
+    if v > i16::MAX as i32 { (i16::MAX, true) }
+    else if v < i16::MIN as i32 { (i16::MIN, true) }
+    else { (v as i16, false) }
+}
+#[inline] pub fn sat_i32_to_u16(v: i32) -> (u16, bool) {
+    if v < 0 { (0, true) }
+    else if v > u16::MAX as i32 { (u16::MAX, true) }
+    else { (v as u16, false) }
+}
+#[inline] pub fn sat_u32_to_u16(v: u32) -> (u16, bool) {
+    if v > u16::MAX as u32 { (u16::MAX, true) } else { (v as u16, false) }
+}
+#[inline] pub fn sat_i64_to_i32(v: i64) -> (i32, bool) {
+    if v > i32::MAX as i64 { (i32::MAX, true) }
+    else if v < i32::MIN as i64 { (i32::MIN, true) }
+    else { (v as i32, false) }
+}
+#[inline] pub fn sat_i64_to_u32(v: i64) -> (u32, bool) {
+    if v < 0 { (0, true) }
+    else if v > u32::MAX as i64 { (u32::MAX, true) }
+    else { (v as u32, false) }
+}
+
+// ─── Averages ──────────────────────────────────────────────────────────────
+// PPC avg is rounded up: (a + b + 1) / 2.
+#[inline] pub fn avg_u8(a: u8, b: u8) -> u8 {
+    ((a as u16 + b as u16 + 1) >> 1) as u8
+}
+#[inline] pub fn avg_u16(a: u16, b: u16) -> u16 {
+    ((a as u32 + b as u32 + 1) >> 1) as u16
+}
+#[inline] pub fn avg_u32(a: u32, b: u32) -> u32 {
+    ((a as u64 + b as u64 + 1) >> 1) as u32
+}
+#[inline] pub fn avg_i8(a: i8, b: i8) -> i8 {
+    ((a as i32 + b as i32 + 1) >> 1) as i8
+}
+#[inline] pub fn avg_i16(a: i16, b: i16) -> i16 {
+    ((a as i32 + b as i32 + 1) >> 1) as i16
+}
+#[inline] pub fn avg_i32(a: i32, b: i32) -> i32 {
+    ((a as i64 + b as i64 + 1) >> 1) as i32
+}
+
+// ─── NaN-aware f32 min/max for vmaxfp / vminfp ────────────────────────────
+//
+// Altivec PEM: "If either element of vA or vB is a NaN, the corresponding
+// element of vD is set to the quiet NaN form of that NaN". Rust's `>` / `<`
+// comparison with NaN always returns false, so `if a > b { a } else { b }`
+// would silently pick `b` whenever `a` is NaN — losing NaN propagation.
+
+#[inline]
+pub fn max_nan(a: f32, b: f32) -> f32 {
+    if a.is_nan() { quiet_nan(a) }
+    else if b.is_nan() { quiet_nan(b) }
+    else if a > b { a } else { b }
+}
+
+#[inline]
+pub fn min_nan(a: f32, b: f32) -> f32 {
+    if a.is_nan() { quiet_nan(a) }
+    else if b.is_nan() { quiet_nan(b) }
+    else if a < b { a } else { b }
+}
+
+/// Convert an SNaN to QNaN by setting the high mantissa bit. A QNaN is
+/// returned unchanged.
+#[inline]
+pub fn quiet_nan(x: f32) -> f32 {
+    if !x.is_nan() { return x; }
+    f32::from_bits(x.to_bits() | 0x0040_0000)
+}
+
+/// Flush a subnormal f32 to ±0 (preserving the sign). Used by vmaddfp family,
+/// vctsxs / vctuxs, and any instruction whose AltiVec definition specifies
+/// input-side denormal flushing regardless of VSCR[NJ].
+#[inline]
+pub fn flush_denorm(x: f32) -> f32 {
+    if x.is_subnormal() {
+        if x.is_sign_negative() { -0.0 } else { 0.0 }
+    } else {
+        x
+    }
+}
+
+// ─── Float ⇄ fixed-point conversions (scaled by 2^scale_bits) ─────────────
+//
+// vctsxs / vctuxs flush denormal inputs to 0 before scaling, per Altivec.
+#[inline] pub fn cvt_f32_to_i32_sat(x: f32, scale_bits: u32) -> (i32, bool) {
+    if x.is_nan() { return (0, true); }
+    let x = flush_denorm(x);
+    let scaled = (x as f64) * ((1u64 << scale_bits) as f64);
+    if scaled >= i32::MAX as f64 { return (i32::MAX, true); }
+    if scaled <= i32::MIN as f64 { return (i32::MIN, true); }
+    (scaled.trunc() as i32, false)
+}
+#[inline] pub fn cvt_f32_to_u32_sat(x: f32, scale_bits: u32) -> (u32, bool) {
+    if x.is_nan() { return (0, true); }
+    let x = flush_denorm(x);
+    let scaled = (x as f64) * ((1u64 << scale_bits) as f64);
+    if scaled < 0.0 { return (0, true); }
+    if scaled > u32::MAX as f64 { return (u32::MAX, true); }
+    (scaled.trunc() as u32, false)
+}
+#[inline] pub fn cvt_i32_to_f32(v: i32, scale_bits: u32) -> f32 {
+    (v as f64 / (1u64 << scale_bits) as f64) as f32
+}
+#[inline] pub fn cvt_u32_to_f32(v: u32, scale_bits: u32) -> f32 {
+    (v as f64 / (1u64 << scale_bits) as f64) as f32
+}
+
+// ─── Unaligned vector load/store ──────────────────────────────────────────
+//
+// lvlx/lvrx and stvlx/stvrx combine to perform any unaligned 16-byte access:
+//   lvlx(EA) | lvrx(EA + 16)   loads 16 bytes starting at unaligned EA.
+//   stvlx(EA); stvrx(EA + 16)   stores 16 bytes starting at unaligned EA.
+//
+// Semantics per the AltiVec manual (and xenia-canary ppc_emit_memory.cc):
+//   lvlx: shift = EA & 0xF, n = 16 - shift. Loads mem[EA..EA+n] into
+//         lanes VR[0..n], zeros VR[n..16].
+//   lvrx: shift = EA & 0xF. If shift == 0, VR = 0. Otherwise loads
+//         mem[EA-shift..EA] into lanes VR[16-shift..16], zeros VR[0..16-shift].
+//   stvlx / stvrx are the symmetric stores.
+//
+// `Vec128::bytes[0]` is the most significant byte (PPC lane 0 in BE view).
+
+pub fn load_vector_left(mem: &dyn MemoryAccess, ea: u32) -> Vec128 {
+    let shift = (ea & 0xF) as usize;
+    let n = 16 - shift;
+    let mut bytes = [0u8; 16];
+    for i in 0..n {
+        bytes[i] = mem.read_u8(ea.wrapping_add(i as u32));
+    }
+    Vec128::from_bytes(bytes)
+}
+
+pub fn load_vector_right(mem: &dyn MemoryAccess, ea: u32) -> Vec128 {
+    let shift = (ea & 0xF) as usize;
+    if shift == 0 { return Vec128::ZERO; }
+    let base = ea & !0xFu32;
+    let mut bytes = [0u8; 16];
+    for i in 0..shift {
+        bytes[16 - shift + i] = mem.read_u8(base.wrapping_add(i as u32));
+    }
+    Vec128::from_bytes(bytes)
+}
+
+pub fn store_vector_left(mem: &dyn MemoryAccess, ea: u32, v: Vec128) {
+    let shift = (ea & 0xF) as usize;
+    let n = 16 - shift;
+    let b = v.as_bytes();
+    for i in 0..n {
+        mem.write_u8(ea.wrapping_add(i as u32), b[i]);
+    }
+}
+
+pub fn store_vector_right(mem: &dyn MemoryAccess, ea: u32, v: Vec128) {
+    let shift = (ea & 0xF) as usize;
+    if shift == 0 { return; }
+    let base = ea & !0xFu32;
+    let b = v.as_bytes();
+    for i in 0..shift {
+        mem.write_u8(base.wrapping_add(i as u32), b[16 - shift + i]);
+    }
+}
+
+// ─── 5-6-5 pixel pack (vpkpx / vupkhpx / vupklpx) ─────────────────────────
+// PPC vpkpx takes a 32-bit RGB lane and packs it into a 16-bit 1-5-5-5 pixel.
+// vupkhpx / vupklpx reverse the operation.
+//
+// Format: input 32-bit word holds
+//     bits 0-6: unused (0)
+//     bit 7:    alpha-select (→ bit 15 of output)
+//     bits 8-15:  R (top 5 bits kept)
+//     bits 16-23: G (top 5 bits kept)
+//     bits 24-31: B (top 5 bits kept)
+// Output 16-bit word:
+//     bit 15:   A (from input bit 7)
+//     bits 10-14: R
+//     bits 5-9:   G
+//     bits 0-4:   B
+
+#[inline] pub fn pack_pixel_555(input: u32) -> u16 {
+    let a = (input >> 7) & 0x1;
+    let r = (input >> 8) & 0xFF;
+    let g = (input >> 16) & 0xFF;
+    let b = (input >> 24) & 0xFF;
+    ((a << 15) | ((r & 0xF8) << 7) | ((g & 0xF8) << 2) | ((b & 0xF8) >> 3)) as u16
+}
+
+#[inline] pub fn unpack_pixel_555(input: u16) -> u32 {
+    let input = input as u32;
+    let a = (input >> 15) & 0x1;
+    let r = (input >> 10) & 0x1F;
+    let g = (input >> 5) & 0x1F;
+    let b = input & 0x1F;
+    // Sign-extend A and replicate 5-bit RGB into the top of each byte.
+    let a8 = if a != 0 { 0xFFu32 } else { 0 };
+    let r8 = (r << 3) | (r >> 2);
+    let g8 = (g << 3) | (g >> 2);
+    let b8 = (b << 3) | (b >> 2);
+    (a8 << 24) | (r8 << 16) | (g8 << 8) | b8
+}
+
+// ─── VMX128 D3D pack/unpack dispatch ──────────────────────────────────────
+// `vpkd3d128` / `vupkd3d128` encode a small enum in the instruction word
+// (VX128_4 immediate field). The exact enum lives in canary's
+// ppc_emit_altivec.cc under PACK_TYPE_*; titles usually touch D3DCOLOR
+// (type 0) and a handful of texture-coordinate variants.
+//
+// Rather than risk getting a rarely-used sub-case wrong, we implement the
+// common types and fall back to a warning + pass-through for unknown types.
+// Returning the VB register value unchanged is always preferable to emitting
+// StepResult::Unimplemented because it keeps the interpreter running.
+
+/// Pack-type encoding of `vpkd3d128` / `vupkd3d128`.
+///
+/// The immediate field lives at PPC bits 16-22 (VX128_3/4 IMM, 7 bits).
+/// Canary decodes `type = IMM >> 2` (top 5 bits) and `pack = IMM & 0x3`
+/// (low 2 bits, used only by `vpkd3d128` to select output-slot layout).
+/// Valid `type` values are 0..=6 per `ppc_emit_altivec.cc:2095-2118`:
+///
+/// | id | canary name       | format                                |
+/// |----|-------------------|---------------------------------------|
+/// |  0 | VPACK_D3DCOLOR    | 4 f32 [0,1] ↔ ARGB8                   |
+/// |  1 | VPACK_NORMSHORT2  | 2 f32 [-1,1] ↔ 2× signed-normalized i16 |
+/// |  2 | VPACK_NORMPACKED32| 4 f32 [-1,1] ↔ UINT_2101010 (w:2,z:10,y:10,x:10) |
+/// |  3 | VPACK_FLOAT16_2   | 2 f32 ↔ 2× fp16                       |
+/// |  4 | VPACK_NORMSHORT4  | 4 f32 [-1,1] ↔ 4× signed-normalized i16 |
+/// |  5 | VPACK_FLOAT16_4   | 4 f32 ↔ 4× fp16                       |
+/// |  6 | VPACK_NORMPACKED64| 4 f32 [-1,1] ↔ ULONG_4202020 (w:4,z:20,y:20,x:20) |
+///
+/// Prior (M3-pre) this enum listed made-up "Normal16"/"Normal8"/"UByteN4"
+/// variants that didn't match canary; the immediate extraction was also
+/// wrong (LSB-numbered `>>6 & 0x7` instead of MSB-numbered `>>11 & 0x1F`
+/// against a 7-bit IMM field). M3 fixes both.
+#[derive(Debug, Clone, Copy)]
+pub enum D3dPackType {
+    D3dColor,
+    NormShort2,
+    NormPacked32,
+    Float16_2,
+    NormShort4,
+    Float16_4,
+    NormPacked64,
+    Other(u32),
+}
+
+impl D3dPackType {
+    /// Decode the `type` bits extracted from the VX128_3/4 IMM field via
+    /// canary's `IMM >> 2` convention (i.e. the caller has already divided
+    /// out the 2-bit `pack` subfield).
+    pub fn from_immediate(type_bits: u32) -> Self {
+        match type_bits {
+            0 => Self::D3dColor,
+            1 => Self::NormShort2,
+            2 => Self::NormPacked32,
+            3 => Self::Float16_2,
+            4 => Self::NormShort4,
+            5 => Self::Float16_4,
+            6 => Self::NormPacked64,
+            other => Self::Other(other),
+        }
+    }
+}
+
+/// Pack an f32x4 vector of [R, G, B, A] in [0.0, 1.0] into a single D3DCOLOR
+/// value in lane 3 of the output.
+pub fn pack_d3dcolor(v: Vec128) -> Vec128 {
+    let f = v.as_f32x4();
+    let to_byte = |x: f32| -> u32 {
+        let c = x.clamp(0.0, 1.0) * 255.0;
+        (c + 0.5) as u32 & 0xFF
+    };
+    // D3DCOLOR is A,R,G,B in that byte order inside a u32.
+    let word = (to_byte(f[3]) << 24) | (to_byte(f[0]) << 16) | (to_byte(f[1]) << 8) | to_byte(f[2]);
+    Vec128::from_u32x4(0, 0, 0, word)
+}
+
+/// Unpack a D3DCOLOR value (in lane 3 of the input) into an f32x4 [R, G, B, A].
+pub fn unpack_d3dcolor(v: Vec128) -> Vec128 {
+    let word = v.u32x4(3);
+    let a = ((word >> 24) & 0xFF) as f32 / 255.0;
+    let r = ((word >> 16) & 0xFF) as f32 / 255.0;
+    let g = ((word >> 8) & 0xFF) as f32 / 255.0;
+    let b = (word & 0xFF) as f32 / 255.0;
+    Vec128::from_f32x4(r, g, b, a)
+}
+
+// ───────────────────────────────────────────────────────────────────────
+// First-Pixels M3 — pack/unpack for the remaining canary pack types.
+//
+// Conventions shared across all helpers:
+//  * Input-to-`unpack_*` (packed data) lives in the *source* lane position
+//    canary's HIR assumes: canonically the 32-bit word is in lane 3 and
+//    the 64-bit value straddles lanes 2-3. We match that so the existing
+//    D3DCOLOR helpers' 3-lane convention is preserved across the whole
+//    pack-type family.
+//  * Output-from-`pack_*` sits in the same lane(s). The caller usually
+//    follows with a permute to move it elsewhere (the VX128_4 `pack`
+//    subfield controls that in `vpkd3d128`).
+//  * Range semantics match canary: normalized types use `max` = (1<<N-1)-1
+//    for signed, clamp before rounding.
+// ───────────────────────────────────────────────────────────────────────
+
+#[inline]
+fn norm_to_i16(x: f32) -> i16 {
+    let c = x.clamp(-1.0, 1.0) * 32767.0;
+    // Round half away from zero, matching canary's `vcfsx` semantics.
+    let r = if c >= 0.0 { (c + 0.5) as i32 } else { (c - 0.5) as i32 };
+    r.clamp(-32768, 32767) as i16
+}
+
+#[inline]
+fn i16_to_norm(s: i16) -> f32 {
+    (s as f32) / 32767.0
+}
+
+/// **NORMSHORT2** — 2 f32s in [-1, 1] → two 16-bit signed-normalized
+/// shorts packed as `(x << 16) | y` in lane 3 (high 32 bits of the word
+/// hold X; low 16 hold Y). Output lanes 0..=2 are zero-filled.
+pub fn pack_normshort2(v: Vec128) -> Vec128 {
+    let f = v.as_f32x4();
+    let x = norm_to_i16(f[0]) as u16 as u32;
+    let y = norm_to_i16(f[1]) as u16 as u32;
+    Vec128::from_u32x4(0, 0, 0, (x << 16) | y)
+}
+
+pub fn unpack_normshort2(v: Vec128) -> Vec128 {
+    let word = v.u32x4(3);
+    let x = i16_to_norm((word >> 16) as i16);
+    let y = i16_to_norm(word as i16);
+    Vec128::from_f32x4(x, y, 0.0, 1.0)
+}
+
+/// **NORMSHORT4** — 4 f32s in [-1, 1] → four 16-bit signed-normalized
+/// shorts packed across lanes 2-3 (big-endian dword order: X in the
+/// high word of lane 2, Y low of lane 2, Z high of lane 3, W low of lane
+/// 3).
+pub fn pack_normshort4(v: Vec128) -> Vec128 {
+    let f = v.as_f32x4();
+    let x = norm_to_i16(f[0]) as u16 as u32;
+    let y = norm_to_i16(f[1]) as u16 as u32;
+    let z = norm_to_i16(f[2]) as u16 as u32;
+    let w = norm_to_i16(f[3]) as u16 as u32;
+    Vec128::from_u32x4(0, 0, (x << 16) | y, (z << 16) | w)
+}
+
+pub fn unpack_normshort4(v: Vec128) -> Vec128 {
+    let hi = v.u32x4(2);
+    let lo = v.u32x4(3);
+    let x = i16_to_norm((hi >> 16) as i16);
+    let y = i16_to_norm(hi as i16);
+    let z = i16_to_norm((lo >> 16) as i16);
+    let w = i16_to_norm(lo as i16);
+    Vec128::from_f32x4(x, y, z, w)
+}
+
+/// **NORMPACKED32** — UINT_2101010 layout, 4 f32s in [-1, 1] packed into
+/// 32 bits in lane 3. Per canary's comment `2_10_10_10 w_z_y_x`: the
+/// high 2 bits hold W (signed 2-bit, -2..=1), then Z/Y/X each use 10
+/// signed-normalized bits.
+pub fn pack_normpacked32(v: Vec128) -> Vec128 {
+    let f = v.as_f32x4();
+    #[inline]
+    fn n10(x: f32) -> u32 {
+        let c = x.clamp(-1.0, 1.0) * 511.0;
+        let r = if c >= 0.0 { (c + 0.5) as i32 } else { (c - 0.5) as i32 };
+        (r.clamp(-512, 511) as i32 as u32) & 0x3FF
+    }
+    #[inline]
+    fn n2(x: f32) -> u32 {
+        let c = x.clamp(-1.0, 1.0) * 1.0;
+        let r = if c >= 0.0 { (c + 0.5) as i32 } else { (c - 0.5) as i32 };
+        (r.clamp(-2, 1) as i32 as u32) & 0x3
+    }
+    let x = n10(f[0]);
+    let y = n10(f[1]);
+    let z = n10(f[2]);
+    let w = n2(f[3]);
+    let word = (w << 30) | (z << 20) | (y << 10) | x;
+    Vec128::from_u32x4(0, 0, 0, word)
+}
+
+pub fn unpack_normpacked32(v: Vec128) -> Vec128 {
+    let word = v.u32x4(3);
+    #[inline]
+    fn u10_to_norm(bits: u32) -> f32 {
+        // Sign-extend the 10-bit field then normalize.
+        let s = ((bits & 0x3FF) as i32) << 22 >> 22;
+        (s as f32) / 511.0
+    }
+    #[inline]
+    fn u2_to_norm(bits: u32) -> f32 {
+        let s = ((bits & 0x3) as i32) << 30 >> 30;
+        (s as f32).clamp(-1.0, 1.0)
+    }
+    let x = u10_to_norm(word);
+    let y = u10_to_norm(word >> 10);
+    let z = u10_to_norm(word >> 20);
+    let w = u2_to_norm(word >> 30);
+    Vec128::from_f32x4(x, y, z, w)
+}
+
+/// **NORMPACKED64** — ULONG_4202020, 4 f32s in [-1, 1] packed into 64
+/// bits across lanes 2-3. Per canary's comment `4_20_20_20 w_z_y_x`:
+/// the high 4 bits of the dword hold W (signed 4-bit); the remaining 60
+/// bits hold 3× 20-bit signed-normalized Z/Y/X. Rare outside very few
+/// titles (canary notes 54540829).
+pub fn pack_normpacked64(v: Vec128) -> Vec128 {
+    let f = v.as_f32x4();
+    #[inline]
+    fn n20(x: f32) -> u64 {
+        let c = x.clamp(-1.0, 1.0) * 524287.0; // 2^19 - 1
+        let r = if c >= 0.0 { (c + 0.5) as i64 } else { (c - 0.5) as i64 };
+        (r.clamp(-524288, 524287) as i64 as u64) & 0xF_FFFF
+    }
+    #[inline]
+    fn n4(x: f32) -> u64 {
+        let c = x.clamp(-1.0, 1.0) * 7.0;
+        let r = if c >= 0.0 { (c + 0.5) as i64 } else { (c - 0.5) as i64 };
+        (r.clamp(-8, 7) as i64 as u64) & 0xF
+    }
+    let x = n20(f[0]);
+    let y = n20(f[1]);
+    let z = n20(f[2]);
+    let w = n4(f[3]);
+    let dw: u64 = (w << 60) | (z << 40) | (y << 20) | x;
+    Vec128::from_u32x4(0, 0, (dw >> 32) as u32, dw as u32)
+}
+
+pub fn unpack_normpacked64(v: Vec128) -> Vec128 {
+    let hi = v.u32x4(2) as u64;
+    let lo = v.u32x4(3) as u64;
+    let dw = (hi << 32) | lo;
+    #[inline]
+    fn u20_to_norm(bits: u64) -> f32 {
+        let s = ((bits & 0xF_FFFF) as i64) << 44 >> 44;
+        (s as f32) / 524287.0
+    }
+    #[inline]
+    fn u4_to_norm(bits: u64) -> f32 {
+        let s = ((bits & 0xF) as i64) << 60 >> 60;
+        (s as f32) / 7.0
+    }
+    let x = u20_to_norm(dw);
+    let y = u20_to_norm(dw >> 20);
+    let z = u20_to_norm(dw >> 40);
+    let w = u4_to_norm(dw >> 60);
+    Vec128::from_f32x4(x, y, z, w)
+}
+
+/// IEEE 754 half-precision float pack/unpack — used by both FLOAT16_2
+/// and FLOAT16_4. No FMA quirks involved; we go via `f32::to_bits` and
+/// manual bit-twiddling (the stable-Rust `f16` type isn't available
+/// yet).
+#[inline]
+fn f32_to_f16_bits(f: f32) -> u16 {
+    let bits = f.to_bits();
+    let sign = ((bits >> 31) & 0x1) as u16;
+    let exp = ((bits >> 23) & 0xFF) as i32;
+    let mant = bits & 0x7FFFFF;
+    // Handle the easy cases first.
+    if exp == 0xFF {
+        // NaN or infinity.
+        let half_exp = 0x1F;
+        let half_mant = if mant != 0 { 0x200 } else { 0 }; // quiet NaN / zero mantissa for Inf
+        return (sign << 15) | (half_exp << 10) | half_mant;
+    }
+    let unbiased_exp = exp - 127;
+    if unbiased_exp >= 16 {
+        // Overflow → infinity.
+        return (sign << 15) | (0x1F << 10);
+    }
+    if unbiased_exp <= -15 {
+        // Denormal or zero. Compute the shift and subnormal mantissa;
+        // anything too small flushes to signed zero.
+        if unbiased_exp < -24 {
+            return sign << 15;
+        }
+        let shift = -14 - unbiased_exp as i32; // amount to shift the implicit-1'd mantissa
+        let full_mant = 0x800000 | mant; // 24 bits with implicit leading 1
+        let half_mant = (full_mant >> (shift + 13)) as u16;
+        return (sign << 15) | half_mant;
+    }
+    let half_exp = ((unbiased_exp + 15) as u16) & 0x1F;
+    let half_mant = (mant >> 13) as u16;
+    (sign << 15) | (half_exp << 10) | half_mant
+}
+
+#[inline]
+fn f16_bits_to_f32(h: u16) -> f32 {
+    let sign = ((h >> 15) & 0x1) as u32;
+    let exp = ((h >> 10) & 0x1F) as i32;
+    let mant = (h & 0x3FF) as u32;
+    let bits = if exp == 0x1F {
+        // NaN or infinity.
+        let f32_exp = 0xFFu32;
+        let f32_mant = if mant != 0 { 0x400000 } else { 0 };
+        (sign << 31) | (f32_exp << 23) | f32_mant
+    } else if exp == 0 && mant == 0 {
+        // Signed zero.
+        sign << 31
+    } else if exp == 0 {
+        // Subnormal — renormalize.
+        let mut e = -14i32;
+        let mut m = mant;
+        while (m & 0x400) == 0 {
+            m <<= 1;
+            e -= 1;
+        }
+        let f32_exp = ((e + 127) as u32) & 0xFF;
+        let f32_mant = (m & 0x3FF) << 13;
+        (sign << 31) | (f32_exp << 23) | f32_mant
+    } else {
+        let f32_exp = ((exp - 15 + 127) as u32) & 0xFF;
+        let f32_mant = mant << 13;
+        (sign << 31) | (f32_exp << 23) | f32_mant
+    };
+    f32::from_bits(bits)
+}
+
+/// **FLOAT16_2** — two 32-bit floats → two half-floats packed into one
+/// 32-bit word (X in high 16 bits of lane 3, Y in low 16).
+pub fn pack_float16_2(v: Vec128) -> Vec128 {
+    let f = v.as_f32x4();
+    let x = f32_to_f16_bits(f[0]) as u32;
+    let y = f32_to_f16_bits(f[1]) as u32;
+    Vec128::from_u32x4(0, 0, 0, (x << 16) | y)
+}
+
+pub fn unpack_float16_2(v: Vec128) -> Vec128 {
+    let word = v.u32x4(3);
+    let x = f16_bits_to_f32((word >> 16) as u16);
+    let y = f16_bits_to_f32(word as u16);
+    Vec128::from_f32x4(x, y, 0.0, 1.0)
+}
+
+/// **FLOAT16_4** — four 32-bit floats → four half-floats packed across
+/// 64 bits (lanes 2-3).
+pub fn pack_float16_4(v: Vec128) -> Vec128 {
+    let f = v.as_f32x4();
+    let x = f32_to_f16_bits(f[0]) as u32;
+    let y = f32_to_f16_bits(f[1]) as u32;
+    let z = f32_to_f16_bits(f[2]) as u32;
+    let w = f32_to_f16_bits(f[3]) as u32;
+    Vec128::from_u32x4(0, 0, (x << 16) | y, (z << 16) | w)
+}
+
+pub fn unpack_float16_4(v: Vec128) -> Vec128 {
+    let hi = v.u32x4(2);
+    let lo = v.u32x4(3);
+    let x = f16_bits_to_f32((hi >> 16) as u16);
+    let y = f16_bits_to_f32(hi as u16);
+    let z = f16_bits_to_f32((lo >> 16) as u16);
+    let w = f16_bits_to_f32(lo as u16);
+    Vec128::from_f32x4(x, y, z, w)
+}
+
+// ─── CR6 helpers used by integer compares ─────────────────────────────────
+// vcmp*. (record-form) updates CR6 in a compressed form:
+//   CR6 = {all-true, 0, all-false, 0}
+// where each bit reflects the per-lane mask across the whole register.
+
+#[inline] pub fn cr6_flags_from_mask(mask: Vec128) -> (bool, bool) {
+    let b = mask.as_bytes();
+    let mut any_set = false;
+    let mut any_clear = false;
+    for &byte in b.iter() {
+        if byte != 0 { any_set = true; }
+        if byte != 0xFF { any_clear = true; }
+    }
+    let all_true = !any_clear;
+    let all_false = !any_set;
+    (all_true, all_false)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    use std::cell::Cell;
+
+    struct TestMem { data: Box<[Cell<u8>]> }
+    impl TestMem {
+        fn new(size: usize) -> Self {
+            Self { data: (0..size).map(|_| Cell::new(0)).collect() }
+        }
+    }
+    impl MemoryAccess for TestMem {
+        fn read_u8(&self, a: u32) -> u8 { self.data[a as usize].get() }
+        fn read_u16(&self, a: u32) -> u16 {
+            u16::from_be_bytes([self.data[a as usize].get(), self.data[a as usize + 1].get()])
+        }
+        fn read_u32(&self, a: u32) -> u32 {
+            let a = a as usize;
+            u32::from_be_bytes([
+                self.data[a].get(), self.data[a+1].get(),
+                self.data[a+2].get(), self.data[a+3].get(),
+            ])
+        }
+        fn read_u64(&self, a: u32) -> u64 {
+            let a = a as usize;
+            u64::from_be_bytes([
+                self.data[a].get(), self.data[a+1].get(),
+                self.data[a+2].get(), self.data[a+3].get(),
+                self.data[a+4].get(), self.data[a+5].get(),
+                self.data[a+6].get(), self.data[a+7].get(),
+            ])
+        }
+        fn write_u8(&self, a: u32, v: u8) { self.data[a as usize].set(v); }
+        fn write_u16(&self, a: u32, v: u16) {
+            let b = v.to_be_bytes();
+            self.data[a as usize].set(b[0]);
+            self.data[a as usize + 1].set(b[1]);
+        }
+        fn write_u32(&self, a: u32, v: u32) {
+            let b = v.to_be_bytes(); let a = a as usize;
+            for (i, byte) in b.iter().enumerate() { self.data[a+i].set(*byte); }
+        }
+        fn write_u64(&self, a: u32, v: u64) {
+            let b = v.to_be_bytes(); let a = a as usize;
+            for (i, byte) in b.iter().enumerate() { self.data[a+i].set(*byte); }
+        }
+        fn translate(&self, _a: u32) -> Option<*const u8> { None }
+        fn translate_mut(&self, _a: u32) -> Option<*mut u8> { None }
+    }
+
+    #[test]
+    fn lvlx_lvrx_round_trip() {
+        let m = TestMem::new(0x40);
+        for i in 0..0x30 { m.data[i].set((i as u8).wrapping_add(0x10)); }
+        // Unaligned load from 0x13 should combine lvlx(0x13) | lvrx(0x23).
+        let lo = load_vector_left(&m, 0x13);
+        let hi = load_vector_right(&m, 0x23);
+        let mut combined = [0u8; 16];
+        let lob = lo.as_bytes();
+        let hib = hi.as_bytes();
+        for i in 0..16 { combined[i] = lob[i] | hib[i]; }
+        for i in 0..16 {
+            assert_eq!(combined[i], m.data[0x13 + i].get(), "lane {}", i);
+        }
+    }
+
+    #[test]
+    fn lvlx_aligned_is_full_load() {
+        let m = TestMem::new(0x20);
+        for i in 0..0x20 { m.data[i].set(i as u8); }
+        let v = load_vector_left(&m, 0x10);
+        let b = v.as_bytes();
+        for i in 0..16 { assert_eq!(b[i], 0x10 + i as u8); }
+    }
+
+    #[test]
+    fn lvrx_aligned_is_zero() {
+        let m = TestMem::new(0x20);
+        let v = load_vector_right(&m, 0x10);
+        assert_eq!(v.as_bytes(), [0u8; 16]);
+    }
+
+    #[test]
+    fn sat_add_signed_overflow() {
+        assert_eq!(sat_add_i8(120, 10), (127, true));
+        assert_eq!(sat_add_i8(-120, -10), (-128, true));
+        assert_eq!(sat_add_i8(1, 2), (3, false));
+    }
+
+    #[test]
+    fn sat_sub_unsigned_underflow() {
+        assert_eq!(sat_sub_u8(5, 10), (0, true));
+        assert_eq!(sat_sub_u8(10, 5), (5, false));
+    }
+
+    #[test]
+    fn pack_unpack_pixel_555() {
+        let encoded = pack_pixel_555(0x80_F8_F8_F8);
+        assert_eq!(encoded & 0x8000, 0x8000);
+        let w = unpack_pixel_555(0x8000 | (0x1F << 10) | (0x1F << 5) | 0x1F);
+        assert_eq!(w & 0xFF000000, 0xFF000000);
+    }
+
+    // ─── First-Pixels M3 pack/unpack roundtrip tests ───
+
+    /// Quantization error tolerance for N-bit signed normalized values.
+    /// `1.0 / ((1 << (bits - 1)) - 1)` is the step size.
+    fn tol_normalized(bits: u32) -> f32 {
+        1.0 / ((1u32 << (bits - 1)) - 1) as f32
+    }
+
+    #[test]
+    fn normshort2_roundtrip() {
+        let v = Vec128::from_f32x4(0.5, -0.75, 0.0, 0.0);
+        let packed = pack_normshort2(v);
+        let back = unpack_normshort2(packed).as_f32x4();
+        let tol = tol_normalized(16);
+        assert!((back[0] - 0.5).abs() < tol,  "x got {}", back[0]);
+        assert!((back[1] - -0.75).abs() < tol, "y got {}", back[1]);
+        assert_eq!(back[2], 0.0);
+        assert_eq!(back[3], 1.0);
+    }
+
+    #[test]
+    fn normshort4_roundtrip_extremes() {
+        let v = Vec128::from_f32x4(1.0, -1.0, 0.0, 0.25);
+        let packed = pack_normshort4(v);
+        let back = unpack_normshort4(packed).as_f32x4();
+        let tol = tol_normalized(16);
+        assert!((back[0] - 1.0).abs() < tol);
+        assert!((back[1] - -1.0).abs() < tol);
+        assert!((back[2] - 0.0).abs() < tol);
+        assert!((back[3] - 0.25).abs() < tol);
+    }
+
+    #[test]
+    fn normpacked32_roundtrip() {
+        let v = Vec128::from_f32x4(0.5, -0.5, 0.9, -1.0);
+        let packed = pack_normpacked32(v);
+        let back = unpack_normpacked32(packed).as_f32x4();
+        let tol10 = tol_normalized(10);
+        let tol2 = tol_normalized(2);
+        assert!((back[0] - 0.5).abs() < tol10,  "x got {}", back[0]);
+        assert!((back[1] - -0.5).abs() < tol10, "y got {}", back[1]);
+        assert!((back[2] - 0.9).abs() < tol10,  "z got {}", back[2]);
+        // 2-bit signed quantizes to {-1, -0.5-ish, 0, 0.5-ish}; tolerance
+        // is the full step.
+        assert!((back[3] - -1.0).abs() < 2.0 * tol2, "w got {}", back[3]);
+    }
+
+    #[test]
+    fn normpacked64_roundtrip() {
+        let v = Vec128::from_f32x4(0.5, -0.25, 0.75, 0.5);
+        let packed = pack_normpacked64(v);
+        let back = unpack_normpacked64(packed).as_f32x4();
+        let tol20 = tol_normalized(20);
+        let tol4 = tol_normalized(4);
+        assert!((back[0] - 0.5).abs() < tol20,   "x got {}", back[0]);
+        assert!((back[1] - -0.25).abs() < tol20, "y got {}", back[1]);
+        assert!((back[2] - 0.75).abs() < tol20,  "z got {}", back[2]);
+        assert!((back[3] - 0.5).abs() < tol4,    "w got {}", back[3]);
+    }
+
+    #[test]
+    fn float16_2_roundtrip_normals() {
+        // Half has ~3 decimal digits of precision. Pick values that
+        // survive conversion cleanly: powers of 2 + simple fractions.
+        let v = Vec128::from_f32x4(1.0, -2.5, 0.0, 0.0);
+        let packed = pack_float16_2(v);
+        let back = unpack_float16_2(packed).as_f32x4();
+        assert_eq!(back[0], 1.0);
+        assert_eq!(back[1], -2.5);
+        assert_eq!(back[2], 0.0);
+        assert_eq!(back[3], 1.0);
+    }
+
+    #[test]
+    fn float16_4_roundtrip_normals() {
+        let v = Vec128::from_f32x4(0.5, -3.0, 16.0, -0.125);
+        let packed = pack_float16_4(v);
+        let back = unpack_float16_4(packed).as_f32x4();
+        assert_eq!(back[0], 0.5);
+        assert_eq!(back[1], -3.0);
+        assert_eq!(back[2], 16.0);
+        assert_eq!(back[3], -0.125);
+    }
+
+    #[test]
+    fn float16_handles_zero_and_infinity() {
+        // Zero should survive.
+        assert_eq!(f16_bits_to_f32(f32_to_f16_bits(0.0)), 0.0);
+        assert_eq!(f16_bits_to_f32(f32_to_f16_bits(-0.0)).to_bits(), (-0.0f32).to_bits());
+        // +inf.
+        let inf_back = f16_bits_to_f32(f32_to_f16_bits(f32::INFINITY));
+        assert!(inf_back.is_infinite() && inf_back > 0.0);
+        // Overflow → +inf.
+        let overflow_back = f16_bits_to_f32(f32_to_f16_bits(65536.0));
+        assert!(overflow_back.is_infinite());
+    }
+
+    #[test]
+    fn pack_type_enum_maps_canary_values() {
+        use D3dPackType::*;
+        assert!(matches!(D3dPackType::from_immediate(0), D3dColor));
+        assert!(matches!(D3dPackType::from_immediate(1), NormShort2));
+        assert!(matches!(D3dPackType::from_immediate(2), NormPacked32));
+        assert!(matches!(D3dPackType::from_immediate(3), Float16_2));
+        assert!(matches!(D3dPackType::from_immediate(4), NormShort4));
+        assert!(matches!(D3dPackType::from_immediate(5), Float16_4));
+        assert!(matches!(D3dPackType::from_immediate(6), NormPacked64));
+        assert!(matches!(D3dPackType::from_immediate(7), Other(7)));
+    }
+}
diff --git a/crates/xenia-cpu/tests/disasm_goldens.rs b/crates/xenia-cpu/tests/disasm_goldens.rs
new file mode 100644
index 0000000..6c39d54
--- /dev/null
+++ b/crates/xenia-cpu/tests/disasm_goldens.rs
@@ -0,0 +1,531 @@
+//! Assert-based goldens for the PPC disassembler.
+//!
+//! Each test owns an inline list of `(raw, addr, label)` cases. On a
+//! normal run, the test reads the corresponding fixture JSON and asserts
+//! that `format(decode(raw, addr))` reproduces every field exactly. On
+//! first creation (fixture file missing) or with `REGEN_GOLDENS=1` set,
+//! the test (re)writes the fixture from `format()` output.
+//!
+//! Workflow:
+//! ```sh
+//! cargo test -p xenia-cpu --test disasm_goldens             # assert
+//! REGEN_GOLDENS=1 cargo test -p xenia-cpu --test disasm_goldens   # regen
+//! ```
+//!
+//! The hand-encoded test cases below cover the silent-bug regression
+//! cases that lived in the old println-based `disasm_audit.rs` harness
+//! (now deleted).
+
+use std::path::PathBuf;
+
+use serde::{Deserialize, Serialize};
+
+use xenia_cpu::decoder::decode;
+use xenia_cpu::disasm::format;
+
+#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize)]
+struct GoldenRow {
+    label: String,
+    raw: String,
+    addr: String,
+    mnemonic: String,
+    operands: String,
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    ext_mnemonic: Option<String>,
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    ext_operands: Option<String>,
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    branch_target: Option<String>,
+}
+
+#[derive(Debug, Deserialize, Serialize)]
+struct GoldenFile {
+    rows: Vec<GoldenRow>,
+}
+
+fn fixture_path(name: &str) -> PathBuf {
+    PathBuf::from(env!("CARGO_MANIFEST_DIR"))
+        .join("tests")
+        .join("golden")
+        .join(name)
+}
+
+fn build_rows(cases: &[(u32, u32, &str)]) -> Vec<GoldenRow> {
+    cases
+        .iter()
+        .map(|&(raw, addr, label)| {
+            let d = decode(raw, addr);
+            let t = format(&d);
+            GoldenRow {
+                label: label.to_string(),
+                raw: format!("0x{raw:08X}"),
+                addr: format!("0x{addr:08X}"),
+                mnemonic: t.mnemonic,
+                operands: t.operands,
+                ext_mnemonic: t.ext_mnemonic,
+                ext_operands: t.ext_operands,
+                branch_target: t.branch_target.map(|t| format!("0x{t:08X}")),
+            }
+        })
+        .collect()
+}
+
+/// Compare what `format()` produces against the committed JSON snapshot.
+/// Set `REGEN_GOLDENS=1` to overwrite the snapshot from current output.
+/// Missing snapshot is treated as "first creation": writes and panics so
+/// CI can't accidentally accept blank goldens.
+fn assert_or_regen(fixture_name: &str, cases: &[(u32, u32, &str)]) {
+    let rows = build_rows(cases);
+    let path = fixture_path(fixture_name);
+    let regen = std::env::var("REGEN_GOLDENS").is_ok();
+
+    if regen || !path.exists() {
+        if let Some(parent) = path.parent() {
+            std::fs::create_dir_all(parent).unwrap();
+        }
+        let serialized = serde_json::to_string_pretty(&GoldenFile { rows }).unwrap();
+        std::fs::write(&path, serialized + "\n").unwrap();
+        if !regen {
+            panic!(
+                "Generated fixture {} (was missing). Inspect, commit, then re-run.",
+                path.display()
+            );
+        }
+        return;
+    }
+
+    let src = std::fs::read_to_string(&path).unwrap();
+    let golden: GoldenFile = serde_json::from_str(&src).unwrap();
+    assert_eq!(
+        rows.len(),
+        golden.rows.len(),
+        "row count differs from {} (live={}, fixture={}). Run with REGEN_GOLDENS=1 if the test cases changed intentionally.",
+        path.display(),
+        rows.len(),
+        golden.rows.len()
+    );
+    for (i, (got, expected)) in rows.iter().zip(golden.rows.iter()).enumerate() {
+        assert_eq!(
+            got, expected,
+            "row {} ({}) differs in {}\n  live:    {got:#?}\n  fixture: {expected:#?}",
+            i,
+            expected.label,
+            path.display()
+        );
+    }
+}
+
+// ── Encoding helpers ────────────────────────────────────────────────────────
+// PPC bit numbering: bit 0 is MSB, bit 31 is LSB. Most helpers below emit
+// instructions in canonical hand-readable form: opcode << 26 | <fields>.
+
+#[allow(clippy::too_many_arguments)]
+fn xform_xo3(rd: u32, ra: u32, rb: u32, oe: u32, xo: u32, rc: u32) -> u32 {
+    (31 << 26) | (rd << 21) | (ra << 16) | (rb << 11) | (oe << 10) | (xo << 1) | rc
+}
+
+fn xform_logic(rs: u32, ra: u32, rb: u32, xo: u32, rc: u32) -> u32 {
+    (31 << 26) | (rs << 21) | (ra << 16) | (rb << 11) | (xo << 1) | rc
+}
+
+fn dform(op: u32, rt: u32, ra: u32, imm: i16) -> u32 {
+    (op << 26) | (rt << 21) | (ra << 16) | ((imm as u16) as u32)
+}
+
+fn iform_b(target_disp: i32, aa: u32, lk: u32) -> u32 {
+    // I-form: opcode 18 | LI<<2 | AA<<1 | LK
+    let li = (target_disp as u32) & 0x03FF_FFFC;
+    (18 << 26) | li | (aa << 1) | lk
+}
+
+fn bform_bc(bo: u32, bi: u32, target_disp: i32, aa: u32, lk: u32) -> u32 {
+    // B-form: opcode 16 | BO<<21 | BI<<16 | BD<<2 | AA<<1 | LK
+    let bd = (target_disp as u32) & 0x0000_FFFC;
+    (16 << 26) | (bo << 21) | (bi << 16) | bd | (aa << 1) | lk
+}
+
+fn xlform_bclr(bo: u32, bi: u32, lk: u32) -> u32 {
+    // XL-form: opcode 19 | BO<<21 | BI<<16 | XO=16<<1 | LK
+    (19 << 26) | (bo << 21) | (bi << 16) | (16 << 1) | lk
+}
+
+fn xlform_bcctr(bo: u32, bi: u32, lk: u32) -> u32 {
+    (19 << 26) | (bo << 21) | (bi << 16) | (528 << 1) | lk
+}
+
+fn rlwinm(rs: u32, ra: u32, sh: u32, mb: u32, me: u32, rc: u32) -> u32 {
+    (21 << 26) | (rs << 21) | (ra << 16) | (sh << 11) | (mb << 6) | (me << 1) | rc
+}
+
+fn rldicl(rs: u32, ra: u32, sh: u32, mb: u32, rc: u32) -> u32 {
+    // MD-form, op30 xo=0. sh split: bits 16-20 (high 5) + bit 30 (low bit).
+    // mb split: bits 21-25 (low 5) + bit 26 (high bit).
+    let sh_hi = (sh >> 1) & 0x1F;
+    let sh_lo = sh & 1;
+    let mb_lo = mb & 0x1F;
+    let mb_hi = (mb >> 5) & 1;
+    (30 << 26)
+        | (rs << 21)
+        | (ra << 16)
+        | (sh_hi << 11)
+        | (mb_lo << 6)
+        | (mb_hi << 5)
+        | (0 << 2)
+        | (sh_lo << 1)
+        | rc
+}
+
+fn mfspr(rd: u32, spr: u32) -> u32 {
+    let spr_swapped = ((spr & 0x1F) << 5) | ((spr >> 5) & 0x1F);
+    (31 << 26) | (rd << 21) | (spr_swapped << 11) | (339 << 1)
+}
+
+fn mtspr(rs: u32, spr: u32) -> u32 {
+    let spr_swapped = ((spr & 0x1F) << 5) | ((spr >> 5) & 0x1F);
+    (31 << 26) | (rs << 21) | (spr_swapped << 11) | (467 << 1)
+}
+
+// ── Tests ───────────────────────────────────────────────────────────────────
+
+#[test]
+fn base_mnemonics() {
+    let cases: &[(u32, u32, &str)] = &[
+        // X-form ALU (Rc and OE bits)
+        (xform_xo3(3, 4, 5, 0, 266, 0), 0x82000000, "add r3,r4,r5"),
+        (xform_xo3(3, 4, 5, 0, 266, 1), 0x82000000, "add. r3,r4,r5"),
+        (xform_xo3(3, 4, 5, 1, 266, 0), 0x82000000, "addo r3,r4,r5"),
+        (xform_xo3(3, 4, 5, 1, 266, 1), 0x82000000, "addo. r3,r4,r5"),
+        (xform_xo3(3, 4, 0, 0, 104, 0), 0x82000000, "neg r3,r4"),
+        (xform_xo3(3, 4, 5, 0, 235, 0), 0x82000000, "mullw r3,r4,r5"),
+        (xform_xo3(3, 4, 5, 0, 491, 0), 0x82000000, "divw r3,r4,r5"),
+        (xform_xo3(3, 4, 5, 0, 75, 1), 0x82000000, "mulhw. r3,r4,r5"),
+        (xform_xo3(3, 4, 5, 0, 11, 1), 0x82000000, "mulhwu. r3,r4,r5"),
+        (xform_xo3(3, 4, 5, 0, 233, 0), 0x82000000, "mulld r3,r4,r5"),
+        // X-form logical
+        (xform_logic(4, 3, 5, 28, 0), 0x82000000, "and r3,r4,r5"),
+        (xform_logic(4, 3, 5, 444, 0), 0x82000000, "or r3,r4,r5 (non-mr: rs!=rb)"),
+        (xform_logic(4, 3, 5, 316, 0), 0x82000000, "xor r3,r4,r5"),
+        (xform_logic(4, 3, 5, 124, 0), 0x82000000, "nor r3,r4,r5"),
+        (xform_logic(4, 3, 5, 476, 0), 0x82000000, "nand r3,r4,r5"),
+        (xform_logic(4, 3, 5, 284, 0), 0x82000000, "eqv r3,r4,r5"),
+        (xform_logic(4, 3, 5, 60, 0), 0x82000000, "andc r3,r4,r5"),
+        (xform_logic(4, 3, 5, 412, 0), 0x82000000, "orc r3,r4,r5"),
+        // X-form shift
+        (xform_logic(4, 3, 5, 24, 0), 0x82000000, "slw r3,r4,r5"),
+        (xform_logic(4, 3, 5, 536, 0), 0x82000000, "srw r3,r4,r5"),
+        (xform_logic(4, 3, 5, 792, 0), 0x82000000, "sraw r3,r4,r5"),
+        (xform_logic(4, 3, 5, 27, 0), 0x82000000, "sld r3,r4,r5"),
+        (xform_logic(4, 3, 5, 539, 0), 0x82000000, "srd r3,r4,r5"),
+        // srawi / sradi (immediate shifts)
+        ((31 << 26) | (4 << 21) | (3 << 16) | (16 << 11) | (824 << 1), 0x82000000, "srawi r3,r4,16"),
+        // Atomics
+        ((31 << 26) | (3 << 21) | (4 << 16) | (5 << 11) | (150 << 1) | 1, 0x82000000, "stwcx. r3,r4,r5"),
+        ((31 << 26) | (3 << 21) | (4 << 16) | (5 << 11) | (214 << 1) | 1, 0x82000000, "stdcx. r3,r4,r5"),
+        ((31 << 26) | (3 << 21) | (4 << 16) | (5 << 11) | (20 << 1), 0x82000000, "lwarx r3,r4,r5"),
+        ((31 << 26) | (3 << 21) | (4 << 16) | (5 << 11) | (84 << 1), 0x82000000, "ldarx r3,r4,r5"),
+        // Compares
+        (dform(11, 0, 3, 16), 0x82000000, "cmpwi cr0, r3, 16"),
+        (dform(11, 2 << 2, 3, 16), 0x82000000, "cmpwi cr2, r3, 16"),
+        (dform(10, 0, 3, 16), 0x82000000, "cmplwi cr0, r3, 16"),
+        ((31 << 26) | (3 << 16) | (4 << 11), 0x82000000, "cmpw r3,r4 in cr0"),
+        ((31 << 26) | (1 << 21) | (3 << 16) | (4 << 11), 0x82000000, "cmpd r3,r4"),
+        ((31 << 26) | (3 << 16) | (4 << 11) | (32 << 1), 0x82000000, "cmplw r3,r4"),
+        // D-form ALU/load/store
+        (dform(14, 3, 1, 16), 0x82000000, "addi r3, r1, 16"),
+        (dform(15, 3, 1, 0x100), 0x82000000, "addis r3, r1, 0x100 (ra!=0)"),
+        (dform(7, 3, 4, 5), 0x82000000, "mulli r3, r4, 5"),
+        (dform(8, 3, 4, 5), 0x82000000, "subfic r3, r4, 5"),
+        (dform(12, 3, 4, 16), 0x82000000, "addic r3, r4, 16"),
+        (dform(13, 3, 4, 16), 0x82000000, "addic. r3, r4, 16"),
+        (dform(24, 3, 4, 0x10), 0x82000000, "ori r4, r3, 0x10 (non-nop)"),
+        (dform(25, 3, 4, 0x10), 0x82000000, "oris r4, r3, 0x10"),
+        (dform(26, 3, 4, 0x10), 0x82000000, "xori r4, r3, 0x10"),
+        (dform(28, 3, 4, 0x10), 0x82000000, "andi. r4, r3, 0x10"),
+        // Loads/stores D-form
+        (dform(32, 5, 1, 0x20), 0x82000000, "lwz r5, 0x20(r1)"),
+        (dform(36, 5, 1, 0x20), 0x82000000, "stw r5, 0x20(r1)"),
+        (dform(34, 5, 1, 0x20), 0x82000000, "lbz r5, 0x20(r1)"),
+        (dform(40, 5, 1, 0x20), 0x82000000, "lhz r5, 0x20(r1)"),
+        (dform(48, 5, 1, 0x20), 0x82000000, "lfs f5, 0x20(r1)"),
+        (dform(50, 5, 1, 0x20), 0x82000000, "lfd f5, 0x20(r1)"),
+        (dform(54, 5, 1, 0x20), 0x82000000, "stfd f5, 0x20(r1)"),
+        // DS-form 64-bit loads
+        ((58u32 << 26) | (5 << 21) | (1 << 16) | 0x20, 0x82000000, "ld r5, 0x20(r1)"),
+        ((62u32 << 26) | (5 << 21) | (1 << 16) | 0x20, 0x82000000, "std r5, 0x20(r1)"),
+        // Sync / barrier (parameterless)
+        ((31 << 26) | (598 << 1), 0x82000000, "sync 0 (extends to sync)"),
+        ((19 << 26) | (150 << 1), 0x82000000, "isync"),
+        ((31 << 26) | (854 << 1), 0x82000000, "eieio"),
+        // Cache hints
+        ((31 << 26) | (1 << 16) | (2 << 11) | (54 << 1), 0x82000000, "dcbst r1, r2"),
+        ((31 << 26) | (1 << 16) | (2 << 11) | (86 << 1), 0x82000000, "dcbf r1, r2"),
+        ((31 << 26) | (1 << 16) | (2 << 11) | (278 << 1), 0x82000000, "dcbt r1, r2"),
+        ((31 << 26) | (1 << 16) | (2 << 11) | (1014 << 1), 0x82000000, "dcbz r1, r2"),
+        ((31 << 26) | (1 << 21) | (1 << 16) | (2 << 11) | (1014 << 1), 0x82000000, "dcbz128 r1, r2"),
+        // CR logical (without simplification triggers)
+        ((19 << 26) | (4 << 21) | (5 << 16) | (6 << 11) | (33 << 1), 0x82000000, "crnor 4,5,6 (no simplify)"),
+        ((19 << 26) | (4 << 21) | (5 << 16) | (6 << 11) | (257 << 1), 0x82000000, "crand 4,5,6"),
+        ((19 << 26) | (4 << 21) | (5 << 16) | (6 << 11) | (449 << 1), 0x82000000, "cror 4,5,6 (no simplify)"),
+        // Trap (no simplification: TO=11 doesn't match the table)
+        ((31 << 26) | (11 << 21) | (3 << 16) | (4 << 11) | (4 << 1), 0x82000000, "tw 11, r3, r4 (uncommon TO)"),
+        ((2u32 << 26) | (11 << 21) | (3 << 16) | (123u32 & 0xFFFF), 0x82000000, "tdi 11, r3, 123"),
+        // mtcr (extended): mtcrf 0xFF, r5
+        ((31 << 26) | (5 << 21) | (0xFF << 12) | (144 << 1), 0x82000000, "mtcrf 0xFF, r5 → mtcr"),
+        // mfcr / mfmsr / mtmsr / mtmsrd
+        ((31 << 26) | (5 << 21) | (19 << 1), 0x82000000, "mfcr r5"),
+        ((31 << 26) | (5 << 21) | (83 << 1), 0x82000000, "mfmsr r5"),
+        ((31 << 26) | (5 << 21) | (146 << 1), 0x82000000, "mtmsr r5"),
+        ((31 << 26) | (5 << 21) | (178 << 1), 0x82000000, "mtmsrd r5"),
+        // FPU base
+        ((63u32 << 26) | (3 << 21) | (4 << 16) | (5 << 11) | (21 << 1), 0x82000000, "fadd f3, f4, f5"),
+        ((63u32 << 26) | (3 << 21) | (4 << 16) | (5 << 11) | (20 << 1), 0x82000000, "fsub f3, f4, f5"),
+        ((63u32 << 26) | (3 << 21) | (4 << 16) | (5 << 11) | (18 << 1), 0x82000000, "fdiv f3, f4, f5"),
+        ((63u32 << 26) | (3 << 21) | (5 << 21) | (5 << 11) | (25 << 1), 0x82000000, "fmul f3, f0, f5 (encoded)"),
+        ((63u32 << 26) | (3 << 21) | (4 << 16) | (40 << 1), 0x82000000, "fneg f3, f4"),
+        ((63u32 << 26) | (3 << 21) | (4 << 16) | (72 << 1), 0x82000000, "fmr f3, f4"),
+        // mtfsf — XFL form (Fix 1). FM at LSB bits 17-24 (PPC bits 7-14).
+        // Encoding: opcode 63 | FM<<17 | frB<<11 | XO=711<<1 | Rc.
+        ((63u32 << 26) | (0xFF << 17) | (5 << 11) | (711 << 1), 0x82000000, "mtfsf 0xFF, f5 (Rc=0)"),
+        ((63u32 << 26) | (0xFF << 17) | (5 << 11) | (711 << 1) | 1, 0x82000000, "mtfsf. 0xFF, f5 (Rc=1)"),
+    ];
+    assert_or_regen("base_mnemonics.json", cases);
+}
+
+#[test]
+fn extended_mnemonics() {
+    let cases: &[(u32, u32, &str)] = &[
+        // ori r0, r0, 0 → nop
+        (dform(24, 0, 0, 0), 0x82000000, "nop"),
+        // addi r3, r0, imm → li
+        (dform(14, 3, 0, 16), 0x82000000, "li r3, 16"),
+        (dform(14, 3, 0, -1), 0x82000000, "li r3, -1"),
+        // addi r3, r4, neg → subi
+        (dform(14, 3, 4, -16), 0x82000000, "subi r3, r4, 16"),
+        // addis r3, r0, imm → lis
+        (dform(15, 3, 0, 0x1234), 0x82000000, "lis r3, 0x1234"),
+        // addis r3, r4, neg → subis
+        (dform(15, 3, 4, -1), 0x82000000, "subis r3, r4, 0xFFFF"),
+        // or rA, rS, rS → mr
+        (xform_logic(4, 3, 4, 444, 0), 0x82000000, "mr r3, r4"),
+        (xform_logic(4, 3, 4, 444, 1), 0x82000000, "mr. r3, r4"),
+        // and rA, rS, rS → mr (also)
+        (xform_logic(4, 3, 4, 28, 0), 0x82000000, "mr (via and)"),
+        // nor rA, rS, rS → not
+        (xform_logic(4, 3, 4, 124, 0), 0x82000000, "not r3, r4"),
+        // subf → sub (operand swap)
+        (xform_xo3(3, 4, 5, 0, 40, 0), 0x82000000, "subf → sub r3, r5, r4"),
+        // rlwinm simplifications
+        (rlwinm(4, 3, 4, 0, 31 - 4, 0), 0x82000000, "slwi r3, r4, 4"),
+        (rlwinm(4, 3, 32 - 4, 4, 31, 0), 0x82000000, "srwi r3, r4, 4"),
+        (rlwinm(4, 3, 8, 0, 31, 0), 0x82000000, "rotlwi r3, r4, 8"),
+        (rlwinm(4, 3, 0, 4, 31, 0), 0x82000000, "clrlwi r3, r4, 4"),
+        (rlwinm(4, 3, 0, 0, 27, 0), 0x82000000, "clrrwi r3, r4, 4"),
+        (rlwinm(4, 3, 8, 0, 7, 0), 0x82000000, "extlwi r3, r4, 8, 8"),
+        // rlwinm with Rc
+        (rlwinm(4, 3, 4, 0, 31 - 4, 1), 0x82000000, "slwi. r3, r4, 4"),
+        // rlwinm Sylpheed regression
+        (rlwinm(11, 11, 0, 31, 31, 1), 0x82000000, "rlwinm. r11,r11,0,31,31 (no simplify)"),
+        // rldicl simplifications
+        (rldicl(4, 3, 0, 32, 0), 0x82000000, "clrldi r3, r4, 32"),
+        (rldicl(4, 3, 64u32 - 8, 8, 0), 0x82000000, "srdi r3, r4, 8"),
+        (rldicl(4, 3, 8, 0, 0), 0x82000000, "rotldi r3, r4, 8"),
+        // cmpi / cmpli → cmpwi/cmpdi/cmplwi/cmpldi
+        (dform(11, 0, 3, 16), 0x82000000, "cmpwi cr0, r3, 16"),
+        (dform(11, (1 << 21) | (2 << 23), 3, 16) | (1 << 21), 0x82000000, "cmpdi (L=1) variant"),
+        // bclr 20, 0 → blr
+        (xlform_bclr(20, 0, 0), 0x82000000, "blr"),
+        (xlform_bclr(20, 0, 1), 0x82000000, "blrl"),
+        // bcctr 20, 0 → bctr
+        (xlform_bcctr(20, 0, 0), 0x82000000, "bctr"),
+        (xlform_bcctr(20, 0, 1), 0x82000000, "bctrl"),
+        // bclr conditional
+        (xlform_bclr(12, 2, 0), 0x82000000, "beqlr (BO=12, BI=2 → cr0.eq true)"),
+        (xlform_bclr(4, 2, 0), 0x82000000, "bnelr"),
+        // bc with full BO/BI: branch always (BO=20)
+        (bform_bc(20, 0, 0x40, 0, 0), 0x82000000, "bc → b 0x82000040"),
+        (bform_bc(20, 0, 0x40, 0, 1), 0x82000000, "bc l → bl 0x82000040"),
+        // Conditional bc → beq/bne/etc
+        (bform_bc(12, 2, 0x40, 0, 0), 0x82000000, "bc 12,cr0.eq → beq 0x82000040"),
+        (bform_bc(4, 2, 0x40, 0, 0), 0x82000000, "bc 4,cr0.eq → bne 0x82000040"),
+        (bform_bc(12, 0, 0x40, 0, 0), 0x82000000, "bc 12,cr0.lt → blt 0x82000040"),
+        (bform_bc(4, 0, 0x40, 0, 0), 0x82000000, "bc 4,cr0.lt → bge 0x82000040"),
+        (bform_bc(12, 1, 0x40, 0, 0), 0x82000000, "bc 12,cr0.gt → bgt 0x82000040"),
+        (bform_bc(4, 1, 0x40, 0, 0), 0x82000000, "bc 4,cr0.gt → ble 0x82000040"),
+        // Conditional with non-zero CR field
+        (bform_bc(12, 2 + 8, 0x40, 0, 0), 0x82000000, "bc 12, cr2.eq → beq cr2, 0x...040"),
+        // bdnz / bdz (decrement-CTR branches)
+        (bform_bc(16, 0, 0x40, 0, 0), 0x82000000, "bdnz 0x82000040"),
+        (bform_bc(18, 0, 0x40, 0, 0), 0x82000000, "bdz 0x82000040"),
+        // I-form branches
+        (iform_b(0x40, 0, 0), 0x82000000, "b +0x40 → 0x82000040"),
+        (iform_b(0x40, 0, 1), 0x82000000, "bl +0x40 → 0x82000040"),
+        (iform_b(0x40, 1, 0), 0x82000000, "ba 0x40 absolute"),
+        (iform_b(0x40, 1, 1), 0x82000000, "bla 0x40 absolute"),
+        // Trap immediate simplifications
+        ((2u32 << 26) | (4 << 21) | (3 << 16) | (123u32 & 0xFFFF), 0x82000000, "tdeqi r3, 123"),
+        ((3u32 << 26) | (16 << 21) | (3 << 16) | (123u32 & 0xFFFF), 0x82000000, "twlti r3, 123"),
+        // mfspr → mflr / mfctr / mfxer
+        (mfspr(3, 8), 0x82000000, "mflr r3"),
+        (mfspr(3, 9), 0x82000000, "mfctr r3"),
+        (mfspr(3, 1), 0x82000000, "mfxer r3"),
+        // mtspr → mtlr / mtctr / mtxer
+        (mtspr(3, 8), 0x82000000, "mtlr r3"),
+        (mtspr(3, 9), 0x82000000, "mtctr r3"),
+        (mtspr(3, 1), 0x82000000, "mtxer r3"),
+        // crnor with same source bits → crnot
+        ((19 << 26) | (4 << 21) | (5 << 16) | (5 << 11) | (33 << 1), 0x82000000, "crnot 4, 5"),
+        // crxor with all same → crclr
+        ((19 << 26) | (4 << 21) | (4 << 16) | (4 << 11) | (193 << 1), 0x82000000, "crclr 4"),
+        // creqv with all same → crset
+        ((19 << 26) | (4 << 21) | (4 << 16) | (4 << 11) | (289 << 1), 0x82000000, "crset 4"),
+        // cror with same source bits → crmove
+        ((19 << 26) | (4 << 21) | (5 << 16) | (5 << 11) | (449 << 1), 0x82000000, "crmove 4, 5"),
+        // sync L=1 → lwsync
+        ((31 << 26) | (1 << 21) | (598 << 1), 0x82000000, "lwsync"),
+        // tw 31, 0, 0 → trap
+        ((31 << 26) | (31 << 21) | (4 << 1), 0x82000000, "trap"),
+        // Fix 2: bclr/bcctr with BO=20 and BI≠0 still emits blr/bctr ext.
+        // BO=20 ignores both CTR test and CR test, so BI is don't-care.
+        (xlform_bclr(20, 4, 0), 0x82000000, "blr (BO=20, BI=4 — BI is don't-care)"),
+        (xlform_bclr(20, 7, 1), 0x82000000, "blrl (BO=20, BI=7)"),
+        (xlform_bcctr(20, 4, 0), 0x82000000, "bctr (BO=20, BI=4)"),
+        // Fix 3: trap unsigned simplified mnemonics (TO=1, 2, 5, 6 — logical
+        // compare conditions). Register form (tw/td) and immediate (twi/tdi).
+        ((31u32 << 26) | (2 << 21) | (3 << 16) | (4 << 11) | (4 << 1), 0x82000000, "twllt r3, r4 (TO=2)"),
+        ((31u32 << 26) | (1 << 21) | (3 << 16) | (4 << 11) | (4 << 1), 0x82000000, "twlgt r3, r4 (TO=1)"),
+        ((31u32 << 26) | (5 << 21) | (3 << 16) | (4 << 11) | (68 << 1), 0x82000000, "tdlge r3, r4 (TO=5)"),
+        ((31u32 << 26) | (6 << 21) | (3 << 16) | (4 << 11) | (4 << 1), 0x82000000, "twlle r3, r4 (TO=6)"),
+        ((3u32 << 26) | (2 << 21) | (3 << 16) | (16u32 & 0xFFFF), 0x82000000, "twllti r3, 16"),
+        ((2u32 << 26) | (5 << 21) | (3 << 16) | (16u32 & 0xFFFF), 0x82000000, "tdlgei r3, 16"),
+    ];
+    assert_or_regen("extended_mnemonics.json", cases);
+}
+
+#[test]
+fn vmx128_registers() {
+    // Standard VMX (op=4) — 5-bit registers v0..v31. Verifies that the
+    // low-register path renders correctly through the new formatter.
+    let std_vmx = [
+        // vaddubm v3, v4, v5  : op=4, 3-op key=0
+        ((4u32 << 26) | (3 << 21) | (4 << 16) | (5 << 11) | 0, 0x82000000, "vaddubm v3, v4, v5"),
+        // vaddfp v3, v4, v5   : op=4, vx=10
+        ((4u32 << 26) | (3 << 21) | (4 << 16) | (5 << 11) | 10, 0x82000000, "vaddfp v3, v4, v5"),
+        // vand v3, v4, v5     : vx=1028
+        ((4u32 << 26) | (3 << 21) | (4 << 16) | (5 << 11) | 1028, 0x82000000, "vand v3, v4, v5"),
+        // vor v3, v4, v5      : vx=1156
+        ((4u32 << 26) | (3 << 21) | (4 << 16) | (5 << 11) | 1156, 0x82000000, "vor v3, v4, v5"),
+        // vxor v3, v4, v5     : vx=1220
+        ((4u32 << 26) | (3 << 21) | (4 << 16) | (5 << 11) | 1220, 0x82000000, "vxor v3, v4, v5"),
+        // vsel v3, v4, v5, v6 : op=4, va_key=42 (4-op)
+        ((4u32 << 26) | (3 << 21) | (4 << 16) | (5 << 11) | (6 << 6) | 42, 0x82000000, "vsel v3,v4,v5,v6"),
+        // vperm v3, v4, v5, v6 : va_key=43
+        ((4u32 << 26) | (3 << 21) | (4 << 16) | (5 << 11) | (6 << 6) | 43, 0x82000000, "vperm v3,v4,v5,v6"),
+        // vmaddfp v3, v4, v5, v6 : va_key=46 (operand swap: vd, va, vc, vb)
+        ((4u32 << 26) | (3 << 21) | (4 << 16) | (5 << 11) | (6 << 6) | 46, 0x82000000, "vmaddfp v3, v4, v6, v5 (swap)"),
+        // mfvscr v3            : vx=1540
+        ((4u32 << 26) | (3 << 21) | 1540, 0x82000000, "mfvscr v3"),
+        // mtvscr v5            : vx=1604, vb=v5
+        ((4u32 << 26) | (5 << 11) | 1604, 0x82000000, "mtvscr v5"),
+    ];
+
+    // VMX128 op=5 — uses vd128/va128/vb128 (7-bit registers, high bits at
+    // 21+22). These are the silent-bug-area encodings; we exercise low
+    // register indices here because the secondary-opcode key for op=5
+    // includes bits 21-22, constraining vd128 high bits to 0 in this form.
+    // High-index examples for vd128 live in the op=6 series below.
+    let vmx128_op5 = [
+        // vaddfp128 v3, v4, v5   : op=5, key2=0b000001
+        ((5u32 << 26) | (3 << 21) | (4 << 16) | (5 << 11) | (0 << 6) | (1 << 0), 0x82000000, "vaddfp128 (encoded sloppily)"),
+    ];
+
+    // VMX128 op=6 — vrlimi128 has secondary key in bits 23-25 + 26-27, so
+    // bits 21-22 ARE the high bits of vd128 (canonical silent-bug-area).
+    // These instructions exercise vd128 = 32, 64, 96 — covering the bit-21
+    // and bit-22 split that ppc.rs's old extractor (now deleted) miscoded.
+    let vrlimi128 = |vd: u32, vb: u32, imm: u32, z: u32| -> u32 {
+        // op=6, vd128 = bits 6-10 + bit 21 + bit 22, vb128 = bits 16-20 + bits 30+31,
+        //  IMM = bits 11-15, Z = bits 24-25, key2 = (bits 23-25 << 4) | bits 26-27 = 0b1110001
+        let vd_lo = vd & 0x1F;
+        let vd_b21 = (vd >> 5) & 1;
+        let vd_b22 = (vd >> 6) & 1;
+        let vb_lo = vb & 0x1F;
+        let vb_b30 = (vb >> 5) & 1;
+        let vb_b31 = (vb >> 6) & 1;
+        // bits 23-25 = 111, bits 26-27 = 00, bit 27 = 1 → key2 lower 4 bits = 0001
+        // Encoded: bits 23-25 = 111, bits 26-27 = 00 are actually overlapping with z field (bits 24-25)
+        // The plan view: (bits 23 << 6) | (bits 24-25 << 4) | (bits 26-27 << 2) but the table uses different.
+        // Easiest: hand-encode known bit pattern matching decoder.rs's match:
+        //   key2 = (extract_bits(code, 23, 25) << 4) | extract_bits(code, 26, 27) = 0b1110001
+        //   bits 23-25 = 111, bits 26-27 = 01
+        // Bit positions 23-27 = 11101 (5 bits, MSB at 23).
+        //   PPC bit 23 (LSB index 8): set
+        //   PPC bit 24 (LSB index 7): set  -- this is z bit 0
+        //   PPC bit 25 (LSB index 6): set  -- this is z bit 1
+        //   PPC bit 26 (LSB index 5): unset
+        //   PPC bit 27 (LSB index 4): set
+        // We let z = bits 24-25 stored with vd128 bits at 21-22.
+        // To preserve key2 = 0b1110001, we need bits 24-25 = 11, bit 26 = 0, bit 27 = 1.
+        // BUT bits 24-25 ARE the z field; if we set them = 11 the z value is 3.
+        // So Z is constrained for vrlimi128. Choose Z = 3 (matches Sylpheed examples).
+        let z3 = z & 0x3;
+        (6u32 << 26)
+            | (vd_lo << 21)
+            | (imm << 16)
+            | (vb_lo << 11)
+            | (vd_b21 << 10)  // bit 21 (LSB pos 10)
+            | (vd_b22 << 9)   // bit 22 (LSB pos 9)
+            | (1 << 8)        // bit 23
+            | (z3 << 6)       // bits 24-25
+            | (0 << 5)        // bit 26
+            | (1 << 4)        // bit 27
+            | (vb_b30 << 1)   // bit 30
+            | vb_b31          // bit 31
+    };
+    // Note: VMX128 op6 secondary keys constrain bits 21-23. For
+    // vrlimi128 (key2 = 0b1110001 over bits 21-23 + 26-27) the only
+    // valid vd128 range is 96..=127 — lower values change the secondary
+    // key into some other instruction. The cases below record what the
+    // disassembler emits for the borderline encodings, so a regression
+    // in either the lookup table or the formatter would surface here.
+    let vmx128_high = [
+        // bits 21-22 = 00 → key2 ≠ vrlimi128 → decodes to vsrw128 (key5
+        // branch). Locks current behavior; shows the silent-bug-area
+        // encoding constraint.
+        (vrlimi128(0, 12, 4, 3), 0x82000000, "encoding vd_hi=00: actually vsrw128"),
+        // bits 21-22 = 10 → still not vrlimi128.
+        (vrlimi128(32, 12, 4, 3), 0x82000000, "encoding vd_hi=10: actually vsrw128 v32"),
+        // bits 21-22 = 01 → key1 matches vpermwi128.
+        (vrlimi128(64, 12, 4, 3), 0x82000000, "encoding vd_hi=01: actually vpermwi128"),
+        // bits 21-22 = 11 → key2 matches vrlimi128 with vd128=96.
+        (vrlimi128(96, 12, 4, 3), 0x82000000, "vrlimi128 v96, v12, 4, 3 (real)"),
+        (vrlimi128(127, 127, 4, 3), 0x82000000, "vrlimi128 v127, v127, 4, 3 (real)"),
+    ];
+
+    // Fix 4: VMX128 multiply-add 4-operand layouts. Per canary, the addend
+    // is the VD register re-used; operand order differs between the three
+    // mnemonics. Encodings hand-built to satisfy decode_op5's key2 secondary
+    // opcode (vmaddfp128=0b001101, vmaddcfp128=0b010001, vnmsubfp128=0b010101)
+    // with bit 22=0 (forced by key2's high nibble) so vd128 high bit 1 = 0.
+    // vd128 low = 3 (bits 6-10); va128 = 3 | (bit29<<5) = 35; vb128 = 5.
+    // Distinct VD vs VA verifies the layout isn't trivially aliasing VD.
+    //
+    //   layout (canary):
+    //     vmaddfp128   VD, VA, VB, VD  → "v3, v35, v5, v3"
+    //     vmaddcfp128  VD, VA, VD, VB  → "v3, v35, v3, v5"
+    //     vnmsubfp128  VD, VA, VD, VB  → "v3, v35, v3, v5"
+    let vmx128_4op = [
+        // vmaddfp128: bits 24=1, 25=1, 27=1, bit 29=1 (VA high), VB=5
+        (0x146028D4u32, 0x82000000, "vmaddfp128 v3, v35, v5, v3"),
+        // vmaddcfp128: bits 23=1, 27=1, bit 29=1, VB=5
+        (0x14602914u32, 0x82000000, "vmaddcfp128 v3, v35, v3, v5"),
+        // vnmsubfp128: bits 23=1, 25=1, 27=1, bit 29=1, VB=5
+        (0x14602954u32, 0x82000000, "vnmsubfp128 v3, v35, v3, v5"),
+    ];
+
+    let mut all = Vec::new();
+    all.extend_from_slice(&std_vmx);
+    all.extend_from_slice(&vmx128_op5);
+    all.extend_from_slice(&vmx128_high);
+    all.extend_from_slice(&vmx128_4op);
+    assert_or_regen("vmx128_registers.json", &all);
+}
diff --git a/crates/xenia-cpu/tests/golden/base_mnemonics.json b/crates/xenia-cpu/tests/golden/base_mnemonics.json
new file mode 100644
index 0000000..b0de088
--- /dev/null
+++ b/crates/xenia-cpu/tests/golden/base_mnemonics.json
@@ -0,0 +1,571 @@
+{
+  "rows": [
+    {
+      "label": "add r3,r4,r5",
+      "raw": "0x7C642A14",
+      "addr": "0x82000000",
+      "mnemonic": "add",
+      "operands": "r3, r4, r5"
+    },
+    {
+      "label": "add. r3,r4,r5",
+      "raw": "0x7C642A15",
+      "addr": "0x82000000",
+      "mnemonic": "add.",
+      "operands": "r3, r4, r5"
+    },
+    {
+      "label": "addo r3,r4,r5",
+      "raw": "0x7C642E14",
+      "addr": "0x82000000",
+      "mnemonic": "addo",
+      "operands": "r3, r4, r5"
+    },
+    {
+      "label": "addo. r3,r4,r5",
+      "raw": "0x7C642E15",
+      "addr": "0x82000000",
+      "mnemonic": "addo.",
+      "operands": "r3, r4, r5"
+    },
+    {
+      "label": "neg r3,r4",
+      "raw": "0x7C6400D0",
+      "addr": "0x82000000",
+      "mnemonic": "neg",
+      "operands": "r3, r4"
+    },
+    {
+      "label": "mullw r3,r4,r5",
+      "raw": "0x7C6429D6",
+      "addr": "0x82000000",
+      "mnemonic": "mullw",
+      "operands": "r3, r4, r5"
+    },
+    {
+      "label": "divw r3,r4,r5",
+      "raw": "0x7C642BD6",
+      "addr": "0x82000000",
+      "mnemonic": "divw",
+      "operands": "r3, r4, r5"
+    },
+    {
+      "label": "mulhw. r3,r4,r5",
+      "raw": "0x7C642897",
+      "addr": "0x82000000",
+      "mnemonic": "mulhw.",
+      "operands": "r3, r4, r5"
+    },
+    {
+      "label": "mulhwu. r3,r4,r5",
+      "raw": "0x7C642817",
+      "addr": "0x82000000",
+      "mnemonic": "mulhwu.",
+      "operands": "r3, r4, r5"
+    },
+    {
+      "label": "mulld r3,r4,r5",
+      "raw": "0x7C6429D2",
+      "addr": "0x82000000",
+      "mnemonic": "mulld",
+      "operands": "r3, r4, r5"
+    },
+    {
+      "label": "and r3,r4,r5",
+      "raw": "0x7C832838",
+      "addr": "0x82000000",
+      "mnemonic": "and",
+      "operands": "r3, r4, r5"
+    },
+    {
+      "label": "or r3,r4,r5 (non-mr: rs!=rb)",
+      "raw": "0x7C832B78",
+      "addr": "0x82000000",
+      "mnemonic": "or",
+      "operands": "r3, r4, r5"
+    },
+    {
+      "label": "xor r3,r4,r5",
+      "raw": "0x7C832A78",
+      "addr": "0x82000000",
+      "mnemonic": "xor",
+      "operands": "r3, r4, r5"
+    },
+    {
+      "label": "nor r3,r4,r5",
+      "raw": "0x7C8328F8",
+      "addr": "0x82000000",
+      "mnemonic": "nor",
+      "operands": "r3, r4, r5"
+    },
+    {
+      "label": "nand r3,r4,r5",
+      "raw": "0x7C832BB8",
+      "addr": "0x82000000",
+      "mnemonic": "nand",
+      "operands": "r3, r4, r5"
+    },
+    {
+      "label": "eqv r3,r4,r5",
+      "raw": "0x7C832A38",
+      "addr": "0x82000000",
+      "mnemonic": "eqv",
+      "operands": "r3, r4, r5"
+    },
+    {
+      "label": "andc r3,r4,r5",
+      "raw": "0x7C832878",
+      "addr": "0x82000000",
+      "mnemonic": "andc",
+      "operands": "r3, r4, r5"
+    },
+    {
+      "label": "orc r3,r4,r5",
+      "raw": "0x7C832B38",
+      "addr": "0x82000000",
+      "mnemonic": "orc",
+      "operands": "r3, r4, r5"
+    },
+    {
+      "label": "slw r3,r4,r5",
+      "raw": "0x7C832830",
+      "addr": "0x82000000",
+      "mnemonic": "slw",
+      "operands": "r3, r4, r5"
+    },
+    {
+      "label": "srw r3,r4,r5",
+      "raw": "0x7C832C30",
+      "addr": "0x82000000",
+      "mnemonic": "srw",
+      "operands": "r3, r4, r5"
+    },
+    {
+      "label": "sraw r3,r4,r5",
+      "raw": "0x7C832E30",
+      "addr": "0x82000000",
+      "mnemonic": "sraw",
+      "operands": "r3, r4, r5"
+    },
+    {
+      "label": "sld r3,r4,r5",
+      "raw": "0x7C832836",
+      "addr": "0x82000000",
+      "mnemonic": "sld",
+      "operands": "r3, r4, r5"
+    },
+    {
+      "label": "srd r3,r4,r5",
+      "raw": "0x7C832C36",
+      "addr": "0x82000000",
+      "mnemonic": "srd",
+      "operands": "r3, r4, r5"
+    },
+    {
+      "label": "srawi r3,r4,16",
+      "raw": "0x7C838670",
+      "addr": "0x82000000",
+      "mnemonic": "srawi",
+      "operands": "r3, r4, 16"
+    },
+    {
+      "label": "stwcx. r3,r4,r5",
+      "raw": "0x7C64292D",
+      "addr": "0x82000000",
+      "mnemonic": "stwcx.",
+      "operands": "r3, r4, r5"
+    },
+    {
+      "label": "stdcx. r3,r4,r5",
+      "raw": "0x7C6429AD",
+      "addr": "0x82000000",
+      "mnemonic": "stdcx.",
+      "operands": "r3, r4, r5"
+    },
+    {
+      "label": "lwarx r3,r4,r5",
+      "raw": "0x7C642828",
+      "addr": "0x82000000",
+      "mnemonic": "lwarx",
+      "operands": "r3, r4, r5"
+    },
+    {
+      "label": "ldarx r3,r4,r5",
+      "raw": "0x7C6428A8",
+      "addr": "0x82000000",
+      "mnemonic": "ldarx",
+      "operands": "r3, r4, r5"
+    },
+    {
+      "label": "cmpwi cr0, r3, 16",
+      "raw": "0x2C030010",
+      "addr": "0x82000000",
+      "mnemonic": "cmpi",
+      "operands": "0, r3, 16",
+      "ext_mnemonic": "cmpwi",
+      "ext_operands": "r3, 16"
+    },
+    {
+      "label": "cmpwi cr2, r3, 16",
+      "raw": "0x2D030010",
+      "addr": "0x82000000",
+      "mnemonic": "cmpi",
+      "operands": "cr2, 0, r3, 16",
+      "ext_mnemonic": "cmpwi",
+      "ext_operands": "cr2, r3, 16"
+    },
+    {
+      "label": "cmplwi cr0, r3, 16",
+      "raw": "0x28030010",
+      "addr": "0x82000000",
+      "mnemonic": "cmpli",
+      "operands": "0, r3, 0x10",
+      "ext_mnemonic": "cmplwi",
+      "ext_operands": "r3, 0x10"
+    },
+    {
+      "label": "cmpw r3,r4 in cr0",
+      "raw": "0x7C032000",
+      "addr": "0x82000000",
+      "mnemonic": "cmp",
+      "operands": "0, r3, r4",
+      "ext_mnemonic": "cmpw",
+      "ext_operands": "r3, r4"
+    },
+    {
+      "label": "cmpd r3,r4",
+      "raw": "0x7C232000",
+      "addr": "0x82000000",
+      "mnemonic": "cmp",
+      "operands": "1, r3, r4",
+      "ext_mnemonic": "cmpd",
+      "ext_operands": "r3, r4"
+    },
+    {
+      "label": "cmplw r3,r4",
+      "raw": "0x7C032040",
+      "addr": "0x82000000",
+      "mnemonic": "cmpl",
+      "operands": "0, r3, r4",
+      "ext_mnemonic": "cmplw",
+      "ext_operands": "r3, r4"
+    },
+    {
+      "label": "addi r3, r1, 16",
+      "raw": "0x38610010",
+      "addr": "0x82000000",
+      "mnemonic": "addi",
+      "operands": "r3, r1, 16"
+    },
+    {
+      "label": "addis r3, r1, 0x100 (ra!=0)",
+      "raw": "0x3C610100",
+      "addr": "0x82000000",
+      "mnemonic": "addis",
+      "operands": "r3, r1, 0x100"
+    },
+    {
+      "label": "mulli r3, r4, 5",
+      "raw": "0x1C640005",
+      "addr": "0x82000000",
+      "mnemonic": "mulli",
+      "operands": "r3, r4, 5"
+    },
+    {
+      "label": "subfic r3, r4, 5",
+      "raw": "0x20640005",
+      "addr": "0x82000000",
+      "mnemonic": "subfic",
+      "operands": "r3, r4, 5"
+    },
+    {
+      "label": "addic r3, r4, 16",
+      "raw": "0x30640010",
+      "addr": "0x82000000",
+      "mnemonic": "addic",
+      "operands": "r3, r4, 16"
+    },
+    {
+      "label": "addic. r3, r4, 16",
+      "raw": "0x34640010",
+      "addr": "0x82000000",
+      "mnemonic": "addic.",
+      "operands": "r3, r4, 16"
+    },
+    {
+      "label": "ori r4, r3, 0x10 (non-nop)",
+      "raw": "0x60640010",
+      "addr": "0x82000000",
+      "mnemonic": "ori",
+      "operands": "r4, r3, 0x10"
+    },
+    {
+      "label": "oris r4, r3, 0x10",
+      "raw": "0x64640010",
+      "addr": "0x82000000",
+      "mnemonic": "oris",
+      "operands": "r4, r3, 0x10"
+    },
+    {
+      "label": "xori r4, r3, 0x10",
+      "raw": "0x68640010",
+      "addr": "0x82000000",
+      "mnemonic": "xori",
+      "operands": "r4, r3, 0x10"
+    },
+    {
+      "label": "andi. r4, r3, 0x10",
+      "raw": "0x70640010",
+      "addr": "0x82000000",
+      "mnemonic": "andi.",
+      "operands": "r4, r3, 0x10"
+    },
+    {
+      "label": "lwz r5, 0x20(r1)",
+      "raw": "0x80A10020",
+      "addr": "0x82000000",
+      "mnemonic": "lwz",
+      "operands": "r5, 32(r1)"
+    },
+    {
+      "label": "stw r5, 0x20(r1)",
+      "raw": "0x90A10020",
+      "addr": "0x82000000",
+      "mnemonic": "stw",
+      "operands": "r5, 32(r1)"
+    },
+    {
+      "label": "lbz r5, 0x20(r1)",
+      "raw": "0x88A10020",
+      "addr": "0x82000000",
+      "mnemonic": "lbz",
+      "operands": "r5, 32(r1)"
+    },
+    {
+      "label": "lhz r5, 0x20(r1)",
+      "raw": "0xA0A10020",
+      "addr": "0x82000000",
+      "mnemonic": "lhz",
+      "operands": "r5, 32(r1)"
+    },
+    {
+      "label": "lfs f5, 0x20(r1)",
+      "raw": "0xC0A10020",
+      "addr": "0x82000000",
+      "mnemonic": "lfs",
+      "operands": "f5, 32(r1)"
+    },
+    {
+      "label": "lfd f5, 0x20(r1)",
+      "raw": "0xC8A10020",
+      "addr": "0x82000000",
+      "mnemonic": "lfd",
+      "operands": "f5, 32(r1)"
+    },
+    {
+      "label": "stfd f5, 0x20(r1)",
+      "raw": "0xD8A10020",
+      "addr": "0x82000000",
+      "mnemonic": "stfd",
+      "operands": "f5, 32(r1)"
+    },
+    {
+      "label": "ld r5, 0x20(r1)",
+      "raw": "0xE8A10020",
+      "addr": "0x82000000",
+      "mnemonic": "ld",
+      "operands": "r5, 32(r1)"
+    },
+    {
+      "label": "std r5, 0x20(r1)",
+      "raw": "0xF8A10020",
+      "addr": "0x82000000",
+      "mnemonic": "std",
+      "operands": "r5, 32(r1)"
+    },
+    {
+      "label": "sync 0 (extends to sync)",
+      "raw": "0x7C0004AC",
+      "addr": "0x82000000",
+      "mnemonic": "sync",
+      "operands": ""
+    },
+    {
+      "label": "isync",
+      "raw": "0x4C00012C",
+      "addr": "0x82000000",
+      "mnemonic": "isync",
+      "operands": ""
+    },
+    {
+      "label": "eieio",
+      "raw": "0x7C0006AC",
+      "addr": "0x82000000",
+      "mnemonic": "eieio",
+      "operands": ""
+    },
+    {
+      "label": "dcbst r1, r2",
+      "raw": "0x7C01106C",
+      "addr": "0x82000000",
+      "mnemonic": "dcbst",
+      "operands": "r1, r2"
+    },
+    {
+      "label": "dcbf r1, r2",
+      "raw": "0x7C0110AC",
+      "addr": "0x82000000",
+      "mnemonic": "dcbf",
+      "operands": "r1, r2"
+    },
+    {
+      "label": "dcbt r1, r2",
+      "raw": "0x7C01122C",
+      "addr": "0x82000000",
+      "mnemonic": "dcbt",
+      "operands": "r1, r2"
+    },
+    {
+      "label": "dcbz r1, r2",
+      "raw": "0x7C0117EC",
+      "addr": "0x82000000",
+      "mnemonic": "dcbz",
+      "operands": "r1, r2"
+    },
+    {
+      "label": "dcbz128 r1, r2",
+      "raw": "0x7C2117EC",
+      "addr": "0x82000000",
+      "mnemonic": "dcbz128",
+      "operands": "r1, r2"
+    },
+    {
+      "label": "crnor 4,5,6 (no simplify)",
+      "raw": "0x4C853042",
+      "addr": "0x82000000",
+      "mnemonic": "crnor",
+      "operands": "4*cr1+lt, 4*cr1+gt, 4*cr1+eq"
+    },
+    {
+      "label": "crand 4,5,6",
+      "raw": "0x4C853202",
+      "addr": "0x82000000",
+      "mnemonic": "crand",
+      "operands": "4*cr1+lt, 4*cr1+gt, 4*cr1+eq"
+    },
+    {
+      "label": "cror 4,5,6 (no simplify)",
+      "raw": "0x4C853382",
+      "addr": "0x82000000",
+      "mnemonic": "cror",
+      "operands": "4*cr1+lt, 4*cr1+gt, 4*cr1+eq"
+    },
+    {
+      "label": "tw 11, r3, r4 (uncommon TO)",
+      "raw": "0x7D632008",
+      "addr": "0x82000000",
+      "mnemonic": "tw",
+      "operands": "11, r3, r4"
+    },
+    {
+      "label": "tdi 11, r3, 123",
+      "raw": "0x0963007B",
+      "addr": "0x82000000",
+      "mnemonic": "tdi",
+      "operands": "11, r3, 123"
+    },
+    {
+      "label": "mtcrf 0xFF, r5 → mtcr",
+      "raw": "0x7CAFF120",
+      "addr": "0x82000000",
+      "mnemonic": "mtcrf",
+      "operands": "0xFF, r5",
+      "ext_mnemonic": "mtcr",
+      "ext_operands": "r5"
+    },
+    {
+      "label": "mfcr r5",
+      "raw": "0x7CA00026",
+      "addr": "0x82000000",
+      "mnemonic": "mfcr",
+      "operands": "r5"
+    },
+    {
+      "label": "mfmsr r5",
+      "raw": "0x7CA000A6",
+      "addr": "0x82000000",
+      "mnemonic": "mfmsr",
+      "operands": "r5"
+    },
+    {
+      "label": "mtmsr r5",
+      "raw": "0x7CA00124",
+      "addr": "0x82000000",
+      "mnemonic": "mtmsr",
+      "operands": "r5"
+    },
+    {
+      "label": "mtmsrd r5",
+      "raw": "0x7CA00164",
+      "addr": "0x82000000",
+      "mnemonic": "mtmsrd",
+      "operands": "r5"
+    },
+    {
+      "label": "fadd f3, f4, f5",
+      "raw": "0xFC64282A",
+      "addr": "0x82000000",
+      "mnemonic": "fadd",
+      "operands": "f3, f4, f5"
+    },
+    {
+      "label": "fsub f3, f4, f5",
+      "raw": "0xFC642828",
+      "addr": "0x82000000",
+      "mnemonic": "fsub",
+      "operands": "f3, f4, f5"
+    },
+    {
+      "label": "fdiv f3, f4, f5",
+      "raw": "0xFC642824",
+      "addr": "0x82000000",
+      "mnemonic": "fdiv",
+      "operands": "f3, f4, f5"
+    },
+    {
+      "label": "fmul f3, f0, f5 (encoded)",
+      "raw": "0xFCE02832",
+      "addr": "0x82000000",
+      "mnemonic": "fmul",
+      "operands": "f7, f0, f0"
+    },
+    {
+      "label": "fneg f3, f4",
+      "raw": "0xFC640050",
+      "addr": "0x82000000",
+      "mnemonic": "fneg",
+      "operands": "f3, f0"
+    },
+    {
+      "label": "fmr f3, f4",
+      "raw": "0xFC640090",
+      "addr": "0x82000000",
+      "mnemonic": "fmr",
+      "operands": "f3, f0"
+    },
+    {
+      "label": "mtfsf 0xFF, f5 (Rc=0)",
+      "raw": "0xFDFE2D8E",
+      "addr": "0x82000000",
+      "mnemonic": "mtfsf",
+      "operands": "0xFF, f5"
+    },
+    {
+      "label": "mtfsf. 0xFF, f5 (Rc=1)",
+      "raw": "0xFDFE2D8F",
+      "addr": "0x82000000",
+      "mnemonic": "mtfsf.",
+      "operands": "0xFF, f5"
+    }
+  ]
+}
diff --git a/crates/xenia-cpu/tests/golden/extended_mnemonics.json b/crates/xenia-cpu/tests/golden/extended_mnemonics.json
new file mode 100644
index 0000000..d869109
--- /dev/null
+++ b/crates/xenia-cpu/tests/golden/extended_mnemonics.json
@@ -0,0 +1,621 @@
+{
+  "rows": [
+    {
+      "label": "nop",
+      "raw": "0x60000000",
+      "addr": "0x82000000",
+      "mnemonic": "ori",
+      "operands": "r0, r0, 0x0",
+      "ext_mnemonic": "nop",
+      "ext_operands": ""
+    },
+    {
+      "label": "li r3, 16",
+      "raw": "0x38600010",
+      "addr": "0x82000000",
+      "mnemonic": "addi",
+      "operands": "r3, r0, 16",
+      "ext_mnemonic": "li",
+      "ext_operands": "r3, 16"
+    },
+    {
+      "label": "li r3, -1",
+      "raw": "0x3860FFFF",
+      "addr": "0x82000000",
+      "mnemonic": "addi",
+      "operands": "r3, r0, -1",
+      "ext_mnemonic": "li",
+      "ext_operands": "r3, -1"
+    },
+    {
+      "label": "subi r3, r4, 16",
+      "raw": "0x3864FFF0",
+      "addr": "0x82000000",
+      "mnemonic": "addi",
+      "operands": "r3, r4, -16",
+      "ext_mnemonic": "subi",
+      "ext_operands": "r3, r4, 16"
+    },
+    {
+      "label": "lis r3, 0x1234",
+      "raw": "0x3C601234",
+      "addr": "0x82000000",
+      "mnemonic": "addis",
+      "operands": "r3, r0, 0x1234",
+      "ext_mnemonic": "lis",
+      "ext_operands": "r3, 0x1234"
+    },
+    {
+      "label": "subis r3, r4, 0xFFFF",
+      "raw": "0x3C64FFFF",
+      "addr": "0x82000000",
+      "mnemonic": "addis",
+      "operands": "r3, r4, 0xFFFF",
+      "ext_mnemonic": "subis",
+      "ext_operands": "r3, r4, 0x1"
+    },
+    {
+      "label": "mr r3, r4",
+      "raw": "0x7C832378",
+      "addr": "0x82000000",
+      "mnemonic": "or",
+      "operands": "r3, r4, r4",
+      "ext_mnemonic": "mr",
+      "ext_operands": "r3, r4"
+    },
+    {
+      "label": "mr. r3, r4",
+      "raw": "0x7C832379",
+      "addr": "0x82000000",
+      "mnemonic": "or.",
+      "operands": "r3, r4, r4",
+      "ext_mnemonic": "mr.",
+      "ext_operands": "r3, r4"
+    },
+    {
+      "label": "mr (via and)",
+      "raw": "0x7C832038",
+      "addr": "0x82000000",
+      "mnemonic": "and",
+      "operands": "r3, r4, r4",
+      "ext_mnemonic": "mr",
+      "ext_operands": "r3, r4"
+    },
+    {
+      "label": "not r3, r4",
+      "raw": "0x7C8320F8",
+      "addr": "0x82000000",
+      "mnemonic": "nor",
+      "operands": "r3, r4, r4",
+      "ext_mnemonic": "not",
+      "ext_operands": "r3, r4"
+    },
+    {
+      "label": "subf → sub r3, r5, r4",
+      "raw": "0x7C642850",
+      "addr": "0x82000000",
+      "mnemonic": "subf",
+      "operands": "r3, r4, r5",
+      "ext_mnemonic": "sub",
+      "ext_operands": "r3, r5, r4"
+    },
+    {
+      "label": "slwi r3, r4, 4",
+      "raw": "0x54832036",
+      "addr": "0x82000000",
+      "mnemonic": "rlwinm",
+      "operands": "r3, r4, 4, 0, 27",
+      "ext_mnemonic": "slwi",
+      "ext_operands": "r3, r4, 4"
+    },
+    {
+      "label": "srwi r3, r4, 4",
+      "raw": "0x5483E13E",
+      "addr": "0x82000000",
+      "mnemonic": "rlwinm",
+      "operands": "r3, r4, 28, 4, 31",
+      "ext_mnemonic": "srwi",
+      "ext_operands": "r3, r4, 4"
+    },
+    {
+      "label": "rotlwi r3, r4, 8",
+      "raw": "0x5483403E",
+      "addr": "0x82000000",
+      "mnemonic": "rlwinm",
+      "operands": "r3, r4, 8, 0, 31",
+      "ext_mnemonic": "rotlwi",
+      "ext_operands": "r3, r4, 8"
+    },
+    {
+      "label": "clrlwi r3, r4, 4",
+      "raw": "0x5483013E",
+      "addr": "0x82000000",
+      "mnemonic": "rlwinm",
+      "operands": "r3, r4, 0, 4, 31",
+      "ext_mnemonic": "clrlwi",
+      "ext_operands": "r3, r4, 4"
+    },
+    {
+      "label": "clrrwi r3, r4, 4",
+      "raw": "0x54830036",
+      "addr": "0x82000000",
+      "mnemonic": "rlwinm",
+      "operands": "r3, r4, 0, 0, 27",
+      "ext_mnemonic": "clrrwi",
+      "ext_operands": "r3, r4, 4"
+    },
+    {
+      "label": "extlwi r3, r4, 8, 8",
+      "raw": "0x5483400E",
+      "addr": "0x82000000",
+      "mnemonic": "rlwinm",
+      "operands": "r3, r4, 8, 0, 7",
+      "ext_mnemonic": "extlwi",
+      "ext_operands": "r3, r4, 8, 8"
+    },
+    {
+      "label": "slwi. r3, r4, 4",
+      "raw": "0x54832037",
+      "addr": "0x82000000",
+      "mnemonic": "rlwinm.",
+      "operands": "r3, r4, 4, 0, 27",
+      "ext_mnemonic": "slwi.",
+      "ext_operands": "r3, r4, 4"
+    },
+    {
+      "label": "rlwinm. r11,r11,0,31,31 (no simplify)",
+      "raw": "0x556B07FF",
+      "addr": "0x82000000",
+      "mnemonic": "rlwinm.",
+      "operands": "r11, r11, 0, 31, 31",
+      "ext_mnemonic": "clrlwi.",
+      "ext_operands": "r11, r11, 31"
+    },
+    {
+      "label": "clrldi r3, r4, 32",
+      "raw": "0x78830020",
+      "addr": "0x82000000",
+      "mnemonic": "rldicl",
+      "operands": "r3, r4, 0, 32",
+      "ext_mnemonic": "clrldi",
+      "ext_operands": "r3, r4, 32"
+    },
+    {
+      "label": "srdi r3, r4, 8",
+      "raw": "0x7883E200",
+      "addr": "0x82000000",
+      "mnemonic": "rldicl",
+      "operands": "r3, r4, 56, 8",
+      "ext_mnemonic": "srdi",
+      "ext_operands": "r3, r4, 8"
+    },
+    {
+      "label": "rotldi r3, r4, 8",
+      "raw": "0x78832000",
+      "addr": "0x82000000",
+      "mnemonic": "rldicl",
+      "operands": "r3, r4, 8, 0",
+      "ext_mnemonic": "rotldi",
+      "ext_operands": "r3, r4, 8"
+    },
+    {
+      "label": "cmpwi cr0, r3, 16",
+      "raw": "0x2C030010",
+      "addr": "0x82000000",
+      "mnemonic": "cmpi",
+      "operands": "0, r3, 16",
+      "ext_mnemonic": "cmpwi",
+      "ext_operands": "r3, 16"
+    },
+    {
+      "label": "cmpdi (L=1) variant",
+      "raw": "0x2C230010",
+      "addr": "0x82000000",
+      "mnemonic": "cmpi",
+      "operands": "1, r3, 16",
+      "ext_mnemonic": "cmpdi",
+      "ext_operands": "r3, 16"
+    },
+    {
+      "label": "blr",
+      "raw": "0x4E800020",
+      "addr": "0x82000000",
+      "mnemonic": "bclr",
+      "operands": "20, lt",
+      "ext_mnemonic": "blr",
+      "ext_operands": ""
+    },
+    {
+      "label": "blrl",
+      "raw": "0x4E800021",
+      "addr": "0x82000000",
+      "mnemonic": "bclrl",
+      "operands": "20, lt",
+      "ext_mnemonic": "blrl",
+      "ext_operands": ""
+    },
+    {
+      "label": "bctr",
+      "raw": "0x4E800420",
+      "addr": "0x82000000",
+      "mnemonic": "bcctr",
+      "operands": "20, lt",
+      "ext_mnemonic": "bctr",
+      "ext_operands": ""
+    },
+    {
+      "label": "bctrl",
+      "raw": "0x4E800421",
+      "addr": "0x82000000",
+      "mnemonic": "bcctrl",
+      "operands": "20, lt",
+      "ext_mnemonic": "bctrl",
+      "ext_operands": ""
+    },
+    {
+      "label": "beqlr (BO=12, BI=2 → cr0.eq true)",
+      "raw": "0x4D820020",
+      "addr": "0x82000000",
+      "mnemonic": "bclr",
+      "operands": "12, eq",
+      "ext_mnemonic": "beqlr",
+      "ext_operands": ""
+    },
+    {
+      "label": "bnelr",
+      "raw": "0x4C820020",
+      "addr": "0x82000000",
+      "mnemonic": "bclr",
+      "operands": "4, eq",
+      "ext_mnemonic": "bnelr",
+      "ext_operands": ""
+    },
+    {
+      "label": "bc → b 0x82000040",
+      "raw": "0x42800040",
+      "addr": "0x82000000",
+      "mnemonic": "bc",
+      "operands": "20, lt, 0x82000040",
+      "ext_mnemonic": "b",
+      "ext_operands": "0x82000040",
+      "branch_target": "0x82000040"
+    },
+    {
+      "label": "bc l → bl 0x82000040",
+      "raw": "0x42800041",
+      "addr": "0x82000000",
+      "mnemonic": "bcl",
+      "operands": "20, lt, 0x82000040",
+      "ext_mnemonic": "bl",
+      "ext_operands": "0x82000040",
+      "branch_target": "0x82000040"
+    },
+    {
+      "label": "bc 12,cr0.eq → beq 0x82000040",
+      "raw": "0x41820040",
+      "addr": "0x82000000",
+      "mnemonic": "bc",
+      "operands": "12, eq, 0x82000040",
+      "ext_mnemonic": "beq",
+      "ext_operands": "0x82000040",
+      "branch_target": "0x82000040"
+    },
+    {
+      "label": "bc 4,cr0.eq → bne 0x82000040",
+      "raw": "0x40820040",
+      "addr": "0x82000000",
+      "mnemonic": "bc",
+      "operands": "4, eq, 0x82000040",
+      "ext_mnemonic": "bne",
+      "ext_operands": "0x82000040",
+      "branch_target": "0x82000040"
+    },
+    {
+      "label": "bc 12,cr0.lt → blt 0x82000040",
+      "raw": "0x41800040",
+      "addr": "0x82000000",
+      "mnemonic": "bc",
+      "operands": "12, lt, 0x82000040",
+      "ext_mnemonic": "blt",
+      "ext_operands": "0x82000040",
+      "branch_target": "0x82000040"
+    },
+    {
+      "label": "bc 4,cr0.lt → bge 0x82000040",
+      "raw": "0x40800040",
+      "addr": "0x82000000",
+      "mnemonic": "bc",
+      "operands": "4, lt, 0x82000040",
+      "ext_mnemonic": "bge",
+      "ext_operands": "0x82000040",
+      "branch_target": "0x82000040"
+    },
+    {
+      "label": "bc 12,cr0.gt → bgt 0x82000040",
+      "raw": "0x41810040",
+      "addr": "0x82000000",
+      "mnemonic": "bc",
+      "operands": "12, gt, 0x82000040",
+      "ext_mnemonic": "bgt",
+      "ext_operands": "0x82000040",
+      "branch_target": "0x82000040"
+    },
+    {
+      "label": "bc 4,cr0.gt → ble 0x82000040",
+      "raw": "0x40810040",
+      "addr": "0x82000000",
+      "mnemonic": "bc",
+      "operands": "4, gt, 0x82000040",
+      "ext_mnemonic": "ble",
+      "ext_operands": "0x82000040",
+      "branch_target": "0x82000040"
+    },
+    {
+      "label": "bc 12, cr2.eq → beq cr2, 0x...040",
+      "raw": "0x418A0040",
+      "addr": "0x82000000",
+      "mnemonic": "bc",
+      "operands": "12, 4*cr2+eq, 0x82000040",
+      "ext_mnemonic": "beq",
+      "ext_operands": "cr2, 0x82000040",
+      "branch_target": "0x82000040"
+    },
+    {
+      "label": "bdnz 0x82000040",
+      "raw": "0x42000040",
+      "addr": "0x82000000",
+      "mnemonic": "bc",
+      "operands": "16, lt, 0x82000040",
+      "ext_mnemonic": "bdnzge",
+      "ext_operands": "0x82000040",
+      "branch_target": "0x82000040"
+    },
+    {
+      "label": "bdz 0x82000040",
+      "raw": "0x42400040",
+      "addr": "0x82000000",
+      "mnemonic": "bc",
+      "operands": "18, lt, 0x82000040",
+      "ext_mnemonic": "bdzge",
+      "ext_operands": "0x82000040",
+      "branch_target": "0x82000040"
+    },
+    {
+      "label": "b +0x40 → 0x82000040",
+      "raw": "0x48000040",
+      "addr": "0x82000000",
+      "mnemonic": "b",
+      "operands": "0x82000040",
+      "branch_target": "0x82000040"
+    },
+    {
+      "label": "bl +0x40 → 0x82000040",
+      "raw": "0x48000041",
+      "addr": "0x82000000",
+      "mnemonic": "bl",
+      "operands": "0x82000040",
+      "branch_target": "0x82000040"
+    },
+    {
+      "label": "ba 0x40 absolute",
+      "raw": "0x48000042",
+      "addr": "0x82000000",
+      "mnemonic": "ba",
+      "operands": "0x00000040",
+      "branch_target": "0x00000040"
+    },
+    {
+      "label": "bla 0x40 absolute",
+      "raw": "0x48000043",
+      "addr": "0x82000000",
+      "mnemonic": "bla",
+      "operands": "0x00000040",
+      "branch_target": "0x00000040"
+    },
+    {
+      "label": "tdeqi r3, 123",
+      "raw": "0x0883007B",
+      "addr": "0x82000000",
+      "mnemonic": "tdi",
+      "operands": "4, r3, 123",
+      "ext_mnemonic": "tdeqi",
+      "ext_operands": "r3, 123"
+    },
+    {
+      "label": "twlti r3, 123",
+      "raw": "0x0E03007B",
+      "addr": "0x82000000",
+      "mnemonic": "twi",
+      "operands": "16, r3, 123",
+      "ext_mnemonic": "twlti",
+      "ext_operands": "r3, 123"
+    },
+    {
+      "label": "mflr r3",
+      "raw": "0x7C6802A6",
+      "addr": "0x82000000",
+      "mnemonic": "mfspr",
+      "operands": "r3, LR",
+      "ext_mnemonic": "mflr",
+      "ext_operands": "r3"
+    },
+    {
+      "label": "mfctr r3",
+      "raw": "0x7C6902A6",
+      "addr": "0x82000000",
+      "mnemonic": "mfspr",
+      "operands": "r3, CTR",
+      "ext_mnemonic": "mfctr",
+      "ext_operands": "r3"
+    },
+    {
+      "label": "mfxer r3",
+      "raw": "0x7C6102A6",
+      "addr": "0x82000000",
+      "mnemonic": "mfspr",
+      "operands": "r3, XER",
+      "ext_mnemonic": "mfxer",
+      "ext_operands": "r3"
+    },
+    {
+      "label": "mtlr r3",
+      "raw": "0x7C6803A6",
+      "addr": "0x82000000",
+      "mnemonic": "mtspr",
+      "operands": "LR, r3",
+      "ext_mnemonic": "mtlr",
+      "ext_operands": "r3"
+    },
+    {
+      "label": "mtctr r3",
+      "raw": "0x7C6903A6",
+      "addr": "0x82000000",
+      "mnemonic": "mtspr",
+      "operands": "CTR, r3",
+      "ext_mnemonic": "mtctr",
+      "ext_operands": "r3"
+    },
+    {
+      "label": "mtxer r3",
+      "raw": "0x7C6103A6",
+      "addr": "0x82000000",
+      "mnemonic": "mtspr",
+      "operands": "XER, r3",
+      "ext_mnemonic": "mtxer",
+      "ext_operands": "r3"
+    },
+    {
+      "label": "crnot 4, 5",
+      "raw": "0x4C852842",
+      "addr": "0x82000000",
+      "mnemonic": "crnor",
+      "operands": "4*cr1+lt, 4*cr1+gt, 4*cr1+gt",
+      "ext_mnemonic": "crnot",
+      "ext_operands": "4*cr1+lt, 4*cr1+gt"
+    },
+    {
+      "label": "crclr 4",
+      "raw": "0x4C842182",
+      "addr": "0x82000000",
+      "mnemonic": "crxor",
+      "operands": "4*cr1+lt, 4*cr1+lt, 4*cr1+lt",
+      "ext_mnemonic": "crclr",
+      "ext_operands": "4*cr1+lt"
+    },
+    {
+      "label": "crset 4",
+      "raw": "0x4C842242",
+      "addr": "0x82000000",
+      "mnemonic": "creqv",
+      "operands": "4*cr1+lt, 4*cr1+lt, 4*cr1+lt",
+      "ext_mnemonic": "crset",
+      "ext_operands": "4*cr1+lt"
+    },
+    {
+      "label": "crmove 4, 5",
+      "raw": "0x4C852B82",
+      "addr": "0x82000000",
+      "mnemonic": "cror",
+      "operands": "4*cr1+lt, 4*cr1+gt, 4*cr1+gt",
+      "ext_mnemonic": "crmove",
+      "ext_operands": "4*cr1+lt, 4*cr1+gt"
+    },
+    {
+      "label": "lwsync",
+      "raw": "0x7C2004AC",
+      "addr": "0x82000000",
+      "mnemonic": "sync",
+      "operands": ""
+    },
+    {
+      "label": "trap",
+      "raw": "0x7FE00008",
+      "addr": "0x82000000",
+      "mnemonic": "tw",
+      "operands": "31, r0, r0",
+      "ext_mnemonic": "trap",
+      "ext_operands": ""
+    },
+    {
+      "label": "blr (BO=20, BI=4 — BI is don't-care)",
+      "raw": "0x4E840020",
+      "addr": "0x82000000",
+      "mnemonic": "bclr",
+      "operands": "20, 4*cr1+lt",
+      "ext_mnemonic": "blr",
+      "ext_operands": ""
+    },
+    {
+      "label": "blrl (BO=20, BI=7)",
+      "raw": "0x4E870021",
+      "addr": "0x82000000",
+      "mnemonic": "bclrl",
+      "operands": "20, 4*cr1+so",
+      "ext_mnemonic": "blrl",
+      "ext_operands": ""
+    },
+    {
+      "label": "bctr (BO=20, BI=4)",
+      "raw": "0x4E840420",
+      "addr": "0x82000000",
+      "mnemonic": "bcctr",
+      "operands": "20, 4*cr1+lt",
+      "ext_mnemonic": "bctr",
+      "ext_operands": ""
+    },
+    {
+      "label": "twllt r3, r4 (TO=2)",
+      "raw": "0x7C432008",
+      "addr": "0x82000000",
+      "mnemonic": "tw",
+      "operands": "2, r3, r4",
+      "ext_mnemonic": "twllt",
+      "ext_operands": "r3, r4"
+    },
+    {
+      "label": "twlgt r3, r4 (TO=1)",
+      "raw": "0x7C232008",
+      "addr": "0x82000000",
+      "mnemonic": "tw",
+      "operands": "1, r3, r4",
+      "ext_mnemonic": "twlgt",
+      "ext_operands": "r3, r4"
+    },
+    {
+      "label": "tdlge r3, r4 (TO=5)",
+      "raw": "0x7CA32088",
+      "addr": "0x82000000",
+      "mnemonic": "td",
+      "operands": "5, r3, r4",
+      "ext_mnemonic": "tdlge",
+      "ext_operands": "r3, r4"
+    },
+    {
+      "label": "twlle r3, r4 (TO=6)",
+      "raw": "0x7CC32008",
+      "addr": "0x82000000",
+      "mnemonic": "tw",
+      "operands": "6, r3, r4",
+      "ext_mnemonic": "twlle",
+      "ext_operands": "r3, r4"
+    },
+    {
+      "label": "twllti r3, 16",
+      "raw": "0x0C430010",
+      "addr": "0x82000000",
+      "mnemonic": "twi",
+      "operands": "2, r3, 16",
+      "ext_mnemonic": "twllti",
+      "ext_operands": "r3, 16"
+    },
+    {
+      "label": "tdlgei r3, 16",
+      "raw": "0x08A30010",
+      "addr": "0x82000000",
+      "mnemonic": "tdi",
+      "operands": "5, r3, 16",
+      "ext_mnemonic": "tdlgei",
+      "ext_operands": "r3, 16"
+    }
+  ]
+}
diff --git a/crates/xenia-cpu/tests/golden/vmx128_registers.json b/crates/xenia-cpu/tests/golden/vmx128_registers.json
new file mode 100644
index 0000000..1d072cd
--- /dev/null
+++ b/crates/xenia-cpu/tests/golden/vmx128_registers.json
@@ -0,0 +1,137 @@
+{
+  "rows": [
+    {
+      "label": "vaddubm v3, v4, v5",
+      "raw": "0x10642800",
+      "addr": "0x82000000",
+      "mnemonic": "vaddubm",
+      "operands": "v3, v4, v5"
+    },
+    {
+      "label": "vaddfp v3, v4, v5",
+      "raw": "0x1064280A",
+      "addr": "0x82000000",
+      "mnemonic": "vaddfp",
+      "operands": "v3, v4, v5"
+    },
+    {
+      "label": "vand v3, v4, v5",
+      "raw": "0x10642C04",
+      "addr": "0x82000000",
+      "mnemonic": "vand",
+      "operands": "v3, v4, v5"
+    },
+    {
+      "label": "vor v3, v4, v5",
+      "raw": "0x10642C84",
+      "addr": "0x82000000",
+      "mnemonic": "vor",
+      "operands": "v3, v4, v5"
+    },
+    {
+      "label": "vxor v3, v4, v5",
+      "raw": "0x10642CC4",
+      "addr": "0x82000000",
+      "mnemonic": "vxor",
+      "operands": "v3, v4, v5"
+    },
+    {
+      "label": "vsel v3,v4,v5,v6",
+      "raw": "0x106429AA",
+      "addr": "0x82000000",
+      "mnemonic": "vsel",
+      "operands": "v3, v4, v5, v6"
+    },
+    {
+      "label": "vperm v3,v4,v5,v6",
+      "raw": "0x106429AB",
+      "addr": "0x82000000",
+      "mnemonic": "vperm",
+      "operands": "v3, v4, v5, v6"
+    },
+    {
+      "label": "vmaddfp v3, v4, v6, v5 (swap)",
+      "raw": "0x106429AE",
+      "addr": "0x82000000",
+      "mnemonic": "vmaddfp",
+      "operands": "v3, v4, v6, v5"
+    },
+    {
+      "label": "mfvscr v3",
+      "raw": "0x10600604",
+      "addr": "0x82000000",
+      "mnemonic": "mfvscr",
+      "operands": "v3"
+    },
+    {
+      "label": "mtvscr v5",
+      "raw": "0x10002E44",
+      "addr": "0x82000000",
+      "mnemonic": "mtvscr",
+      "operands": "v5"
+    },
+    {
+      "label": "vaddfp128 (encoded sloppily)",
+      "raw": "0x14642801",
+      "addr": "0x82000000",
+      "mnemonic": "vperm128",
+      "operands": "v3, v3, v5, 0"
+    },
+    {
+      "label": "encoding vd_hi=00: actually vsrw128",
+      "raw": "0x180461D0",
+      "addr": "0x82000000",
+      "mnemonic": "vsrw128",
+      "operands": "v0, v0, v12"
+    },
+    {
+      "label": "encoding vd_hi=10: actually vsrw128 v32",
+      "raw": "0x180465D0",
+      "addr": "0x82000000",
+      "mnemonic": "vsrw128",
+      "operands": "v32, v0, v12"
+    },
+    {
+      "label": "encoding vd_hi=01: actually vpermwi128",
+      "raw": "0x180463D0",
+      "addr": "0x82000000",
+      "mnemonic": "vpermwi128",
+      "operands": "v64, v12, 0xE4"
+    },
+    {
+      "label": "vrlimi128 v96, v12, 4, 3 (real)",
+      "raw": "0x180467D0",
+      "addr": "0x82000000",
+      "mnemonic": "vrlimi128",
+      "operands": "v96, v12, 4, 3"
+    },
+    {
+      "label": "vrlimi128 v127, v127, 4, 3 (real)",
+      "raw": "0x1BE4FFD3",
+      "addr": "0x82000000",
+      "mnemonic": "vrlimi128",
+      "operands": "v127, v95, 4, 3"
+    },
+    {
+      "label": "vmaddfp128 v3, v35, v5, v3",
+      "raw": "0x146028D4",
+      "addr": "0x82000000",
+      "mnemonic": "vmaddfp128",
+      "operands": "v3, v35, v5, v3"
+    },
+    {
+      "label": "vmaddcfp128 v3, v35, v3, v5",
+      "raw": "0x14602914",
+      "addr": "0x82000000",
+      "mnemonic": "vmaddcfp128",
+      "operands": "v3, v35, v3, v5"
+    },
+    {
+      "label": "vnmsubfp128 v3, v35, v3, v5",
+      "raw": "0x14602954",
+      "addr": "0x82000000",
+      "mnemonic": "vnmsubfp128",
+      "operands": "v3, v35, v3, v5"
+    }
+  ]
+}