From c36cca14f9317374795e3ccff9ca3042888acbef Mon Sep 17 00:00:00 2001 From: MechaCat02 Date: Fri, 1 May 2026 16:27:43 +0200 Subject: [PATCH] xenia-cpu: VMX128, FPSCR, decoder split, scheduler, decode/block caches Split the monolithic interpreter into cohesive modules: dedicated decoder (decoder.rs) producing 8-byte DecodedInstr; opcode tables (opcode.rs); explicit traps (trap.rs); FPSCR helpers (fpscr.rs); overflow/carry helpers (overflow.rs); a 4 KiB-page-versioned decode cache and basic-block cache (block_cache.rs); and a full VMX/VMX128 implementation (vmx.rs) covering AltiVec + Xenon's 128-bit extensions. Add the parallel-execution substrate behind --parallel: a 7-party phaser (phaser.rs) for round-based barrier sync, ReservationTable (reservation.rs) for guest LL/SC, and the per-HW-thread scheduler core (scheduler.rs) that owns ThreadRefs, runqueues, and pending IRQs. Disassembler is now the single source of truth: disasm.rs gains the full base + extended + VMX128 mnemonic set, with golden JSON fixtures and a disasm_goldens test suite. Add a criterion-style interpreter bench. context.rs grows the per-thread state the new modules need (reservation slot, FPSCR, vector regs). Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/xenia-cpu/Cargo.toml | 8 + crates/xenia-cpu/benches/interpreter.rs | 194 + crates/xenia-cpu/src/block_cache.rs | 423 ++ crates/xenia-cpu/src/context.rs | 99 +- crates/xenia-cpu/src/decoder.rs | 288 ++ crates/xenia-cpu/src/disasm.rs | 2071 ++++++++-- crates/xenia-cpu/src/fpscr.rs | 384 ++ crates/xenia-cpu/src/interpreter.rs | 3441 ++++++++++++++++- crates/xenia-cpu/src/lib.rs | 16 + crates/xenia-cpu/src/opcode.rs | 84 + crates/xenia-cpu/src/overflow.rs | 173 + crates/xenia-cpu/src/phaser.rs | 345 ++ crates/xenia-cpu/src/reservation.rs | 424 ++ crates/xenia-cpu/src/scheduler.rs | 1919 +++++++++ crates/xenia-cpu/src/trap.rs | 95 + crates/xenia-cpu/src/vmx.rs | 918 +++++ crates/xenia-cpu/tests/disasm_goldens.rs | 531 +++ .../tests/golden/base_mnemonics.json | 571 +++ .../tests/golden/extended_mnemonics.json | 621 +++ .../tests/golden/vmx128_registers.json | 137 + 20 files changed, 12284 insertions(+), 458 deletions(-) create mode 100644 crates/xenia-cpu/benches/interpreter.rs create mode 100644 crates/xenia-cpu/src/block_cache.rs create mode 100644 crates/xenia-cpu/src/fpscr.rs create mode 100644 crates/xenia-cpu/src/overflow.rs create mode 100644 crates/xenia-cpu/src/phaser.rs create mode 100644 crates/xenia-cpu/src/reservation.rs create mode 100644 crates/xenia-cpu/src/scheduler.rs create mode 100644 crates/xenia-cpu/src/trap.rs create mode 100644 crates/xenia-cpu/src/vmx.rs create mode 100644 crates/xenia-cpu/tests/disasm_goldens.rs create mode 100644 crates/xenia-cpu/tests/golden/base_mnemonics.json create mode 100644 crates/xenia-cpu/tests/golden/extended_mnemonics.json create mode 100644 crates/xenia-cpu/tests/golden/vmx128_registers.json diff --git a/crates/xenia-cpu/Cargo.toml b/crates/xenia-cpu/Cargo.toml index 3ca488b..63b9166 100644 --- a/crates/xenia-cpu/Cargo.toml +++ b/crates/xenia-cpu/Cargo.toml @@ -10,3 +10,11 @@ xenia-memory = { workspace = true } tracing = { workspace = true } bitflags = { workspace = true } thiserror = { workspace = true } + +[dev-dependencies] +serde = { workspace = true } +serde_json = { workspace = true } + +[[bench]] +name = "interpreter" +harness = false diff --git a/crates/xenia-cpu/benches/interpreter.rs b/crates/xenia-cpu/benches/interpreter.rs new file mode 100644 index 0000000..7efd251 --- /dev/null +++ b/crates/xenia-cpu/benches/interpreter.rs @@ -0,0 +1,194 @@ +//! Interpreter throughput micro-benchmarks. +//! +//! Custom `harness = false` main — no extra dev-deps. Run via +//! `cargo bench -p xenia-cpu` (or `cargo run --release --bench interpreter`). +//! +//! Three workloads, each measuring `step_cached` throughput in MIPS: +//! +//! - `tight_alu_loop` — pure dispatch + ALU + decode-cache hit. +//! - `loadstore_loop` — alternating `lwz`/`stw` against main RAM. Stresses +//! every load/store path and `find_mmio` dispatch. +//! - `mmio_storm` — same shape as `loadstore_loop` but the address is +//! in a registered MMIO aperture. Sanity-checks that +//! MMIO writes still dispatch correctly. +//! +//! These are not statistically rigorous — no warmup, no variance — they're +//! just enough to detect 2x-class wins or regressions on the perf-track +//! changes (MMIO fast-reject, threaded dispatch, block cache). Numbers go +//! into commit messages; there is no automated baseline file. + +use std::sync::atomic::{AtomicU32, AtomicU64, Ordering}; +use std::sync::Arc; +use std::time::Instant; + +use xenia_cpu::context::PpcContext; +use xenia_cpu::decoder::DecodeCache; +use xenia_cpu::interpreter::{step_cached, StepResult}; +use xenia_memory::{GuestMemory, MemoryAccess, MmioRegion}; +use xenia_memory::page_table::MemoryProtect; + +// PPC instruction encoders — minimal subset needed by the benches. + +#[inline] +fn enc_addi(rd: u32, ra: u32, simm: i16) -> u32 { + (14 << 26) | (rd << 21) | (ra << 16) | (simm as u16 as u32) +} + +#[inline] +fn enc_lwz(rd: u32, ra: u32, d: i16) -> u32 { + (32 << 26) | (rd << 21) | (ra << 16) | (d as u16 as u32) +} + +#[inline] +fn enc_stw(rs: u32, ra: u32, d: i16) -> u32 { + (36 << 26) | (rs << 21) | (ra << 16) | (d as u16 as u32) +} + +/// Set up a `GuestMemory` with one writable region for code+data. +fn make_mem(code_base: u32, code_size: u32) -> GuestMemory { + let mut mem = GuestMemory::new().expect("reserve 4GB"); + mem.alloc(code_base, code_size, MemoryProtect::READ | MemoryProtect::WRITE) + .expect("alloc bench region"); + mem +} + +/// Write a sequence of raw PPC instructions starting at `base`. +fn write_program(mem: &GuestMemory, base: u32, instrs: &[u32]) { + for (i, &raw) in instrs.iter().enumerate() { + mem.write_u32(base + (i as u32 * 4), raw); + } +} + +/// Run `total_instrs` interpreter steps over a program of length `n`, +/// wrapping PC back to `base` whenever it falls off the end. Returns the +/// elapsed wall time. +fn run_loop( + ctx: &mut PpcContext, + mem: &GuestMemory, + cache: &mut DecodeCache, + base: u32, + n: u32, + total_instrs: u64, +) -> std::time::Duration { + let end = base + n * 4; + ctx.pc = base; + let t0 = Instant::now(); + for _ in 0..total_instrs { + let pv = mem.page_version(ctx.pc); + let r = step_cached(ctx, mem, cache, pv); + debug_assert!(matches!(r, StepResult::Continue)); + if ctx.pc >= end { + ctx.pc = base; + } + } + t0.elapsed() +} + +fn report(label: &str, total_instrs: u64, elapsed: std::time::Duration) { + let secs = elapsed.as_secs_f64(); + let mips = (total_instrs as f64) / secs / 1.0e6; + println!( + "{:<24} {:>12} instrs in {:>7.3}s = {:>7.2} MIPS", + label, total_instrs, secs, mips + ); +} + +fn bench_tight_alu_loop() { + const BASE: u32 = 0x1000; + const N: u32 = 256; + const TOTAL: u64 = 50_000_000; + + let mut mem = make_mem(BASE, 0x1000); + // 256 × `addi r3, r3, 1` — pure register-register, no memory touch + // beyond instruction fetch. + let prog: Vec = (0..N).map(|_| enc_addi(3, 3, 1)).collect(); + write_program(&mut mem, BASE, &prog); + + let mut ctx = PpcContext::new(); + let mut cache = DecodeCache::new(); + + let elapsed = run_loop(&mut ctx, &mut mem, &mut cache, BASE, N, TOTAL); + report("tight_alu_loop", TOTAL, elapsed); +} + +fn bench_loadstore_loop() { + const CODE_BASE: u32 = 0x1000; + const DATA_BASE: u32 = 0x2000; + const N: u32 = 256; + const TOTAL: u64 = 30_000_000; + + let mut mem = make_mem(CODE_BASE, 0x2000); + // 128 pairs of `stw r3, 0(r4); lwz r5, 0(r4)` — exercises every + // load/store path through `read_u32`/`write_u32` (incl. `find_mmio`). + let mut prog = Vec::with_capacity(N as usize); + for _ in 0..(N / 2) { + prog.push(enc_stw(3, 4, 0)); + prog.push(enc_lwz(5, 4, 0)); + } + write_program(&mut mem, CODE_BASE, &prog); + + let mut ctx = PpcContext::new(); + ctx.gpr[3] = 0xDEAD_BEEF; + ctx.gpr[4] = DATA_BASE as u64; + let mut cache = DecodeCache::new(); + + let elapsed = run_loop(&mut ctx, &mut mem, &mut cache, CODE_BASE, N, TOTAL); + report("loadstore_loop", TOTAL, elapsed); +} + +fn bench_mmio_storm() { + const CODE_BASE: u32 = 0x1000; + const MMIO_BASE: u32 = 0xEA00_0000; + const N: u32 = 64; + // MMIO is slower per access — keep total smaller so the bench stays + // under a few seconds. + const TOTAL: u64 = 2_000_000; + + let mut mem = make_mem(CODE_BASE, 0x1000); + + let writes = Arc::new(AtomicU64::new(0)); + let reads = Arc::new(AtomicU32::new(0)); + let writes_clone = writes.clone(); + let reads_clone = reads.clone(); + mem.add_mmio_region(MmioRegion { + base_address: MMIO_BASE, + mask: 0xFFFF_0000, + size: 0x0001_0000, + read_callback: Box::new(move |_a| { + reads_clone.fetch_add(1, Ordering::Relaxed); + 0 + }), + write_callback: Box::new(move |_a, _v| { + writes_clone.fetch_add(1, Ordering::Relaxed); + }), + }); + + let mut prog = Vec::with_capacity(N as usize); + for _ in 0..(N / 2) { + prog.push(enc_stw(3, 4, 0)); + prog.push(enc_lwz(5, 4, 0)); + } + write_program(&mut mem, CODE_BASE, &prog); + + let mut ctx = PpcContext::new(); + ctx.gpr[3] = 0x1234_5678; + ctx.gpr[4] = MMIO_BASE as u64; + let mut cache = DecodeCache::new(); + + let elapsed = run_loop(&mut ctx, &mut mem, &mut cache, CODE_BASE, N, TOTAL); + report("mmio_storm", TOTAL, elapsed); + + // Sanity assertions — silently catch a refactor that breaks MMIO dispatch. + let w = writes.load(Ordering::Relaxed); + let r = reads.load(Ordering::Relaxed); + assert_eq!(w, TOTAL / 2, "expected MMIO writes to be dispatched"); + assert_eq!(r as u64, TOTAL / 2, "expected MMIO reads to be dispatched"); +} + +fn main() { + println!("xenia-cpu interpreter bench"); + println!(" build: {}", if cfg!(debug_assertions) { "debug" } else { "release" }); + bench_tight_alu_loop(); + bench_loadstore_loop(); + bench_mmio_storm(); +} diff --git a/crates/xenia-cpu/src/block_cache.rs b/crates/xenia-cpu/src/block_cache.rs new file mode 100644 index 0000000..c4dd5a5 --- /dev/null +++ b/crates/xenia-cpu/src/block_cache.rs @@ -0,0 +1,423 @@ +//! Tier-4 perf — basic-block cache for the PPC interpreter. +//! +//! `DecodeCache` (in [`crate::decoder`]) caches one decoded instruction +//! per slot, indexed by PC. The hot loop still pays the per-instruction +//! cost of fetching the raw word, hashing the PC into a slot, and +//! comparing tags. For straight-line code — common in the asset/inflate +//! loops where Sylpheed boot is currently CPU-bound — the savings of +//! batching N decoded instructions per slot lookup are linear in block +//! length. +//! +//! ## Shape +//! +//! A `DecodedBlock` is a contiguous run of decoded instructions starting +//! at `start_pc`, ending at the first *block terminator* (any branch, +//! `sc`, trap, or `Invalid`) or at one of two safety limits: +//! +//! - [`MAX_BLOCK_INSTRS`] caps memory growth and re-build cost. +//! - 4 KiB page boundary stop. A block is fully contained inside a +//! single 4 KiB guest page; that means `mem.page_version(start_pc)` +//! is sufficient to detect any code-page rewrite that should +//! invalidate the block. Without this rule the cache would have to +//! walk every spanned page on every hit, which would erase the win. +//! +//! ## Invalidation +//! +//! Each block stamps the page version at build time. On lookup, if +//! `mem.page_version(start_pc)` differs from `block.page_version`, the +//! slot is rebuilt. Same mechanism `DecodeCache` uses, just at +//! block granularity. +//! +//! ## Debugger semantics +//! +//! Block dispatch is **opt-in** by the caller. The hot loop in +//! `xenia-app/src/main.rs` selects the per-instruction path whenever +//! `Debugger::wants_hooks()` is true or any `--trace-*` flag is set. +//! That's how single-step, breakpoints, in-memory trace, instruction +//! trace, and branch trace continue to observe every PC: the block +//! cache simply never runs in those modes. + +use crate::decoder::{decode, DecodedInstr}; +use xenia_memory::MemoryAccess; + +/// Direct-mapped block-cache slot count. Same shape as +/// [`crate::decoder::DECODE_CACHE_SIZE`] — 64 K slots indexed by the +/// low 16 bits of `start_pc >> 2`. With Sylpheed-class workloads the +/// slot collision rate is negligible. +const BLOCK_CACHE_SIZE: usize = 1 << 16; +const BLOCK_CACHE_MASK: u32 = (BLOCK_CACHE_SIZE - 1) as u32; + +/// Hard cap on instructions per block. Keeps the worst-case memory +/// footprint bounded and limits the rebuild cost when a code page +/// gets bumped. 32 instructions is generous for most basic blocks +/// (real-world average across Sylpheed boot is ~6 between branches). +pub const MAX_BLOCK_INSTRS: usize = 32; + +/// Guest page size — duplicated here to avoid pulling +/// `xenia-memory::heap` internals into `xenia-cpu`. Must stay in sync +/// with the memory crate. Both refer to the architectural PowerPC 4 KiB +/// page granule, so this constant is locked. +const GUEST_PAGE_SIZE: u32 = 4096; +const GUEST_PAGE_MASK: u32 = !(GUEST_PAGE_SIZE - 1); + +/// One cached basic block. Owned by [`BlockCache`]; a `&DecodedBlock` +/// is handed to the interpreter via [`BlockCache::lookup_or_build`] and +/// stays valid until the next `lookup_or_build` on the same slot. +#[derive(Debug)] +pub struct DecodedBlock { + /// Guest PC at which this block starts. Used as the slot tag. + pub start_pc: u32, + /// Guest PC immediately after the last instruction in `instrs`. + /// Equal to `instrs.last().addr + 4` whether or not the block + /// ended on a terminator. Useful for tracing / disassembly. + pub end_pc: u32, + /// `mem.page_version(start_pc)` at build time. Mismatch on lookup + /// invalidates the block. Single value because every block is + /// page-bounded by construction. + pub page_version: u64, + /// Decoded instructions in execution order. Always non-empty after + /// a successful build (`MAX_BLOCK_INSTRS >= 1` and the build walk + /// pushes the first decoded word unconditionally). + pub instrs: Vec, +} + +/// Per-slot status from a `lookup_or_build` probe. Internal only. +enum CacheStatus { + /// Block at this slot matches `pc` and the page version at build + /// time matches `mem.page_version(pc)` — return as-is. + Hit, + /// Block at this slot matched `pc` but the page version has + /// advanced — rebuild and bump `invalidations`. + Stale, + /// Slot is empty or holds a block keyed at a different `start_pc`. + /// Build a fresh block and bump `misses`. + Miss, +} + +/// Direct-mapped block cache. One instance shared across all HW slots +/// (block contents are PC-only and read-only after fill). Not +/// thread-safe — owner is the single scheduler thread, same as +/// `DecodeCache`. +pub struct BlockCache { + slots: Box<[Option>]>, + hits: u64, + misses: u64, + invalidations: u64, +} + +impl Default for BlockCache { + fn default() -> Self { + Self::new() + } +} + +impl BlockCache { + pub fn new() -> Self { + // `Option>` is a niche-optimized 8-byte slot; 64 K of + // them cost ~512 KiB of cold storage. Live blocks beyond that + // sit on the heap. + let mut v: Vec>> = Vec::with_capacity(BLOCK_CACHE_SIZE); + v.resize_with(BLOCK_CACHE_SIZE, || None); + Self { + slots: v.into_boxed_slice(), + hits: 0, + misses: 0, + invalidations: 0, + } + } + + pub fn hits(&self) -> u64 { + self.hits + } + pub fn misses(&self) -> u64 { + self.misses + } + pub fn invalidations(&self) -> u64 { + self.invalidations + } + + /// Return the cached block starting at `pc`, building it if absent + /// or stale. The returned reference is borrowed from the cache and + /// stays valid until the next `lookup_or_build` call. + pub fn lookup_or_build(&mut self, pc: u32, mem: &dyn MemoryAccess) -> &DecodedBlock { + let idx = ((pc >> 2) & BLOCK_CACHE_MASK) as usize; + let cur_pv = mem.page_version(pc); + + // Phase 1: classify the slot. Borrow ends before fill so the + // mutable update below doesn't conflict. + let status = match &self.slots[idx] { + Some(b) if b.start_pc == pc && b.page_version == cur_pv => CacheStatus::Hit, + Some(b) if b.start_pc == pc => CacheStatus::Stale, + _ => CacheStatus::Miss, + }; + + // Phase 2: fill on miss/stale, account. + match status { + CacheStatus::Hit => { + self.hits += 1; + } + CacheStatus::Stale => { + self.invalidations += 1; + self.misses += 1; + let block = build_block(pc, mem, cur_pv); + self.slots[idx] = Some(Box::new(block)); + } + CacheStatus::Miss => { + self.misses += 1; + let block = build_block(pc, mem, cur_pv); + self.slots[idx] = Some(Box::new(block)); + } + } + + // Slot is guaranteed populated at this point — Hit returned a + // pre-existing block, Miss/Stale just wrote a new one. + self.slots[idx] + .as_deref() + .expect("block freshly built or hit") + } +} + +/// Walk forward from `pc`, decoding instructions and collecting them +/// into a `DecodedBlock`. The walk stops on the first of: +/// - a [`PpcOpcode::terminates_block`] true (the terminator IS +/// included as the last instruction), +/// - reaching [`MAX_BLOCK_INSTRS`], +/// - the next PC would cross a 4 KiB guest page boundary. +fn build_block(start_pc: u32, mem: &dyn MemoryAccess, page_version: u64) -> DecodedBlock { + let mut instrs: Vec = Vec::with_capacity(8); + let page_base = start_pc & GUEST_PAGE_MASK; + let mut cur = start_pc; + + loop { + let raw = mem.read_u32(cur); + let decoded = decode(raw, cur); + let terminates = decoded.opcode.terminates_block(); + instrs.push(decoded); + + if terminates { + break; + } + if instrs.len() >= MAX_BLOCK_INSTRS { + break; + } + let next = cur.wrapping_add(4); + if (next & GUEST_PAGE_MASK) != page_base { + break; + } + cur = next; + } + + let last = instrs.last().expect("build pushes at least one instruction"); + let end_pc = last.addr.wrapping_add(4); + + DecodedBlock { + start_pc, + end_pc, + page_version, + instrs, + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::opcode::PpcOpcode; + + use std::cell::Cell; + + /// 64 KiB byte-array memory, big-endian word reads. + /// Mirrors `interpreter::tests::TestMem` but lives here so block_cache + /// tests don't depend on interpreter internals. + struct BlockTestMem { + data: Box<[Cell]>, + version_a: u64, + version_b: u64, + // Address of the page whose version is `version_b` instead of + // `version_a`. Used to model an out-of-band page-version bump in + // the invalidation test without going through write_*. + bumped_page: Cell>, + } + + impl BlockTestMem { + fn new() -> Self { + Self { + data: (0..0x10000u32).map(|_| Cell::new(0)).collect(), + version_a: 1, + version_b: 2, + bumped_page: Cell::new(None), + } + } + fn put(&self, addr: u32, raw: u32) { + let a = addr as usize; + for (i, byte) in raw.to_be_bytes().iter().enumerate() { + self.data[a + i].set(*byte); + } + } + } + + impl MemoryAccess for BlockTestMem { + fn read_u8(&self, a: u32) -> u8 { self.data[a as usize].get() } + fn read_u16(&self, a: u32) -> u16 { + let i = a as usize; + u16::from_be_bytes([self.data[i].get(), self.data[i + 1].get()]) + } + fn read_u32(&self, a: u32) -> u32 { + let i = a as usize; + u32::from_be_bytes([ + self.data[i].get(), self.data[i + 1].get(), + self.data[i + 2].get(), self.data[i + 3].get(), + ]) + } + fn read_u64(&self, a: u32) -> u64 { + let i = a as usize; + u64::from_be_bytes([ + self.data[i].get(), self.data[i + 1].get(), + self.data[i + 2].get(), self.data[i + 3].get(), + self.data[i + 4].get(), self.data[i + 5].get(), + self.data[i + 6].get(), self.data[i + 7].get(), + ]) + } + fn write_u8(&self, a: u32, v: u8) { self.data[a as usize].set(v); } + fn write_u16(&self, a: u32, v: u16) { + let i = a as usize; + let b = v.to_be_bytes(); + self.data[i].set(b[0]); + self.data[i + 1].set(b[1]); + } + fn write_u32(&self, a: u32, v: u32) { + let i = a as usize; + for (k, byte) in v.to_be_bytes().iter().enumerate() { + self.data[i + k].set(*byte); + } + } + fn write_u64(&self, a: u32, v: u64) { + let i = a as usize; + for (k, byte) in v.to_be_bytes().iter().enumerate() { + self.data[i + k].set(*byte); + } + } + fn translate(&self, _: u32) -> Option<*const u8> { None } + fn translate_mut(&self, _: u32) -> Option<*mut u8> { None } + fn page_version(&self, addr: u32) -> u64 { + if Some(addr & GUEST_PAGE_MASK) == self.bumped_page.get() { + self.version_b + } else { + self.version_a + } + } + } + + // PPC encodings — minimal subset for these tests. + fn enc_addi(rd: u32, ra: u32, simm: i16) -> u32 { + (14 << 26) | (rd << 21) | (ra << 16) | (simm as u16 as u32) + } + fn enc_b_self() -> u32 { + // b 0 — branch to self (LI=0). Opcode=18, AA=0, LK=0. + 18 << 26 + } + fn enc_unimplemented() -> u32 { + // Use opcode 0 raw = 0; decoder maps to Invalid. + 0 + } + + #[test] + fn block_built_to_terminator() { + let mem = BlockTestMem::new(); + mem.put(0x100, enc_addi(3, 3, 1)); + mem.put(0x104, enc_addi(3, 3, 1)); + mem.put(0x108, enc_addi(3, 3, 1)); + mem.put(0x10C, enc_b_self()); // terminator + let mut bc = BlockCache::new(); + let b = bc.lookup_or_build(0x100, &mem); + assert_eq!(b.start_pc, 0x100); + assert_eq!(b.instrs.len(), 4); + assert_eq!(b.instrs.last().unwrap().opcode, PpcOpcode::bx); + assert_eq!(b.end_pc, 0x110); + } + + #[test] + fn block_stops_at_page_boundary() { + // Build from 0x1FFC. The next PC (0x2000) is in a different + // 4 KiB page — block must contain only the one instruction. + let mem = BlockTestMem::new(); + mem.put(0x1FFC, enc_addi(3, 3, 1)); + mem.put(0x2000, enc_addi(3, 3, 1)); + let mut bc = BlockCache::new(); + let b = bc.lookup_or_build(0x1FFC, &mem); + assert_eq!(b.instrs.len(), 1); + assert_eq!(b.end_pc, 0x2000); + } + + #[test] + fn block_stops_at_max_len() { + // 64 consecutive non-terminator instructions on one page — + // block must clamp at MAX_BLOCK_INSTRS. + let mem = BlockTestMem::new(); + for i in 0..64u32 { + mem.put(0x100 + i * 4, enc_addi(3, 3, 1)); + } + let mut bc = BlockCache::new(); + let b = bc.lookup_or_build(0x100, &mem); + assert_eq!(b.instrs.len(), MAX_BLOCK_INSTRS); + assert_eq!(b.end_pc, 0x100 + (MAX_BLOCK_INSTRS as u32) * 4); + } + + #[test] + fn block_stops_at_invalid_opcode() { + // Decoder mapping `Invalid` is treated as a block terminator + // so the per-instruction Unimplemented path is preserved. + let mem = BlockTestMem::new(); + mem.put(0x100, enc_addi(3, 3, 1)); + mem.put(0x104, enc_unimplemented()); + mem.put(0x108, enc_addi(3, 3, 1)); + let mut bc = BlockCache::new(); + let b = bc.lookup_or_build(0x100, &mem); + assert_eq!(b.instrs.len(), 2); + assert_eq!(b.instrs.last().unwrap().opcode, PpcOpcode::Invalid); + } + + #[test] + fn block_invalidates_on_page_version_bump() { + let mem = BlockTestMem::new(); + mem.put(0x100, enc_addi(3, 3, 1)); + mem.put(0x104, enc_b_self()); + let mut bc = BlockCache::new(); + let _ = bc.lookup_or_build(0x100, &mem); + assert_eq!(bc.misses(), 1); + assert_eq!(bc.hits(), 0); + + // Same call → hit. + let _ = bc.lookup_or_build(0x100, &mem); + assert_eq!(bc.hits(), 1); + assert_eq!(bc.invalidations(), 0); + + // Bump the page version on the page containing 0x100. Next + // lookup must invalidate and rebuild. + mem.bumped_page.set(Some(0x100 & GUEST_PAGE_MASK)); + let _ = bc.lookup_or_build(0x100, &mem); + assert_eq!(bc.invalidations(), 1); + assert_eq!(bc.misses(), 2); + } + + #[test] + fn block_hit_returns_same_contents() { + // Sanity: cache hit returns a block whose contents reflect the + // ORIGINAL instruction stream, even after a non-version-bumping + // poke to the underlying bytes. (No real workload would do + // this, but it confirms we're returning cached data, not + // re-reading.) + let mem = BlockTestMem::new(); + mem.put(0x100, enc_addi(3, 3, 7)); + mem.put(0x104, enc_b_self()); + let mut bc = BlockCache::new(); + let first_simm = bc.lookup_or_build(0x100, &mem).instrs[0].simm16(); + // Rewrite without bumping version (test-only path). + let bytes = enc_addi(3, 3, 99).to_be_bytes(); + for (i, b) in bytes.iter().enumerate() { + mem.data[0x100 + i].set(*b); + } + let cached_simm = bc.lookup_or_build(0x100, &mem).instrs[0].simm16(); + assert_eq!(first_simm, 7); + assert_eq!(cached_simm, 7, "cache must serve original decoded form"); + } +} diff --git a/crates/xenia-cpu/src/context.rs b/crates/xenia-cpu/src/context.rs index b500c50..26b9733 100644 --- a/crates/xenia-cpu/src/context.rs +++ b/crates/xenia-cpu/src/context.rs @@ -29,16 +29,37 @@ pub mod spr { pub const XER: u32 = 1; pub const LR: u32 = 8; pub const CTR: u32 = 9; - pub const TBL: u32 = 268; - pub const TBU: u32 = 269; + pub const DSISR: u32 = 18; + pub const DAR: u32 = 19; + /// Decrementer (hypervisor-visible, 32-bit down-counter). + pub const DEC: u32 = 22; + pub const TBL: u32 = 268; // Read (user) + pub const TBU: u32 = 269; // Read (user) + /// Time-base write (supervisor). Separate SPR number from TBL (268) for + /// access-control reasons. + pub const TBL_WRITE: u32 = 284; + pub const TBU_WRITE: u32 = 285; pub const SPRG0: u32 = 272; pub const SPRG1: u32 = 273; pub const SPRG2: u32 = 274; pub const SPRG3: u32 = 275; + pub const VRSAVE: u32 = 256; pub const PVR: u32 = 287; + pub const HID0: u32 = 1008; + pub const HID1: u32 = 1009; pub const PIR: u32 = 1023; } +/// LR halt sentinel. When `bclr` returns to this address, the interpreter +/// loop halts cleanly (matches the "entry returned" convention). +pub const LR_HALT_SENTINEL: u64 = 0xBCBC_BCBC; + +/// VSCR NJ (Non-Java mode) bit. Stored in word 3 at bit 16 (mask 0x0001_0000). +/// Set at startup; when clear, denormals are flushed to zero following IEEE-754. +pub const VSCR_NJ_MASK: u32 = 0x0001_0000; +/// VSCR SAT (saturation sticky) bit. Stored in word 3 at bit 31 (mask 0x0000_0001). +pub const VSCR_SAT_MASK: u32 = 0x0000_0001; + /// PowerPC processor context. Holds all register state for one guest thread. /// Mirrors PPCContext from ppc_context.h, minus JIT-specific fields. #[repr(C, align(64))] @@ -64,15 +85,39 @@ pub struct PpcContext { pub xer_ca: u8, pub xer_ov: u8, pub xer_so: u8, - // Altivec VSCR saturation bit - pub vscr_sat: u8, + // Altivec VSCR. Only bits 16 (NJ) and 31 (SAT) of word 3 are meaningful. + pub vscr: Vec128, + // VRSAVE (SPR 256). Bitmask of which VRs need saving across context switches. + pub vrsave: u32, // Program counter pub pc: u32, - // Reservation address/value for lwarx/stwcx - pub reserved_addr: u32, + // Reservation for lwarx/ldarx/stwcx/stdcx. Xenon's reservation granule is + // one L2 cache line (128 bytes) — `reserved_line` is stored as the base + // address of that line (`ea & !0x7F`). `has_reservation` gates the + // validity; stwcx./stdcx. check that both match before committing. + // `reserved_val` is retained for possible future use by a coherency + // observer; the store-conditional logic itself does not compare it. + pub reserved_line: u32, pub reserved_val: u64, pub has_reservation: bool, + /// M3.7 — generation stamp returned by [`crate::ReservationTable::reserve`] + /// at the most recent `lwarx`/`ldarx`. Paired with `reserved_line`; + /// `stwcx.`/`stdcx.` pass this back to `try_commit`. Meaningful only + /// when `reservation_table` is `Some` and the table is enabled. + pub reserved_generation: u32, + /// M3.7 — optional handle to the inter-thread reservation table. + /// When `Some(table)` *and* `table.is_enabled()`, the interpreter's + /// `lwarx`/`stwcx.`/`ldarx`/`stdcx.` arms route through the table; + /// otherwise they use the legacy per-`PpcContext` fields above. The + /// scheduler populates this when it spawns a thread under a kernel + /// that has `reservations` set. + pub reservation_table: Option>, + /// M3.7 — emulated HW slot ID this thread is bound to. Used as the + /// reservation table's `hw_id` discriminator so two threads on + /// different slots can't accidentally commit each other's + /// reservations. Populated by the scheduler at spawn / migration. + pub hw_id: u8, // Thread ID (for kernel use) pub thread_id: u32, @@ -82,6 +127,12 @@ pub struct PpcContext { // Time base (incremented each instruction for debugging) pub timebase: u64, + + // Decrementer (SPR 22): 32-bit down-counter that fires an external + // interrupt at underflow on real hw. Xenia-rs doesn't dispatch DEC + // interrupts to the guest; this value is maintained so that mfspr DEC + // returns something coherent. + pub dec: u32, } impl PpcContext { @@ -89,7 +140,9 @@ impl PpcContext { Self { gpr: [0; 32], ctr: 0, - lr: 0, + // Canary sets LR to the halt sentinel at thread start so `blr` + // from the top-level entry falls out of the interpreter loop. + lr: LR_HALT_SENTINEL, msr: 0, fpr: [0.0; 32], vr: [Vec128::ZERO; 128], @@ -98,14 +151,21 @@ impl PpcContext { xer_ca: 0, xer_ov: 0, xer_so: 0, - vscr_sat: 0, + // VSCR starts with NJ bit set (denormals flushed) — matches canary + // thread_state.cc initialization. + vscr: Vec128::from_u32x4(0, 0, 0, VSCR_NJ_MASK), + vrsave: 0xFFFF_FFFF, pc: 0, - reserved_addr: 0, + reserved_line: 0, reserved_val: 0, has_reservation: false, + reserved_generation: 0, + reservation_table: None, + hw_id: 0, thread_id: 0, cycle_count: 0, timebase: 0, + dec: 0, } } @@ -182,6 +242,27 @@ impl PpcContext { self.xer_ov = ((val >> 30) & 1) as u8; self.xer_ca = ((val >> 29) & 1) as u8; } + + /// Read the VSCR SAT (sticky saturation) bit. + pub fn vscr_sat(&self) -> bool { + (self.vscr.u32x4(3) & VSCR_SAT_MASK) != 0 + } + + /// Set or clear VSCR SAT. Preserves the NJ bit (and any other word-3 bits). + pub fn set_vscr_sat(&mut self, v: bool) { + let mut w = self.vscr.u32x4(3); + if v { + w |= VSCR_SAT_MASK; + } else { + w &= !VSCR_SAT_MASK; + } + self.vscr.set_u32x4(3, w); + } + + /// Read the VSCR NJ (non-Java mode / flush-denormals) bit. + pub fn vscr_nj(&self) -> bool { + (self.vscr.u32x4(3) & VSCR_NJ_MASK) != 0 + } } impl Default for PpcContext { diff --git a/crates/xenia-cpu/src/decoder.rs b/crates/xenia-cpu/src/decoder.rs index c84ddca..136d3e3 100644 --- a/crates/xenia-cpu/src/decoder.rs +++ b/crates/xenia-cpu/src/decoder.rs @@ -77,6 +77,9 @@ impl DecodedInstr { /// OE bit (bit 21) - overflow enable #[inline] pub fn oe(&self) -> bool { extract_bits(self.raw, 21, 21) != 0 } + /// TO field (bits 6-10) for tw/twi/td/tdi trap instructions. + #[inline] pub fn to(&self) -> u32 { extract_bits(self.raw, 6, 10) } + /// MB, ME fields for rotate instructions #[inline] pub fn mb(&self) -> u32 { extract_bits(self.raw, 21, 25) } #[inline] pub fn me(&self) -> u32 { extract_bits(self.raw, 26, 30) } @@ -142,6 +145,24 @@ impl DecodedInstr { #[inline] pub fn nb(&self) -> u32 { extract_bits(self.raw, 16, 20) } } +/// Extract the 5-bit `UIMM` (`VX128_3`) / `IMM` (`VX128_4`) field. Canary +/// packs both formats with LSB-bits 16-20 holding the field, which is +/// MSB bits 11-15 in our `extract_bits` convention. For `vpkd3d128` / +/// `vupkd3d128` the decoded selector is `type = UIMM >> 2` (3 bits; valid +/// values 0-6 per [`crate::vmx::D3dPackType`], 7 is undocumented / +/// undefined in canary) and `pack = UIMM & 0x3` (output-slot layout for +/// `vpkd3d128` only, `vupkd3d128` ignores it). +/// +/// First-Pixels M3: the interpreter previously used a hand-rolled +/// `(instr.raw >> 6) & 0x7` that was **LSB-numbered** and extracted +/// bits from a completely different part of the word (the +/// secondary-opcode region). Centralizing the extractor here matches +/// canary's `FormatVX128_{3,4}::{UIMM,IMM}` field semantics exactly. +#[inline] +pub fn extract_vx128_uimm5(raw: u32) -> u32 { + extract_bits(raw, 11, 15) +} + /// Decode a 32-bit PPC instruction into its opcode. /// Direct translation of the C++ LookupOpcode from ppc_opcode_lookup_gen.cc. pub fn decode(raw: u32, addr: u32) -> DecodedInstr { @@ -149,6 +170,123 @@ pub fn decode(raw: u32, addr: u32) -> DecodedInstr { DecodedInstr { opcode, raw, addr } } +// Perf tier-2 — direct-mapped PC-keyed decode cache. +// +// The interpreter hot path spends ~15-25% of its time in `decode()` +// parsing the raw u32 and walking the primary+secondary opcode tables. +// For non-self-modifying guest code — the common case past the XEX +// loader — `decode(raw, pc)` is purely a function of `(raw, pc)` and +// the output is `Copy + 16B`. A direct-mapped cache indexed by +// `(pc >> 2) & MASK` gives the interpreter a 1-comparison fast path, +// at the cost of one branch and a 1.5 MiB region of memory. +// +// Invalidation piggybacks on `xenia_memory::GuestMemory::page_version` +// (P5 texture-cache invalidation): every cache entry carries the page +// version that was active at decode time; on lookup we compare against +// the current version of the containing 4 KiB page. Any write to the +// page bumps the counter, so the next decode on that PC is a miss that +// refills. + +/// Number of direct-mapped entries. 2^16 = 65,536 slots, one PPC +/// instruction address per slot — enough for every hot code path in a +/// typical Xbox 360 title to stay resident without collision. +const DECODE_CACHE_SIZE: usize = 1 << 16; +const DECODE_CACHE_MASK: u32 = (DECODE_CACHE_SIZE - 1) as u32; + +#[derive(Clone, Copy)] +struct DecodeCacheEntry { + /// Guest PC this entry was decoded at. Used as the tag on lookup; a + /// mismatch means the slot was last populated by a different PC that + /// shares the same low-16 index. + pc: u32, + /// Page version at decode time (from `GuestMemory::page_version(pc)`). + /// Zero means "unused slot" since real page versions start at 1. + page_version: u64, + decoded: DecodedInstr, +} + +impl DecodeCacheEntry { + const fn empty() -> Self { + // `Invalid` is the decoder's "unrecognized opcode" sentinel; we + // use it here as the empty-slot marker. Real misses compare `pc`, + // not the opcode, so the sentinel choice is cosmetic. + Self { + pc: 0, + page_version: 0, + decoded: DecodedInstr { + opcode: PpcOpcode::Invalid, + raw: 0, + addr: 0, + }, + } + } +} + +/// Direct-mapped PC-keyed decode cache. One instance shared across all +/// HW threads (PC is thread-independent; entries are read-only once +/// filled). Not thread-safe — the single scheduler thread owns it. +pub struct DecodeCache { + slots: Box<[DecodeCacheEntry]>, + hits: u64, + misses: u64, + invalidations: u64, +} + +impl Default for DecodeCache { + fn default() -> Self { + Self::new() + } +} + +impl DecodeCache { + pub fn new() -> Self { + Self { + slots: vec![DecodeCacheEntry::empty(); DECODE_CACHE_SIZE].into_boxed_slice(), + hits: 0, + misses: 0, + invalidations: 0, + } + } + + /// Look up (or fill) the decoded form of the instruction at `pc`. + /// `raw` is the fetched instruction word; `current_page_version` is + /// `mem.page_version(pc)` — the caller has it cheaper than we do, + /// since they're already touching `mem` to fetch `raw`. + #[inline] + pub fn lookup(&mut self, pc: u32, raw: u32, current_page_version: u64) -> DecodedInstr { + let idx = ((pc >> 2) & DECODE_CACHE_MASK) as usize; + // Safety: `idx` is masked into `[0, DECODE_CACHE_SIZE)` so the + // slice access is always in-bounds. Opt-out of the bounds check + // for the hot path. + let entry = unsafe { self.slots.get_unchecked_mut(idx) }; + if entry.pc == pc && entry.page_version == current_page_version { + self.hits += 1; + return entry.decoded; + } + if entry.pc == pc && entry.page_version != current_page_version { + self.invalidations += 1; + } + self.misses += 1; + let decoded = decode(raw, pc); + *entry = DecodeCacheEntry { + pc, + page_version: current_page_version, + decoded, + }; + decoded + } + + pub fn hits(&self) -> u64 { + self.hits + } + pub fn misses(&self) -> u64 { + self.misses + } + pub fn invalidations(&self) -> u64 { + self.invalidations + } +} + fn lookup_opcode(code: u32) -> PpcOpcode { match extract_bits(code, 0, 5) { 2 => PpcOpcode::tdi, @@ -781,6 +919,57 @@ mod tests { assert_eq!(instr.d(), 0x20); } + #[test] + fn decode_cache_miss_fills_then_hit() { + let mut cache = DecodeCache::new(); + let raw: u32 = (14 << 26) | (3 << 21) | (1 << 16) | 0x10; + let pc = 0x8200_0000u32; + let first = cache.lookup(pc, raw, 1); + assert_eq!(first.opcode, PpcOpcode::addi); + assert_eq!(cache.hits(), 0); + assert_eq!(cache.misses(), 1); + // Same pc, same version → cache hit, no new decode. + let second = cache.lookup(pc, raw, 1); + assert_eq!(second.opcode, PpcOpcode::addi); + assert_eq!(cache.hits(), 1); + assert_eq!(cache.misses(), 1); + } + + #[test] + fn decode_cache_stale_version_refills() { + let mut cache = DecodeCache::new(); + // First fill with an `addi`. + let raw_addi: u32 = (14 << 26) | (3 << 21) | (1 << 16) | 0x10; + let pc = 0x8200_0000u32; + cache.lookup(pc, raw_addi, 1); + // Guest rewrote the page: same pc, different raw + bumped version. + // Cache must refill — not return the stale `addi`. + let raw_lwz: u32 = (32 << 26) | (5 << 21) | (1 << 16) | 0x20; + let refreshed = cache.lookup(pc, raw_lwz, 2); + assert_eq!(refreshed.opcode, PpcOpcode::lwz); + assert_eq!(cache.invalidations(), 1); + assert_eq!(cache.misses(), 2); + } + + #[test] + fn decode_cache_pc_collision_refills() { + // Two PCs that hash to the same slot (pc >> 2 low 16 bits equal) + // must not alias. Slot index = ((pc >> 2) & 0xFFFF) — pick two + // PCs 4 * 2^16 bytes apart. + let mut cache = DecodeCache::new(); + let pc_a = 0x8200_0000u32; + let pc_b = pc_a.wrapping_add(0x0004_0000u32); // (>> 2) differs by 2^16 + let raw_addi: u32 = (14 << 26) | (3 << 21) | (1 << 16) | 0x10; + let raw_lwz: u32 = (32 << 26) | (5 << 21) | (1 << 16) | 0x20; + cache.lookup(pc_a, raw_addi, 1); + // Different pc but same slot → miss + refill. + cache.lookup(pc_b, raw_lwz, 1); + // First pc comes back → miss + refill (slot was taken by pc_b). + let back = cache.lookup(pc_a, raw_addi, 1); + assert_eq!(back.opcode, PpcOpcode::addi); + assert_eq!(cache.misses(), 3); + } + #[test] fn test_decode_branch() { // b +0x100 => opcode 18, LI=0x40 (shifted left 2 = 0x100), AA=0, LK=0 @@ -816,4 +1005,103 @@ mod tests { assert_eq!(extract_bits(0x8000_0000, 0, 0), 1); assert_eq!(extract_bits(0x0000_0001, 31, 31), 1); } + + // VMX128 register-name extraction. Locks the canonical bit positions + // (decoder.rs is the single source of truth — the analysis crate's + // old `ppc.rs` had different positions, which produced wrong printed + // register names; the bug was silent because the interpreter never + // used those extractors). Each test poke-bits exactly the slots the + // accessor reads and asserts the assembled register number. + + fn vmx128_test_word(va21: u32, vd6_10: u32, vd21: u32, vd22: u32, + vb16_20: u32, vb28: u32, vb30: u32) -> u32 { + // PPC bit i -> LSB position 31-i. + (vd6_10 << (31 - 10)) + | (va21 << (31 - 21)) // va128 high bit at PPC 29 in some forms — kept 0 here + | (vd21 << (31 - 21)) + | (vd22 << (31 - 22)) + | (vb16_20 << (31 - 20)) + | (vb28 << (31 - 28)) + | (vb30 << (31 - 30)) + } + + #[test] + fn vmx128_vd128_low_5_bits_only() { + // vd_lo = 0..31, vd_b21 = 0, vd_b22 = 0 → vd128 = vd_lo + for r in 0..32u32 { + let raw = (r as u32) << (31 - 10); + let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 }; + assert_eq!(d.vd128(), r as usize, "vd_lo={r}"); + } + } + + #[test] + fn vmx128_vd128_bit21_adds_32() { + // vd_lo = 0, vd_b21 = 1, vd_b22 = 0 → vd128 = 32 + let raw = (0u32 << (31 - 10)) | (1u32 << (31 - 21)); + let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 }; + assert_eq!(d.vd128(), 32); + } + + #[test] + fn vmx128_vd128_bit22_adds_64() { + // vd_lo = 0, vd_b21 = 0, vd_b22 = 1 → vd128 = 64 + let raw = (0u32 << (31 - 10)) | (1u32 << (31 - 22)); + let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 }; + assert_eq!(d.vd128(), 64); + } + + #[test] + fn vmx128_vd128_full_127() { + // vd_lo = 31, vd_b21 = 1, vd_b22 = 1 → vd128 = 127 + let raw = (31u32 << (31 - 10)) + | (1u32 << (31 - 21)) + | (1u32 << (31 - 22)); + let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 }; + assert_eq!(d.vd128(), 127); + } + + #[test] + fn vmx128_va128_uses_bit29() { + // va128 = bits 6-10 + bit 29. va_lo = 7, bit 29 = 1 → va128 = 7 | 32 = 39. + let raw = (7u32 << (31 - 10)) | (1u32 << (31 - 29)); + let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 }; + assert_eq!(d.va128(), 39); + } + + #[test] + fn vmx128_vb128_uses_bits28_and_30() { + // vb128 = bits 16-20 + bit 28 + bit 30. Low 5 = 5, bit 28 = 1 → +32, bit 30 = 1 → +64. + let raw = (5u32 << (31 - 20)) + | (1u32 << (31 - 28)) + | (1u32 << (31 - 30)); + let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 }; + assert_eq!(d.vb128(), 5 | 32 | 64); + } + + #[test] + fn vmx128_vs128_aliases_vd128() { + // vs128 must always equal vd128. + for r in [0u32, 31, 32, 64, 96, 127] { + let lo = r & 0x1F; + let b21 = (r >> 5) & 1; + let b22 = (r >> 6) & 1; + let raw = (lo << (31 - 10)) + | (b21 << (31 - 21)) + | (b22 << (31 - 22)); + let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 }; + assert_eq!(d.vd128(), r as usize, "vd128 mismatch for r={r}"); + assert_eq!(d.vs128(), r as usize, "vs128 mismatch for r={r}"); + assert_eq!(d.vd128(), d.vs128()); + } + } + + #[test] + #[allow(dead_code)] + fn _vmx128_test_word_helper_compiles() { + // Keep the helper validated against the real accessor. + let raw = vmx128_test_word(0, 5, 1, 1, 0, 0, 0); + let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 }; + assert_eq!(d.vd128(), 5 | 32 | 64); + } } diff --git a/crates/xenia-cpu/src/disasm.rs b/crates/xenia-cpu/src/disasm.rs index e4ee2ca..6edbce1 100644 --- a/crates/xenia-cpu/src/disasm.rs +++ b/crates/xenia-cpu/src/disasm.rs @@ -1,233 +1,627 @@ -use crate::decoder::DecodedInstr; +//! PowerPC (Xbox 360 Xenon) text disassembler. +//! +//! Single source of truth for assembly text formatting. Sits on top of the +//! canonical decoder in [`crate::decoder`] and consumes [`DecodedInstr`] +//! (8-byte `Copy`, no allocations) so the interpreter's decode cache stays +//! lean — formatting allocates, but only when a sink calls [`format`]. +//! +//! [`format`] returns a [`DisasmText`] carrying both base and extended +//! (simplified) mnemonic forms. Callers (text printer, JSON sink, DuckDB +//! row writer) consume the fields directly instead of re-parsing. + +use crate::decoder::{DecodedInstr, extract_vx128_uimm5}; use crate::opcode::PpcOpcode; -use std::fmt::Write; + +/// Formatted disassembly of a single instruction. +/// +/// Owns its strings. `mnemonic`/`operands` are the structured base form +/// (e.g. `"addi"`, `"r3, r1, 16"`); `disasm` is the legacy padded display +/// form (e.g. `"addi r3, r1, 16"`). The `ext_*` triple is `Some` when +/// a simplified/extended mnemonic applies (e.g. `addi r3,0,imm` → +/// `li r3, imm`). `branch_target` is the resolved absolute target for +/// direct branches (`b`/`bl`/`bc`/`bcl`); `None` for indirect branches +/// and non-branches. +#[derive(Debug, Clone)] +pub struct DisasmText { + pub mnemonic: String, + pub operands: String, + pub disasm: String, + pub ext_mnemonic: Option, + pub ext_operands: Option, + pub ext_disasm: Option, + pub branch_target: Option, +} + +impl DisasmText { + /// Preferred display form: extended if present, else base. + #[inline] + pub fn display(&self) -> &str { + self.ext_disasm.as_deref().unwrap_or(&self.disasm) + } +} + +// ── Internal builders ─────────────────────────────────────────────────────── + +#[inline] +fn pad_into(mnem: &str, operands: &str, width: usize) -> String { + if width <= mnem.len() + 1 { + // No padding fits — fall back to single-space join. + if operands.is_empty() { mnem.to_string() } + else { format!("{mnem} {operands}") } + } else { + format!("{: DisasmText { + let disasm = pad_into(mnem, &operands, pad); + DisasmText { + mnemonic: mnem.to_string(), + operands, + disasm, + ext_mnemonic: None, + ext_operands: None, + ext_disasm: None, + branch_target: None, + } +} + +fn with_ext( + base_mnem: &str, base_ops: String, base_pad: usize, + ext_mnem: &str, ext_ops: String, ext_pad: usize, +) -> DisasmText { + let disasm = pad_into(base_mnem, &base_ops, base_pad); + let ext_disasm = pad_into(ext_mnem, &ext_ops, ext_pad); + DisasmText { + mnemonic: base_mnem.to_string(), + operands: base_ops, + disasm, + ext_mnemonic: Some(ext_mnem.to_string()), + ext_operands: Some(ext_ops), + ext_disasm: Some(ext_disasm), + branch_target: None, + } +} + +fn with_target(mut t: DisasmText, target: u32) -> DisasmText { + t.branch_target = Some(target); + t +} + +fn long_word(raw: u32) -> DisasmText { + let operands = format!("0x{raw:08X}"); + base(".long", operands, 8) +} + +// ── Helpers (register names, sign extension, condition decoding) ──────────── + +#[inline] fn gpr(r: usize) -> String { format!("r{r}") } +#[inline] fn fpr(r: usize) -> String { format!("f{r}") } +#[inline] fn vr(r: usize) -> String { format!("v{r}") } + +fn crb(b: u32) -> String { + let cr = b / 4; + let bit = b % 4; + let bit_name = ["lt", "gt", "eq", "so"][bit as usize]; + if cr == 0 { bit_name.to_string() } else { format!("4*cr{cr}+{bit_name}") } +} + +fn spr_name(spr: u32) -> String { + match spr { + 1 => "XER".into(), + 8 => "LR".into(), + 9 => "CTR".into(), + _ => format!("spr{spr}"), + } +} + +#[inline] fn sign_ext(val: u32, bits: u32) -> i32 { + let shift = 32 - bits; + ((val << shift) as i32) >> shift +} + +/// Map trap TO field to condition suffix (e.g. 16 → "lt", 4 → "eq"). +/// Unsigned variants (`lgt`/`llt`/`lge`/`lle`) cover bits 1-3 of the TO +/// encoding which `tw`/`td` use for logical-compare conditions. +fn trap_cond(to: u32) -> Option<&'static str> { + match to { + 1 => Some("lgt"), + 2 => Some("llt"), + 4 => Some("eq"), + 5 => Some("lge"), + 6 => Some("lle"), + 8 => Some("gt"), + 12 => Some("ge"), + 16 => Some("lt"), + 20 => Some("le"), + 24 => Some("ne"), + 31 => Some(""), // unconditional + _ => None, + } +} + +/// For non-decrementing conditional branches: returns Some((cond_name, cr_prefix)) +/// where cr_prefix is e.g. "" or "cr2, ". +fn cond_branch_ext(bo: u32, bi: u32) -> Option<(&'static str, String)> { + let cond_true = bo & 0x08 != 0; + let no_cond = bo & 0x10 != 0; + let decr = bo & 0x04 == 0; + if no_cond || decr { return None; } + + let cr_field = bi / 4; + let cr_bit = bi % 4; + let cond_name = match (cr_bit, cond_true) { + (0, true) => "lt", (0, false) => "ge", + (1, true) => "gt", (1, false) => "le", + (2, true) => "eq", (2, false) => "ne", + (3, true) => "so", (3, false) => "ns", + _ => return None, + }; + let cr = if cr_field == 0 { String::new() } else { format!("cr{cr_field}, ") }; + Some((cond_name, cr)) +} + +#[inline] fn rc_dot(instr: &DecodedInstr) -> &'static str { + if instr.rc_bit() { "." } else { "" } +} + +// ── Public entrypoints ────────────────────────────────────────────────────── + +/// Format a decoded instruction into structured disassembly text. +pub fn format(instr: &DecodedInstr) -> DisasmText { + match instr.opcode { + // ── Branch ────────────────────────────────────────────────────────── + PpcOpcode::bx => fmt_b(instr), + PpcOpcode::bcx => fmt_bc(instr), + PpcOpcode::bclrx => fmt_bclr(instr), + PpcOpcode::bcctrx => fmt_bcctr(instr), + PpcOpcode::sc => base("sc", String::new(), 0), + + // ── Trap ──────────────────────────────────────────────────────────── + PpcOpcode::tdi => fmt_trap_imm(instr, "tdi", "td"), + PpcOpcode::twi => fmt_trap_imm(instr, "twi", "tw"), + PpcOpcode::td => fmt_trap_reg(instr, "td"), + PpcOpcode::tw => fmt_trap_reg(instr, "tw"), + + // ── D-form ALU/logical ────────────────────────────────────────────── + PpcOpcode::addi => fmt_addi(instr), + PpcOpcode::addis => fmt_addis(instr), + PpcOpcode::addic => fmt_d_add(instr, "addic"), + PpcOpcode::addicx => fmt_d_add(instr, "addic."), + PpcOpcode::subficx => fmt_d_imm_simple(instr, "subfic"), + PpcOpcode::mulli => fmt_d_imm_simple(instr, "mulli"), + PpcOpcode::cmpi => fmt_cmp_imm(instr, "cmpi", true), + PpcOpcode::cmpli => fmt_cmp_imm(instr, "cmpli", false), + PpcOpcode::ori => fmt_ori(instr), + PpcOpcode::oris => fmt_d_logic(instr, "oris"), + PpcOpcode::xori => fmt_d_logic(instr, "xori"), + PpcOpcode::xoris => fmt_d_logic(instr, "xoris"), + PpcOpcode::andix => fmt_d_logic(instr, "andi."), + PpcOpcode::andisx => fmt_d_logic(instr, "andis."), + + // ── D-form load/store ─────────────────────────────────────────────── + PpcOpcode::lwz => fmt_ld(instr, "lwz", false), + PpcOpcode::lwzu => fmt_ld(instr, "lwzu", false), + PpcOpcode::lbz => fmt_ld(instr, "lbz", false), + PpcOpcode::lbzu => fmt_ld(instr, "lbzu", false), + PpcOpcode::lhz => fmt_ld(instr, "lhz", false), + PpcOpcode::lhzu => fmt_ld(instr, "lhzu", false), + PpcOpcode::lha => fmt_ld(instr, "lha", false), + PpcOpcode::lhau => fmt_ld(instr, "lhau", false), + PpcOpcode::lmw => fmt_ld(instr, "lmw", false), + PpcOpcode::lfs => fmt_ld(instr, "lfs", true), + PpcOpcode::lfsu => fmt_ld(instr, "lfsu", true), + PpcOpcode::lfd => fmt_ld(instr, "lfd", true), + PpcOpcode::lfdu => fmt_ld(instr, "lfdu", true), + PpcOpcode::stw => fmt_st(instr, "stw", false), + PpcOpcode::stwu => fmt_st(instr, "stwu", false), + PpcOpcode::stb => fmt_st(instr, "stb", false), + PpcOpcode::stbu => fmt_st(instr, "stbu", false), + PpcOpcode::sth => fmt_st(instr, "sth", false), + PpcOpcode::sthu => fmt_st(instr, "sthu", false), + PpcOpcode::stmw => fmt_st(instr, "stmw", false), + PpcOpcode::stfs => fmt_st(instr, "stfs", true), + PpcOpcode::stfsu => fmt_st(instr, "stfsu", true), + PpcOpcode::stfd => fmt_st(instr, "stfd", true), + PpcOpcode::stfdu => fmt_st(instr, "stfdu", true), + + // ── DS-form load/store ────────────────────────────────────────────── + PpcOpcode::ld => fmt_ds(instr, "ld"), + PpcOpcode::ldu => fmt_ds(instr, "ldu"), + PpcOpcode::lwa => fmt_ds(instr, "lwa"), + PpcOpcode::std => fmt_ds(instr, "std"), + PpcOpcode::stdu => fmt_ds(instr, "stdu"), + + // ── Rotate ───────────────────────────────────────────────────────── + PpcOpcode::rlwimix => fmt_rlwimi(instr), + PpcOpcode::rlwinmx => fmt_rlwinm(instr), + PpcOpcode::rlwnmx => fmt_rlwnm(instr), + PpcOpcode::rldiclx => fmt_rldicl(instr), + PpcOpcode::rldicrx => fmt_rldicr(instr), + PpcOpcode::rldicx => fmt_rldic(instr), + PpcOpcode::rldimix => fmt_rldimi(instr), + PpcOpcode::rldclx => fmt_rldcl(instr), + PpcOpcode::rldcrx => fmt_rldcr(instr), + + // ── Compare (X-form) ─────────────────────────────────────────────── + PpcOpcode::cmp => fmt_cmp_reg(instr, "cmp"), + PpcOpcode::cmpl => fmt_cmp_reg(instr, "cmpl"), + + // ── X-form ALU (3-register) with OE/Rc ───────────────────────────── + PpcOpcode::addx => fmt_xo_3op(instr, "add"), + PpcOpcode::addcx => fmt_xo_3op(instr, "addc"), + PpcOpcode::addex => fmt_xo_3op(instr, "adde"), + PpcOpcode::addmex => fmt_xo_2op(instr, "addme"), + PpcOpcode::addzex => fmt_xo_2op(instr, "addze"), + PpcOpcode::subfx => fmt_subf(instr, "subf", "sub"), + PpcOpcode::subfcx => fmt_subf(instr, "subfc", "subc"), + PpcOpcode::subfex => fmt_xo_3op(instr, "subfe"), + PpcOpcode::subfmex => fmt_xo_2op(instr, "subfme"), + PpcOpcode::subfzex => fmt_xo_2op(instr, "subfze"), + PpcOpcode::negx => fmt_xo_2op(instr, "neg"), + PpcOpcode::mullwx => fmt_xo_3op(instr, "mullw"), + PpcOpcode::mulhwx => fmt_xo_3op_no_oe(instr, "mulhw"), + PpcOpcode::mulhwux => fmt_xo_3op_rc_only(instr, "mulhwu"), + PpcOpcode::divwx => fmt_xo_3op(instr, "divw"), + PpcOpcode::divwux => fmt_xo_3op(instr, "divwu"), + PpcOpcode::mulldx => fmt_xo_3op(instr, "mulld"), + PpcOpcode::mulhdx => fmt_xo_3op_rc_only(instr, "mulhd"), + PpcOpcode::mulhdux => fmt_xo_3op_rc_only(instr, "mulhdu"), + PpcOpcode::divdx => fmt_xo_3op(instr, "divd"), + PpcOpcode::divdux => fmt_xo_3op(instr, "divdu"), + + // ── X-form logical (Rc) ──────────────────────────────────────────── + PpcOpcode::andx => fmt_logic_and(instr), + PpcOpcode::andcx => fmt_x_logic(instr, "andc"), + PpcOpcode::orx => fmt_logic_or(instr), + PpcOpcode::orcx => fmt_x_logic(instr, "orc"), + PpcOpcode::xorx => fmt_x_logic(instr, "xor"), + PpcOpcode::norx => fmt_logic_nor(instr), + PpcOpcode::nandx => fmt_x_logic(instr, "nand"), + PpcOpcode::eqvx => fmt_x_logic(instr, "eqv"), + PpcOpcode::extsbx => fmt_x_unary_rc(instr, "extsb"), + PpcOpcode::extshx => fmt_x_unary_rc(instr, "extsh"), + PpcOpcode::extswx => fmt_x_unary_rc(instr, "extsw"), + PpcOpcode::cntlzwx => fmt_x_unary_rc(instr, "cntlzw"), + PpcOpcode::cntlzdx => fmt_x_unary_rc(instr, "cntlzd"), + + // ── Shift (32 / 64) ───────────────────────────────────────────────── + PpcOpcode::slwx => fmt_x_logic(instr, "slw"), + PpcOpcode::srwx => fmt_x_logic(instr, "srw"), + PpcOpcode::srawx => fmt_x_logic(instr, "sraw"), + PpcOpcode::sldx => fmt_x_logic(instr, "sld"), + PpcOpcode::srdx => fmt_x_logic(instr, "srd"), + PpcOpcode::sradx => fmt_x_logic(instr, "srad"), + PpcOpcode::srawix => fmt_srawi(instr), + PpcOpcode::sradix => fmt_sradi(instr), + + // ── Special register moves ───────────────────────────────────────── + PpcOpcode::mfspr => fmt_mfspr(instr), + PpcOpcode::mtspr => fmt_mtspr(instr), + PpcOpcode::mfcr => base("mfcr", gpr(instr.rd()), 8), + PpcOpcode::mtcrf => fmt_mtcrf(instr), + PpcOpcode::mfmsr => base("mfmsr", gpr(instr.rd()), 8), + PpcOpcode::mtmsr => base("mtmsr", gpr(instr.rs()), 8), + PpcOpcode::mtmsrd => base("mtmsrd", gpr(instr.rs()), 8), + PpcOpcode::mftb => fmt_mftb(instr), + PpcOpcode::mcrxr => base("mcrxr", format!("cr{}", instr.crfd()), 8), + PpcOpcode::mcrf => base("mcrf", format!("cr{}, cr{}", instr.crfd(), instr.crfs()), 8), + + // ── X-form indexed load/store ────────────────────────────────────── + PpcOpcode::lwzx => fmt_x_load(instr, "lwzx", false), + PpcOpcode::lwzux => fmt_x_load(instr, "lwzux", false), + PpcOpcode::lbzx => fmt_x_load(instr, "lbzx", false), + PpcOpcode::lbzux => fmt_x_load(instr, "lbzux", false), + PpcOpcode::lhzx => fmt_x_load(instr, "lhzx", false), + PpcOpcode::lhzux => fmt_x_load(instr, "lhzux", false), + PpcOpcode::lhax => fmt_x_load(instr, "lhax", false), + PpcOpcode::lhaux => fmt_x_load(instr, "lhaux", false), + PpcOpcode::lwax => fmt_x_load(instr, "lwax", false), + PpcOpcode::lwaux => fmt_x_load(instr, "lwaux", false), + PpcOpcode::ldx => fmt_x_load(instr, "ldx", false), + PpcOpcode::ldux => fmt_x_load(instr, "ldux", false), + PpcOpcode::lwbrx => fmt_x_load(instr, "lwbrx", false), + PpcOpcode::lhbrx => fmt_x_load(instr, "lhbrx", false), + PpcOpcode::ldbrx => fmt_x_load(instr, "ldbrx", false), + PpcOpcode::lwarx => fmt_x_load(instr, "lwarx", false), + PpcOpcode::ldarx => fmt_x_load(instr, "ldarx", false), + PpcOpcode::lswx => fmt_x_load(instr, "lswx", false), + PpcOpcode::lswi => fmt_lswi_stswi(instr, "lswi"), + PpcOpcode::lfsx => fmt_x_load(instr, "lfsx", true), + PpcOpcode::lfsux => fmt_x_load(instr, "lfsux", true), + PpcOpcode::lfdx => fmt_x_load(instr, "lfdx", true), + PpcOpcode::lfdux => fmt_x_load(instr, "lfdux", true), + PpcOpcode::stwx => fmt_x_store(instr, "stwx", false), + PpcOpcode::stwux => fmt_x_store(instr, "stwux", false), + PpcOpcode::stbx => fmt_x_store(instr, "stbx", false), + PpcOpcode::stbux => fmt_x_store(instr, "stbux", false), + PpcOpcode::sthx => fmt_x_store(instr, "sthx", false), + PpcOpcode::sthux => fmt_x_store(instr, "sthux", false), + PpcOpcode::stdx => fmt_x_store(instr, "stdx", false), + PpcOpcode::stdux => fmt_x_store(instr, "stdux", false), + PpcOpcode::stwbrx => fmt_x_store(instr, "stwbrx", false), + PpcOpcode::sthbrx => fmt_x_store(instr, "sthbrx", false), + PpcOpcode::stdbrx => fmt_x_store(instr, "stdbrx", false), + PpcOpcode::stwcx => fmt_x_store(instr, "stwcx.", false), + PpcOpcode::stdcx => fmt_x_store(instr, "stdcx.", false), + PpcOpcode::stswx => fmt_x_store(instr, "stswx", false), + PpcOpcode::stswi => fmt_lswi_stswi(instr, "stswi"), + PpcOpcode::stfsx => fmt_x_store(instr, "stfsx", true), + PpcOpcode::stfsux => fmt_x_store(instr, "stfsux", true), + PpcOpcode::stfdx => fmt_x_store(instr, "stfdx", true), + PpcOpcode::stfdux => fmt_x_store(instr, "stfdux", true), + PpcOpcode::stfiwx => fmt_x_store(instr, "stfiwx", true), + + // ── Cache / sync ──────────────────────────────────────────────────── + PpcOpcode::dcbf => fmt_cache(instr, "dcbf"), + PpcOpcode::dcbi => fmt_cache(instr, "dcbi"), + PpcOpcode::dcbst => fmt_cache(instr, "dcbst"), + PpcOpcode::dcbt => fmt_cache(instr, "dcbt"), + PpcOpcode::dcbtst => fmt_cache(instr, "dcbtst"), + PpcOpcode::dcbz => fmt_cache(instr, "dcbz"), + PpcOpcode::dcbz128 => fmt_cache(instr, "dcbz128"), + PpcOpcode::icbi => fmt_cache(instr, "icbi"), + PpcOpcode::sync => base("sync", String::new(), 0), + PpcOpcode::eieio => base("eieio", String::new(), 0), + PpcOpcode::isync => base("isync", String::new(), 0), + + // ── CR logical ────────────────────────────────────────────────────── + PpcOpcode::crand => fmt_cr_logic(instr, "crand"), + PpcOpcode::crandc => fmt_cr_logic(instr, "crandc"), + PpcOpcode::creqv => fmt_creqv(instr), + PpcOpcode::crnand => fmt_cr_logic(instr, "crnand"), + PpcOpcode::crnor => fmt_crnor(instr), + PpcOpcode::cror => fmt_cror(instr), + PpcOpcode::crorc => fmt_cr_logic(instr, "crorc"), + PpcOpcode::crxor => fmt_crxor(instr), + + // ── FPU (op59 / op63) ────────────────────────────────────────────── + PpcOpcode::fdivsx => fmt_a_3op(instr, "fdivs", false), + PpcOpcode::fsubsx => fmt_a_3op(instr, "fsubs", false), + PpcOpcode::faddsx => fmt_a_3op(instr, "fadds", false), + PpcOpcode::fsqrtsx => fmt_a_unary(instr, "fsqrts"), + PpcOpcode::fresx => fmt_a_unary(instr, "fres"), + PpcOpcode::fmulsx => fmt_a_3op(instr, "fmuls", true), + PpcOpcode::fmsubsx => fmt_a_4op(instr, "fmsubs"), + PpcOpcode::fmaddsx => fmt_a_4op(instr, "fmadds"), + PpcOpcode::fnmsubsx => fmt_a_4op(instr, "fnmsubs"), + PpcOpcode::fnmaddsx => fmt_a_4op(instr, "fnmadds"), + + PpcOpcode::fdivx => fmt_a_3op(instr, "fdiv", false), + PpcOpcode::fsubx => fmt_a_3op(instr, "fsub", false), + PpcOpcode::faddx => fmt_a_3op(instr, "fadd", false), + PpcOpcode::fsqrtx => fmt_a_unary(instr, "fsqrt"), + PpcOpcode::fselx => fmt_a_4op(instr, "fsel"), + PpcOpcode::fmulx => fmt_a_3op(instr, "fmul", true), + PpcOpcode::frsqrtex => fmt_a_unary(instr, "frsqrte"), + PpcOpcode::fmsubx => fmt_a_4op(instr, "fmsub"), + PpcOpcode::fmaddx => fmt_a_4op(instr, "fmadd"), + PpcOpcode::fnmsubx => fmt_a_4op(instr, "fnmsub"), + PpcOpcode::fnmaddx => fmt_a_4op(instr, "fnmadd"), + + PpcOpcode::fcmpu => fmt_fcmp(instr, "fcmpu"), + PpcOpcode::fcmpo => fmt_fcmp(instr, "fcmpo"), + PpcOpcode::frspx => fmt_x_fpu_unary(instr, "frsp"), + PpcOpcode::fctiwx => fmt_x_fpu_unary(instr, "fctiw"), + PpcOpcode::fctiwzx => fmt_x_fpu_unary(instr, "fctiwz"), + PpcOpcode::fnegx => fmt_x_fpu_unary(instr, "fneg"), + PpcOpcode::fmrx => fmt_x_fpu_unary(instr, "fmr"), + PpcOpcode::fnabsx => fmt_x_fpu_unary(instr, "fnabs"), + PpcOpcode::fabsx => fmt_x_fpu_unary(instr, "fabs"), + PpcOpcode::fctidx => fmt_x_fpu_unary(instr, "fctid"), + PpcOpcode::fctidzx => fmt_x_fpu_unary(instr, "fctidz"), + PpcOpcode::fcfidx => fmt_x_fpu_unary(instr, "fcfid"), + PpcOpcode::mffsx => { + let rc = rc_dot(instr); + base(&format!("mffs{rc}"), fpr(instr.rd()), 8) + } + PpcOpcode::mtfsfx => { + let rc = rc_dot(instr); + let fxm = (instr.raw >> 17) & 0xFF; + let frb = (instr.raw >> 11) & 0x1F; + base(&format!("mtfsf{rc}"), format!("0x{fxm:02X}, {}", fpr(frb as usize)), 8) + } + PpcOpcode::mtfsb1x => fmt_mtfsb(instr, "mtfsb1"), + PpcOpcode::mtfsb0x => fmt_mtfsb(instr, "mtfsb0"), + PpcOpcode::mtfsfix => { + let rc = rc_dot(instr); + let bf = instr.crfd(); + let imm = (instr.raw >> 12) & 0xF; + base(&format!("mtfsfi{rc}"), format!("cr{bf}, {imm}"), 8) + } + PpcOpcode::mcrfs => base("mcrfs", format!("cr{}, cr{}", instr.crfd(), instr.crfs()), 8), + + // ── Standard VMX (5-bit registers) ──────────────────────────────── + // 3-operand VD, VA, VB + PpcOpcode::vaddubm | PpcOpcode::vmaxub | PpcOpcode::vrlb | PpcOpcode::vmuloub | + PpcOpcode::vaddfp | PpcOpcode::vmrghb | PpcOpcode::vpkuhum | + PpcOpcode::vadduhm | PpcOpcode::vmaxuh | PpcOpcode::vrlh | PpcOpcode::vmulouh | + PpcOpcode::vsubfp | PpcOpcode::vmrghh | PpcOpcode::vpkuwum | + PpcOpcode::vadduwm | PpcOpcode::vmaxuw | PpcOpcode::vrlw | PpcOpcode::vmrghw | + PpcOpcode::vpkuhus | PpcOpcode::vpkuwus | + PpcOpcode::vmaxsb | PpcOpcode::vslb | PpcOpcode::vmulosb | PpcOpcode::vmrglb | + PpcOpcode::vpkshus | PpcOpcode::vmaxsh | PpcOpcode::vslh | PpcOpcode::vmulosh | + PpcOpcode::vmrglh | PpcOpcode::vpkswus | PpcOpcode::vaddcuw | PpcOpcode::vmaxsw | + PpcOpcode::vslw | PpcOpcode::vmrglw | PpcOpcode::vpkshss | PpcOpcode::vsl | + PpcOpcode::vpkswss | PpcOpcode::vaddubs | PpcOpcode::vminub | PpcOpcode::vsrb | + PpcOpcode::vmuleub | PpcOpcode::vadduhs | PpcOpcode::vminuh | PpcOpcode::vsrh | + PpcOpcode::vmuleuh | PpcOpcode::vadduws | PpcOpcode::vminuw | PpcOpcode::vsrw | + PpcOpcode::vsr | PpcOpcode::vaddsbs | PpcOpcode::vminsb | PpcOpcode::vsrab | + PpcOpcode::vmulesb | PpcOpcode::vpkpx | PpcOpcode::vaddshs | PpcOpcode::vminsh | + PpcOpcode::vsrah | PpcOpcode::vmulesh | PpcOpcode::vaddsws | PpcOpcode::vminsw | + PpcOpcode::vsraw | PpcOpcode::vsububm | PpcOpcode::vavgub | PpcOpcode::vand | + PpcOpcode::vmaxfp | PpcOpcode::vslo | PpcOpcode::vsubuhm | PpcOpcode::vavguh | + PpcOpcode::vandc | PpcOpcode::vminfp | PpcOpcode::vsro | PpcOpcode::vsubuwm | + PpcOpcode::vavguw | PpcOpcode::vor | PpcOpcode::vxor | PpcOpcode::vavgsb | + PpcOpcode::vnor | PpcOpcode::vavgsh | PpcOpcode::vsubcuw | PpcOpcode::vavgsw | + PpcOpcode::vsububs | PpcOpcode::vsum4ubs| PpcOpcode::vsubuhs | PpcOpcode::vsum4shs | + PpcOpcode::vsubuws | PpcOpcode::vsum2sws| PpcOpcode::vsubsbs | PpcOpcode::vsum4sbs | + PpcOpcode::vsubshs | PpcOpcode::vsubsws | PpcOpcode::vsumsws => { + fmt_vmx_3op(instr, opcode_name(instr.opcode)) + } + + // VMX unary VD, VB + PpcOpcode::vrefp | PpcOpcode::vrsqrtefp | PpcOpcode::vexptefp | + PpcOpcode::vlogefp | PpcOpcode::vrfin | PpcOpcode::vrfiz | + PpcOpcode::vrfip | PpcOpcode::vrfim | PpcOpcode::vupkhsb | + PpcOpcode::vupkhsh | PpcOpcode::vupklsb | PpcOpcode::vupklsh | + PpcOpcode::vupkhpx | PpcOpcode::vupklpx => { + fmt_vmx_unary(instr, opcode_name(instr.opcode)) + } + + // VMX VD, VB, UIMM (VA = uimm field) + PpcOpcode::vspltb | PpcOpcode::vsplth | PpcOpcode::vspltw | + PpcOpcode::vcfux | PpcOpcode::vcfsx | + PpcOpcode::vctuxs | PpcOpcode::vctsxs => { + fmt_vmx_uimm(instr, opcode_name(instr.opcode)) + } + + // VMX VD, SIMM (VA field as 5-bit signed immediate) + PpcOpcode::vspltisb => fmt_vmx_simm(instr, "vspltisb"), + PpcOpcode::vspltish => fmt_vmx_simm(instr, "vspltish"), + PpcOpcode::vspltisw => fmt_vmx_simm(instr, "vspltisw"), + + PpcOpcode::mfvscr => base("mfvscr", vr(instr.rd()), 8), + PpcOpcode::mtvscr => base("mtvscr", vr(instr.rb()), 8), + + // VMX compare (Rc bit at bit 21) + PpcOpcode::vcmpequb | PpcOpcode::vcmpequh | PpcOpcode::vcmpequw | + PpcOpcode::vcmpeqfp | PpcOpcode::vcmpgefp | PpcOpcode::vcmpgtub | + PpcOpcode::vcmpgtuh | PpcOpcode::vcmpgtuw | PpcOpcode::vcmpgtfp | + PpcOpcode::vcmpgtsb | PpcOpcode::vcmpgtsh | PpcOpcode::vcmpgtsw | + PpcOpcode::vcmpbfp => fmt_vmx_cmp(instr, opcode_name(instr.opcode)), + + // VMX 4-operand VD, VA, VB, VC + PpcOpcode::vmhaddshs | PpcOpcode::vmhraddshs | PpcOpcode::vmladduhm | + PpcOpcode::vmsumubm | PpcOpcode::vmsummbm | PpcOpcode::vmsumuhm | + PpcOpcode::vmsumuhs | PpcOpcode::vmsumshm | PpcOpcode::vmsumshs | + PpcOpcode::vsel | PpcOpcode::vperm => { + fmt_vmx_4op(instr, opcode_name(instr.opcode)) + } + + PpcOpcode::vsldoi => fmt_vsldoi(instr), + PpcOpcode::vmaddfp => fmt_vmx_4op_swap(instr, "vmaddfp"), + PpcOpcode::vnmsubfp => fmt_vmx_4op_swap(instr, "vnmsubfp"), + + // ── VMX128 load/store (uses GPR addressing + vd128 dest) ─────────── + PpcOpcode::lvsl128 => fmt_vmx128_ls(instr, "lvsl128"), + PpcOpcode::lvsr128 => fmt_vmx128_ls(instr, "lvsr128"), + PpcOpcode::lvewx128 => fmt_vmx128_ls(instr, "lvewx128"), + PpcOpcode::lvx128 => fmt_vmx128_ls(instr, "lvx128"), + PpcOpcode::lvxl128 => fmt_vmx128_ls(instr, "lvxl128"), + PpcOpcode::lvlx128 => fmt_vmx128_ls(instr, "lvlx128"), + PpcOpcode::lvrx128 => fmt_vmx128_ls(instr, "lvrx128"), + PpcOpcode::lvlxl128 => fmt_vmx128_ls(instr, "lvlxl128"), + PpcOpcode::lvrxl128 => fmt_vmx128_ls(instr, "lvrxl128"), + PpcOpcode::stvewx128 => fmt_vmx128_ls(instr, "stvewx128"), + PpcOpcode::stvx128 => fmt_vmx128_ls(instr, "stvx128"), + PpcOpcode::stvxl128 => fmt_vmx128_ls(instr, "stvxl128"), + PpcOpcode::stvlx128 => fmt_vmx128_ls(instr, "stvlx128"), + PpcOpcode::stvrx128 => fmt_vmx128_ls(instr, "stvrx128"), + PpcOpcode::stvlxl128 => fmt_vmx128_ls(instr, "stvlxl128"), + PpcOpcode::stvrxl128 => fmt_vmx128_ls(instr, "stvrxl128"), + + // Standard AltiVec load/store indexed (5-bit vr0-vr31) + PpcOpcode::lvsl => fmt_vmx_ls(instr, "lvsl"), + PpcOpcode::lvsr => fmt_vmx_ls(instr, "lvsr"), + PpcOpcode::lvebx => fmt_vmx_ls(instr, "lvebx"), + PpcOpcode::lvehx => fmt_vmx_ls(instr, "lvehx"), + PpcOpcode::lvewx => fmt_vmx_ls(instr, "lvewx"), + PpcOpcode::lvx => fmt_vmx_ls(instr, "lvx"), + PpcOpcode::lvxl => fmt_vmx_ls(instr, "lvxl"), + PpcOpcode::lvlx => fmt_vmx_ls(instr, "lvlx"), + PpcOpcode::lvrx => fmt_vmx_ls(instr, "lvrx"), + PpcOpcode::lvlxl => fmt_vmx_ls(instr, "lvlxl"), + PpcOpcode::lvrxl => fmt_vmx_ls(instr, "lvrxl"), + PpcOpcode::stvebx => fmt_vmx_ls(instr, "stvebx"), + PpcOpcode::stvehx => fmt_vmx_ls(instr, "stvehx"), + PpcOpcode::stvewx => fmt_vmx_ls(instr, "stvewx"), + PpcOpcode::stvx => fmt_vmx_ls(instr, "stvx"), + PpcOpcode::stvxl => fmt_vmx_ls(instr, "stvxl"), + PpcOpcode::stvlx => fmt_vmx_ls(instr, "stvlx"), + PpcOpcode::stvrx => fmt_vmx_ls(instr, "stvrx"), + PpcOpcode::stvlxl => fmt_vmx_ls(instr, "stvlxl"), + PpcOpcode::stvrxl => fmt_vmx_ls(instr, "stvrxl"), + + // ── VMX128 op5 (3-op and 4-op fp/pack/logic) ─────────────────────── + PpcOpcode::vaddfp128 => fmt_vmx128_3op(instr, "vaddfp128"), + PpcOpcode::vsubfp128 => fmt_vmx128_3op(instr, "vsubfp128"), + PpcOpcode::vmulfp128 => fmt_vmx128_3op(instr, "vmulfp128"), + PpcOpcode::vmsum3fp128 => fmt_vmx128_3op(instr, "vmsum3fp128"), + PpcOpcode::vmsum4fp128 => fmt_vmx128_3op(instr, "vmsum4fp128"), + PpcOpcode::vpkshss128 => fmt_vmx128_3op(instr, "vpkshss128"), + PpcOpcode::vpkshus128 => fmt_vmx128_3op(instr, "vpkshus128"), + PpcOpcode::vpkswss128 => fmt_vmx128_3op(instr, "vpkswss128"), + PpcOpcode::vpkswus128 => fmt_vmx128_3op(instr, "vpkswus128"), + PpcOpcode::vpkuhum128 => fmt_vmx128_3op(instr, "vpkuhum128"), + PpcOpcode::vpkuhus128 => fmt_vmx128_3op(instr, "vpkuhus128"), + PpcOpcode::vpkuwum128 => fmt_vmx128_3op(instr, "vpkuwum128"), + PpcOpcode::vpkuwus128 => fmt_vmx128_3op(instr, "vpkuwus128"), + PpcOpcode::vand128 => fmt_vmx128_3op(instr, "vand128"), + PpcOpcode::vandc128 => fmt_vmx128_3op(instr, "vandc128"), + PpcOpcode::vnor128 => fmt_vmx128_3op(instr, "vnor128"), + PpcOpcode::vor128 => fmt_vmx128_3op(instr, "vor128"), + PpcOpcode::vxor128 => fmt_vmx128_3op(instr, "vxor128"), + PpcOpcode::vsel128 => fmt_vmx128_3op(instr, "vsel128"), + PpcOpcode::vslo128 => fmt_vmx128_3op(instr, "vslo128"), + PpcOpcode::vsro128 => fmt_vmx128_3op(instr, "vsro128"), + + PpcOpcode::vmaddfp128 => fmt_vmaddfp128(instr), + PpcOpcode::vmaddcfp128 => fmt_vmx128_madd_vd_vb(instr, "vmaddcfp128"), + PpcOpcode::vnmsubfp128 => fmt_vmx128_madd_vd_vb(instr, "vnmsubfp128"), + + PpcOpcode::vperm128 => fmt_vperm128(instr), + PpcOpcode::vsldoi128 => fmt_vsldoi128(instr), + PpcOpcode::vpermwi128 => fmt_vpermwi128(instr), + + // ── VMX128 op6 special ───────────────────────────────────────────── + PpcOpcode::vpkd3d128 => fmt_vmx128_pack_d3d(instr, "vpkd3d128"), + PpcOpcode::vrlimi128 => fmt_vmx128_pack_d3d(instr, "vrlimi128"), + PpcOpcode::vrfim128 => fmt_vmx128_unary(instr, "vrfim128"), + PpcOpcode::vrfin128 => fmt_vmx128_unary(instr, "vrfin128"), + PpcOpcode::vrfip128 => fmt_vmx128_unary(instr, "vrfip128"), + PpcOpcode::vrfiz128 => fmt_vmx128_unary(instr, "vrfiz128"), + PpcOpcode::vrefp128 => fmt_vmx128_unary(instr, "vrefp128"), + PpcOpcode::vrsqrtefp128 => fmt_vmx128_unary(instr, "vrsqrtefp128"), + PpcOpcode::vexptefp128 => fmt_vmx128_unary(instr, "vexptefp128"), + PpcOpcode::vlogefp128 => fmt_vmx128_unary(instr, "vlogefp128"), + PpcOpcode::vcfpsxws128 => fmt_vmx128_uimm(instr, "vcfpsxws128"), + PpcOpcode::vcfpuxws128 => fmt_vmx128_uimm(instr, "vcfpuxws128"), + PpcOpcode::vcsxwfp128 => fmt_vmx128_uimm(instr, "vcsxwfp128"), + PpcOpcode::vcuxwfp128 => fmt_vmx128_uimm(instr, "vcuxwfp128"), + PpcOpcode::vspltw128 => fmt_vmx128_uimm(instr, "vspltw128"), + PpcOpcode::vupkd3d128 => fmt_vmx128_uimm(instr, "vupkd3d128"), + PpcOpcode::vspltisw128 => { + let vd = instr.vd128(); + let simm = sign_ext(extract_vx128_uimm5(instr.raw), 5); + base("vspltisw128", format!("{}, {simm}", vr(vd)), 14) + } + PpcOpcode::vcmpeqfp128 => fmt_vmx128_cmp(instr, "vcmpeqfp128"), + PpcOpcode::vcmpgefp128 => fmt_vmx128_cmp(instr, "vcmpgefp128"), + PpcOpcode::vcmpgtfp128 => fmt_vmx128_cmp(instr, "vcmpgtfp128"), + PpcOpcode::vcmpbfp128 => fmt_vmx128_cmp(instr, "vcmpbfp128"), + PpcOpcode::vcmpequw128 => fmt_vmx128_cmp(instr, "vcmpequw128"), + PpcOpcode::vrlw128 => fmt_vmx128_3op(instr, "vrlw128"), + PpcOpcode::vslw128 => fmt_vmx128_3op(instr, "vslw128"), + PpcOpcode::vsraw128 => fmt_vmx128_3op(instr, "vsraw128"), + PpcOpcode::vsrw128 => fmt_vmx128_3op(instr, "vsrw128"), + PpcOpcode::vmaxfp128 => fmt_vmx128_3op(instr, "vmaxfp128"), + PpcOpcode::vminfp128 => fmt_vmx128_3op(instr, "vminfp128"), + PpcOpcode::vmrghw128 => fmt_vmx128_3op(instr, "vmrghw128"), + PpcOpcode::vmrglw128 => fmt_vmx128_3op(instr, "vmrglw128"), + PpcOpcode::vupkhsb128 => fmt_vmx128_3op(instr, "vupkhsb128"), + PpcOpcode::vupklsb128 => fmt_vmx128_3op(instr, "vupklsb128"), + + PpcOpcode::Invalid => long_word(instr.raw), + } +} /// Disassemble a decoded instruction into PPC assembly text. +/// +/// Back-compat entry point: returns the same single-string the legacy +/// formatter produced, preferring the extended form when present. pub fn disassemble(instr: &DecodedInstr) -> String { - let mut out = String::new(); - match instr.opcode { - // Branch instructions - PpcOpcode::bx => { - let target = if instr.aa() { - instr.li() as u32 - } else { - instr.addr.wrapping_add(instr.li() as u32) - }; - let mnemonic = if instr.lk() { "bl" } else { "b" }; - write!(out, "{} 0x{:08X}", mnemonic, target).unwrap(); - } - PpcOpcode::bcx => { - let bo = instr.bo(); - let bi = instr.bi(); - let target = if instr.aa() { - instr.bd() as u32 - } else { - instr.addr.wrapping_add(instr.bd() as u32) - }; - let mnemonic = if instr.lk() { "bcl" } else { "bc" }; - write!(out, "{} {},{},0x{:08X}", mnemonic, bo, bi, target).unwrap(); - } - PpcOpcode::bclrx => { - let mnemonic = if instr.lk() { "bclrl" } else { "bclr" }; - write!(out, "{} {},{}", mnemonic, instr.bo(), instr.bi()).unwrap(); - } - PpcOpcode::bcctrx => { - let mnemonic = if instr.lk() { "bcctrl" } else { "bcctr" }; - write!(out, "{} {},{}", mnemonic, instr.bo(), instr.bi()).unwrap(); - } - - // System call - PpcOpcode::sc => { - write!(out, "sc").unwrap(); - } - - // D-form load/store - PpcOpcode::lwz | PpcOpcode::lwzu | PpcOpcode::lbz | PpcOpcode::lbzu | - PpcOpcode::lhz | PpcOpcode::lhzu | PpcOpcode::lha | PpcOpcode::lhau | - PpcOpcode::lfs | PpcOpcode::lfsu | PpcOpcode::lfd | PpcOpcode::lfdu => { - write!(out, "{:?} r{},{}(r{})", instr.opcode, instr.rd(), instr.d(), instr.ra()).unwrap(); - } - PpcOpcode::stw | PpcOpcode::stwu | PpcOpcode::stb | PpcOpcode::stbu | - PpcOpcode::sth | PpcOpcode::sthu | - PpcOpcode::stfs | PpcOpcode::stfsu | PpcOpcode::stfd | PpcOpcode::stfdu => { - write!(out, "{:?} r{},{}(r{})", instr.opcode, instr.rs(), instr.d(), instr.ra()).unwrap(); - } - - // D-form immediate ALU - PpcOpcode::addi | PpcOpcode::addis | PpcOpcode::addic | PpcOpcode::addicx | - PpcOpcode::subficx | PpcOpcode::mulli => { - write!(out, "{:?} r{},r{},{}", instr.opcode, instr.rd(), instr.ra(), instr.simm16()).unwrap(); - } - - // D-form immediate logical - PpcOpcode::ori | PpcOpcode::oris | PpcOpcode::xori | PpcOpcode::xoris | - PpcOpcode::andix | PpcOpcode::andisx => { - write!(out, "{:?} r{},r{},0x{:04X}", instr.opcode, instr.ra(), instr.rs(), instr.uimm16()).unwrap(); - } - - // Compare - PpcOpcode::cmpi => { - write!(out, "cmp{}i cr{},r{},{}", if instr.l() { "d" } else { "w" }, - instr.crfd(), instr.ra(), instr.simm16()).unwrap(); - } - PpcOpcode::cmpli => { - write!(out, "cmpl{}i cr{},r{},0x{:04X}", if instr.l() { "d" } else { "w" }, - instr.crfd(), instr.ra(), instr.uimm16()).unwrap(); - } - PpcOpcode::cmp => { - write!(out, "cmp{} cr{},r{},r{}", if instr.l() { "d" } else { "w" }, - instr.crfd(), instr.ra(), instr.rb()).unwrap(); - } - PpcOpcode::cmpl => { - write!(out, "cmpl{} cr{},r{},r{}", if instr.l() { "d" } else { "w" }, - instr.crfd(), instr.ra(), instr.rb()).unwrap(); - } - - // X-form ALU (3-register) - PpcOpcode::addx | PpcOpcode::addcx | PpcOpcode::addex | PpcOpcode::addzex | - PpcOpcode::addmex | PpcOpcode::subfx | PpcOpcode::subfcx | PpcOpcode::subfex | - PpcOpcode::subfzex | PpcOpcode::subfmex | PpcOpcode::negx | - PpcOpcode::mullwx | PpcOpcode::mulhwx | PpcOpcode::mulhwux | - PpcOpcode::divwx | PpcOpcode::divwux | - PpcOpcode::mulldx | PpcOpcode::mulhdx | PpcOpcode::mulhdux | - PpcOpcode::divdx | PpcOpcode::divdux => { - write!(out, "{:?} r{},r{},r{}", instr.opcode, instr.rd(), instr.ra(), instr.rb()).unwrap(); - } - - // X-form logical - PpcOpcode::andx | PpcOpcode::andcx | PpcOpcode::orx | PpcOpcode::orcx | - PpcOpcode::xorx | PpcOpcode::norx | PpcOpcode::nandx | PpcOpcode::eqvx => { - write!(out, "{:?} r{},r{},r{}", instr.opcode, instr.ra(), instr.rs(), instr.rb()).unwrap(); - } - - // Shift/rotate - PpcOpcode::slwx | PpcOpcode::srwx | PpcOpcode::srawx | PpcOpcode::sldx | - PpcOpcode::srdx | PpcOpcode::sradx => { - write!(out, "{:?} r{},r{},r{}", instr.opcode, instr.ra(), instr.rs(), instr.rb()).unwrap(); - } - PpcOpcode::srawix => { - write!(out, "srawi r{},r{},{}", instr.ra(), instr.rs(), instr.sh()).unwrap(); - } - PpcOpcode::sradix => { - write!(out, "sradi r{},r{},{}", instr.ra(), instr.rs(), instr.sh64()).unwrap(); - } - - // Rotate - PpcOpcode::rlwinmx => { - write!(out, "rlwinm r{},r{},{},{},{}", instr.ra(), instr.rs(), instr.sh(), instr.mb(), instr.me()).unwrap(); - } - PpcOpcode::rlwimix => { - write!(out, "rlwimi r{},r{},{},{},{}", instr.ra(), instr.rs(), instr.sh(), instr.mb(), instr.me()).unwrap(); - } - PpcOpcode::rlwnmx => { - write!(out, "rlwnm r{},r{},r{},{},{}", instr.ra(), instr.rs(), instr.rb(), instr.mb(), instr.me()).unwrap(); - } - - // Special register moves - PpcOpcode::mfspr => { - let spr_name = match instr.spr() { - 1 => "xer", - 8 => "lr", - 9 => "ctr", - 268 => "tbl", - 269 => "tbu", - _ => "", - }; - if spr_name.is_empty() { - write!(out, "mfspr r{},{}", instr.rd(), instr.spr()).unwrap(); - } else { - write!(out, "mf{} r{}", spr_name, instr.rd()).unwrap(); - } - } - PpcOpcode::mtspr => { - let spr_name = match instr.spr() { - 1 => "xer", - 8 => "lr", - 9 => "ctr", - _ => "", - }; - if spr_name.is_empty() { - write!(out, "mtspr {},r{}", instr.spr(), instr.rs()).unwrap(); - } else { - write!(out, "mt{} r{}", spr_name, instr.rs()).unwrap(); - } - } - PpcOpcode::mfcr => { - write!(out, "mfcr r{}", instr.rd()).unwrap(); - } - PpcOpcode::mtcrf => { - write!(out, "mtcrf 0x{:02X},r{}", instr.crm(), instr.rs()).unwrap(); - } - - // Extend - PpcOpcode::extsbx => write!(out, "extsb r{},r{}", instr.ra(), instr.rs()).unwrap(), - PpcOpcode::extshx => write!(out, "extsh r{},r{}", instr.ra(), instr.rs()).unwrap(), - PpcOpcode::extswx => write!(out, "extsw r{},r{}", instr.ra(), instr.rs()).unwrap(), - PpcOpcode::cntlzwx => write!(out, "cntlzw r{},r{}", instr.ra(), instr.rs()).unwrap(), - PpcOpcode::cntlzdx => write!(out, "cntlzd r{},r{}", instr.ra(), instr.rs()).unwrap(), - - // X-form load/store - PpcOpcode::lwzx | PpcOpcode::lwzux | PpcOpcode::lbzx | PpcOpcode::lbzux | - PpcOpcode::lhzx | PpcOpcode::lhzux | PpcOpcode::lhax | PpcOpcode::lhaux | - PpcOpcode::lwax | PpcOpcode::lwaux | PpcOpcode::ldx | PpcOpcode::ldux | - PpcOpcode::lfsx | PpcOpcode::lfsux | PpcOpcode::lfdx | PpcOpcode::lfdux | - PpcOpcode::lwbrx | PpcOpcode::lhbrx | PpcOpcode::ldbrx | - PpcOpcode::lwarx | PpcOpcode::ldarx => { - write!(out, "{:?} r{},r{},r{}", instr.opcode, instr.rd(), instr.ra(), instr.rb()).unwrap(); - } - PpcOpcode::stwx | PpcOpcode::stwux | PpcOpcode::stbx | PpcOpcode::stbux | - PpcOpcode::sthx | PpcOpcode::sthux | PpcOpcode::stdx | PpcOpcode::stdux | - PpcOpcode::stfsx | PpcOpcode::stfsux | PpcOpcode::stfdx | PpcOpcode::stfdux | - PpcOpcode::stwbrx | PpcOpcode::sthbrx | PpcOpcode::stdbrx | - PpcOpcode::stwcx | PpcOpcode::stdcx | PpcOpcode::stfiwx => { - write!(out, "{:?} r{},r{},r{}", instr.opcode, instr.rs(), instr.ra(), instr.rb()).unwrap(); - } - - // Cache/sync ops (no-ops for interpreter) - PpcOpcode::dcbf | PpcOpcode::dcbi | PpcOpcode::dcbst | - PpcOpcode::dcbt | PpcOpcode::dcbtst | PpcOpcode::icbi => { - write!(out, "{:?} r{},r{}", instr.opcode, instr.ra(), instr.rb()).unwrap(); - } - PpcOpcode::dcbz | PpcOpcode::dcbz128 => { - write!(out, "{:?} r{},r{}", instr.opcode, instr.ra(), instr.rb()).unwrap(); - } - PpcOpcode::sync | PpcOpcode::eieio | PpcOpcode::isync => { - write!(out, "{:?}", instr.opcode).unwrap(); - } - - // Load/store multiple - PpcOpcode::lmw => write!(out, "lmw r{},{}(r{})", instr.rd(), instr.d(), instr.ra()).unwrap(), - PpcOpcode::stmw => write!(out, "stmw r{},{}(r{})", instr.rs(), instr.d(), instr.ra()).unwrap(), - - // DS-form loads/stores - PpcOpcode::ld | PpcOpcode::ldu | PpcOpcode::lwa => { - write!(out, "{:?} r{},{}(r{})", instr.opcode, instr.rd(), instr.ds(), instr.ra()).unwrap(); - } - PpcOpcode::std | PpcOpcode::stdu => { - write!(out, "{:?} r{},{}(r{})", instr.opcode, instr.rs(), instr.ds(), instr.ra()).unwrap(); - } - - // CR logical ops - PpcOpcode::crand | PpcOpcode::crandc | PpcOpcode::creqv | PpcOpcode::crnand | - PpcOpcode::crnor | PpcOpcode::cror | PpcOpcode::crorc | PpcOpcode::crxor => { - write!(out, "{:?} {},{},{}", instr.opcode, instr.crbd(), instr.crba(), instr.crbb()).unwrap(); - } - PpcOpcode::mcrf => { - write!(out, "mcrf cr{},cr{}", instr.crfd(), instr.crfs()).unwrap(); - } - - // Trap - PpcOpcode::tdi => write!(out, "tdi {},r{},{}", instr.rd(), instr.ra(), instr.simm16()).unwrap(), - PpcOpcode::twi => write!(out, "twi {},r{},{}", instr.rd(), instr.ra(), instr.simm16()).unwrap(), - PpcOpcode::td => write!(out, "td {},r{},r{}", instr.rd(), instr.ra(), instr.rb()).unwrap(), - PpcOpcode::tw => write!(out, "tw {},r{},r{}", instr.rd(), instr.ra(), instr.rb()).unwrap(), - - // Default: just print opcode and raw hex - _ => { - write!(out, "{:?} [{:08X}]", instr.opcode, instr.raw).unwrap(); - } - } - out + format(instr).display().to_string() } /// Disassemble a range of instructions from a byte slice. @@ -239,38 +633,1235 @@ pub fn disassemble_block(data: &[u8], base_addr: u32, count: usize) -> Vec<(u32, break; } let raw = u32::from_be_bytes([ - data[offset], - data[offset + 1], - data[offset + 2], - data[offset + 3], + data[offset], data[offset + 1], data[offset + 2], data[offset + 3], ]); let addr = base_addr + offset as u32; - let instr = crate::decode(raw, addr); + let instr = crate::decoder::decode(raw, addr); let text = disassemble(&instr); result.push((addr, text)); } result } +/// One yielded instruction from [`iter_disasm`]. Carries the absolute VA, +/// raw word, decoded opcode and the formatted text — everything a sink +/// needs to render or persist a single row without re-parsing. +#[derive(Debug, Clone)] +pub struct DisasmItem { + pub addr: u32, + pub raw: u32, + pub opcode: PpcOpcode, + pub text: DisasmText, +} + +/// Iterate over instructions in the VA range `[va_start, va_end)` of an +/// image-mapped byte slice. `image[rva]` must hold the byte at absolute VA +/// `image_base + rva` (the layout produced by [`xenia_xex::loader`]). +/// +/// Stops on a truncated tail (less than 4 bytes remaining at the cursor). +/// Yields nothing if `va_start >= va_end` or the start RVA is beyond the +/// image. +pub fn iter_disasm( + image: &[u8], + image_base: u32, + va_start: u32, + va_end: u32, +) -> impl Iterator + '_ { + DisasmIter { image, image_base, va: va_start, end: va_end } +} + +struct DisasmIter<'a> { + image: &'a [u8], + image_base: u32, + va: u32, + end: u32, +} + +impl Iterator for DisasmIter<'_> { + type Item = DisasmItem; + #[inline] + fn next(&mut self) -> Option { + if self.va >= self.end { + return None; + } + let rva = self.va.wrapping_sub(self.image_base) as usize; + if rva + 4 > self.image.len() { + return None; + } + let raw = u32::from_be_bytes([ + self.image[rva], + self.image[rva + 1], + self.image[rva + 2], + self.image[rva + 3], + ]); + let abs = self.va; + let decoded = crate::decoder::decode(raw, abs); + let text = format(&decoded); + self.va = self.va.wrapping_add(4); + Some(DisasmItem { addr: abs, raw, opcode: decoded.opcode, text }) + } +} + +// ── Per-class formatters ─────────────────────────────────────────────────── + +fn opcode_name(op: PpcOpcode) -> &'static str { + // Used for VMX where the enum variant name matches the canonical mnemonic. + // For ALU/FPU variants ending in "x", use hardcoded strings instead. + match op { + PpcOpcode::vaddubm => "vaddubm", PpcOpcode::vmaxub => "vmaxub", PpcOpcode::vrlb => "vrlb", + PpcOpcode::vmuloub => "vmuloub", PpcOpcode::vaddfp => "vaddfp", PpcOpcode::vmrghb => "vmrghb", + PpcOpcode::vpkuhum => "vpkuhum", PpcOpcode::vadduhm => "vadduhm", PpcOpcode::vmaxuh => "vmaxuh", + PpcOpcode::vrlh => "vrlh", PpcOpcode::vmulouh => "vmulouh", PpcOpcode::vsubfp => "vsubfp", + PpcOpcode::vmrghh => "vmrghh", PpcOpcode::vpkuwum => "vpkuwum", + PpcOpcode::vadduwm => "vadduwm", PpcOpcode::vmaxuw => "vmaxuw", PpcOpcode::vrlw => "vrlw", + PpcOpcode::vmrghw => "vmrghw", PpcOpcode::vpkuhus => "vpkuhus", PpcOpcode::vpkuwus => "vpkuwus", + PpcOpcode::vmaxsb => "vmaxsb", PpcOpcode::vslb => "vslb", PpcOpcode::vmulosb => "vmulosb", + PpcOpcode::vmrglb => "vmrglb", PpcOpcode::vpkshus => "vpkshus", PpcOpcode::vmaxsh => "vmaxsh", + PpcOpcode::vslh => "vslh", PpcOpcode::vmulosh => "vmulosh", PpcOpcode::vmrglh => "vmrglh", + PpcOpcode::vpkswus => "vpkswus", PpcOpcode::vaddcuw => "vaddcuw", PpcOpcode::vmaxsw => "vmaxsw", + PpcOpcode::vslw => "vslw", PpcOpcode::vmrglw => "vmrglw", PpcOpcode::vpkshss => "vpkshss", + PpcOpcode::vsl => "vsl", PpcOpcode::vpkswss => "vpkswss", + PpcOpcode::vaddubs => "vaddubs", PpcOpcode::vminub => "vminub", PpcOpcode::vsrb => "vsrb", + PpcOpcode::vmuleub => "vmuleub", PpcOpcode::vadduhs => "vadduhs", PpcOpcode::vminuh => "vminuh", + PpcOpcode::vsrh => "vsrh", PpcOpcode::vmuleuh => "vmuleuh", + PpcOpcode::vadduws => "vadduws", PpcOpcode::vminuw => "vminuw", PpcOpcode::vsrw => "vsrw", + PpcOpcode::vsr => "vsr", + PpcOpcode::vaddsbs => "vaddsbs", PpcOpcode::vminsb => "vminsb", PpcOpcode::vsrab => "vsrab", + PpcOpcode::vmulesb => "vmulesb", PpcOpcode::vpkpx => "vpkpx", + PpcOpcode::vaddshs => "vaddshs", PpcOpcode::vminsh => "vminsh", PpcOpcode::vsrah => "vsrah", + PpcOpcode::vmulesh => "vmulesh", + PpcOpcode::vaddsws => "vaddsws", PpcOpcode::vminsw => "vminsw", PpcOpcode::vsraw => "vsraw", + PpcOpcode::vsububm => "vsububm", PpcOpcode::vavgub => "vavgub", PpcOpcode::vand => "vand", + PpcOpcode::vmaxfp => "vmaxfp", PpcOpcode::vslo => "vslo", + PpcOpcode::vsubuhm => "vsubuhm", PpcOpcode::vavguh => "vavguh", PpcOpcode::vandc => "vandc", + PpcOpcode::vminfp => "vminfp", PpcOpcode::vsro => "vsro", + PpcOpcode::vsubuwm => "vsubuwm", PpcOpcode::vavguw => "vavguw", PpcOpcode::vor => "vor", + PpcOpcode::vxor => "vxor", PpcOpcode::vavgsb => "vavgsb", PpcOpcode::vnor => "vnor", + PpcOpcode::vavgsh => "vavgsh", PpcOpcode::vsubcuw => "vsubcuw", PpcOpcode::vavgsw => "vavgsw", + PpcOpcode::vsububs => "vsububs", PpcOpcode::vsum4ubs => "vsum4ubs", + PpcOpcode::vsubuhs => "vsubuhs", PpcOpcode::vsum4shs => "vsum4shs", + PpcOpcode::vsubuws => "vsubuws", PpcOpcode::vsum2sws => "vsum2sws", + PpcOpcode::vsubsbs => "vsubsbs", PpcOpcode::vsum4sbs => "vsum4sbs", + PpcOpcode::vsubshs => "vsubshs", PpcOpcode::vsubsws => "vsubsws", + PpcOpcode::vsumsws => "vsumsws", + + PpcOpcode::vrefp => "vrefp", PpcOpcode::vrsqrtefp => "vrsqrtefp", + PpcOpcode::vexptefp => "vexptefp", PpcOpcode::vlogefp => "vlogefp", + PpcOpcode::vrfin => "vrfin", PpcOpcode::vrfiz => "vrfiz", + PpcOpcode::vrfip => "vrfip", PpcOpcode::vrfim => "vrfim", + PpcOpcode::vupkhsb => "vupkhsb", PpcOpcode::vupkhsh => "vupkhsh", + PpcOpcode::vupklsb => "vupklsb", PpcOpcode::vupklsh => "vupklsh", + PpcOpcode::vupkhpx => "vupkhpx", PpcOpcode::vupklpx => "vupklpx", + + PpcOpcode::vspltb => "vspltb", PpcOpcode::vsplth => "vsplth", PpcOpcode::vspltw => "vspltw", + PpcOpcode::vcfux => "vcfux", PpcOpcode::vcfsx => "vcfsx", + PpcOpcode::vctuxs => "vctuxs", PpcOpcode::vctsxs => "vctsxs", + + PpcOpcode::vcmpequb => "vcmpequb", PpcOpcode::vcmpequh => "vcmpequh", + PpcOpcode::vcmpequw => "vcmpequw", PpcOpcode::vcmpeqfp => "vcmpeqfp", + PpcOpcode::vcmpgefp => "vcmpgefp", PpcOpcode::vcmpgtub => "vcmpgtub", + PpcOpcode::vcmpgtuh => "vcmpgtuh", PpcOpcode::vcmpgtuw => "vcmpgtuw", + PpcOpcode::vcmpgtfp => "vcmpgtfp", PpcOpcode::vcmpgtsb => "vcmpgtsb", + PpcOpcode::vcmpgtsh => "vcmpgtsh", PpcOpcode::vcmpgtsw => "vcmpgtsw", + PpcOpcode::vcmpbfp => "vcmpbfp", + + PpcOpcode::vmhaddshs => "vmhaddshs", PpcOpcode::vmhraddshs => "vmhraddshs", + PpcOpcode::vmladduhm => "vmladduhm", + PpcOpcode::vmsumubm => "vmsumubm", PpcOpcode::vmsummbm => "vmsummbm", + PpcOpcode::vmsumuhm => "vmsumuhm", PpcOpcode::vmsumuhs => "vmsumuhs", + PpcOpcode::vmsumshm => "vmsumshm", PpcOpcode::vmsumshs => "vmsumshs", + PpcOpcode::vsel => "vsel", PpcOpcode::vperm => "vperm", + _ => "?", + } +} + +// Branches (I-form: b/bl/ba/bla) — produces base + extended forms. +fn fmt_b(instr: &DecodedInstr) -> DisasmText { + let aa = instr.aa(); + let lk = instr.lk(); + let target = if aa { instr.li() as u32 } + else { instr.addr.wrapping_add(instr.li() as u32) }; + let mnem = match (aa, lk) { + (false, false) => "b", + (false, true) => "bl", + (true, false) => "ba", + (true, true) => "bla", + }; + let ops = format!("0x{target:08X}"); + with_target(base(mnem, ops, 8), target) +} + +fn fmt_bc(instr: &DecodedInstr) -> DisasmText { + let bo = instr.bo(); + let bi = instr.bi(); + let aa = instr.aa(); + let lk = instr.lk(); + let target = if aa { instr.bd() as u32 } + else { instr.addr.wrapping_add(instr.bd() as u32) }; + + let a = if aa { "a" } else { "" }; + let l = if lk { "l" } else { "" }; + let base_mnem = format!("bc{a}{l}"); + let base_ops = format!("{bo}, {}, 0x{target:08X}", crb(bi)); + + // Extended forms. + let cr_field = bi / 4; + let cr_bit = bi % 4; + let decr = bo & 0x04 == 0; + let uncond = bo & 0x10 != 0; + + let result = if uncond && !decr { + // Unconditional branch. + let ext_mnem = format!("b{a}{l}"); + let ext_ops = format!("0x{target:08X}"); + with_ext(&base_mnem, base_ops, 8, &ext_mnem, ext_ops, 8) + } else { + let cond_true = bo & 0x08 != 0; + let cond_name_opt: Option<&'static str> = match (cr_bit, cond_true) { + (0, true) => Some("lt"), (0, false) => Some("ge"), + (1, true) => Some("gt"), (1, false) => Some("le"), + (2, true) => Some("eq"), (2, false) => Some("ne"), + (3, true) => Some("so"), (3, false) => Some("ns"), + _ => None, + }; + let cr = if cr_field == 0 { String::new() } else { format!("cr{cr_field}, ") }; + + if decr { + let z = if bo & 0x02 != 0 { "z" } else { "nz" }; + let cond_str = cond_name_opt.unwrap_or(""); + let ext_mnem = format!("bd{z}{cond_str}{a}{l}"); + let ext_ops = format!("{cr}0x{target:08X}"); + with_ext(&base_mnem, base_ops, 8, &ext_mnem, ext_ops, 8) + } else if let Some(cond_name) = cond_name_opt { + let ext_mnem = format!("b{cond_name}{a}{l}"); + let ext_ops = format!("{cr}0x{target:08X}"); + with_ext(&base_mnem, base_ops, 8, &ext_mnem, ext_ops, 8) + } else { + base(&base_mnem, base_ops, 8) + } + }; + with_target(result, target) +} + +fn fmt_bclr(instr: &DecodedInstr) -> DisasmText { + let bo = instr.bo(); + let bi = instr.bi(); + let lk = instr.lk(); + let l = if lk { "l" } else { "" }; + let base_mnem = format!("bclr{l}"); + let base_ops = format!("{bo}, {}", crb(bi)); + + // BO=20 (binary 10100) sets both "ignore CTR" and "ignore CR" bits, making + // the branch unconditional regardless of BI. BI is don't-care by spec, so + // the simplified `blr`/`blrl` form applies for any BI value. + if bo == 20 { + let ext = if lk { "blrl" } else { "blr" }; + return with_ext(&base_mnem, base_ops, 8, ext, String::new(), 0); + } + if let Some((cond, cr)) = cond_branch_ext(bo, bi) { + let cr_no_comma = cr.trim_end_matches(", "); + let ext_mnem = format!("b{cond}lr{l}"); + if cr_no_comma.is_empty() { + return with_ext(&base_mnem, base_ops, 8, &ext_mnem, String::new(), 0); + } else { + return with_ext(&base_mnem, base_ops, 8, &ext_mnem, cr_no_comma.to_string(), 8); + } + } + let decr = bo & 0x04 == 0; + let uncond = bo & 0x10 != 0; + if decr && uncond { + let z = if bo & 0x02 != 0 { "z" } else { "nz" }; + let ext_mnem = format!("bd{z}lr{l}"); + return with_ext(&base_mnem, base_ops, 8, &ext_mnem, String::new(), 0); + } + base(&base_mnem, base_ops, 8) +} + +fn fmt_bcctr(instr: &DecodedInstr) -> DisasmText { + let bo = instr.bo(); + let bi = instr.bi(); + let lk = instr.lk(); + let l = if lk { "l" } else { "" }; + let base_mnem = format!("bcctr{l}"); + let base_ops = format!("{bo}, {}", crb(bi)); + + // BO=20 unconditional pattern: BI is don't-care (see fmt_bclr). + if bo == 20 { + let ext = if lk { "bctrl" } else { "bctr" }; + return with_ext(&base_mnem, base_ops, 8, ext, String::new(), 0); + } + if let Some((cond, cr)) = cond_branch_ext(bo, bi) { + let cr_no_comma = cr.trim_end_matches(", "); + let ext_mnem = format!("b{cond}ctr{l}"); + if cr_no_comma.is_empty() { + return with_ext(&base_mnem, base_ops, 8, &ext_mnem, String::new(), 0); + } else { + return with_ext(&base_mnem, base_ops, 8, &ext_mnem, cr_no_comma.to_string(), 8); + } + } + base(&base_mnem, base_ops, 8) +} + +// Trap immediate / register +fn fmt_trap_imm(instr: &DecodedInstr, mnem: &str, simplified_prefix: &str) -> DisasmText { + let to = instr.to(); + let ra = instr.ra(); + let imm = instr.simm16() as i32; + let base_ops = format!("{to}, {}, {imm}", gpr(ra)); + if let Some(cond) = trap_cond(to) { + if cond.is_empty() { + base(mnem, base_ops, 8) + } else { + let ext_mnem = format!("{simplified_prefix}{cond}i"); + let ext_ops = format!("{}, {imm}", gpr(ra)); + with_ext(mnem, base_ops, 8, &ext_mnem, ext_ops, 8) + } + } else { + base(mnem, base_ops, 8) + } +} + +fn fmt_trap_reg(instr: &DecodedInstr, mnem: &str) -> DisasmText { + let to = instr.to(); + let ra = instr.ra(); + let rb = instr.rb(); + let base_ops = format!("{to}, {}, {}", gpr(ra), gpr(rb)); + if to == 31 && ra == 0 && rb == 0 { + return with_ext(mnem, base_ops, 8, "trap", String::new(), 0); + } + if let Some(cond) = trap_cond(to) + && !cond.is_empty() + { + let ext_mnem = format!("{mnem}{cond}"); + let ext_ops = format!("{}, {}", gpr(ra), gpr(rb)); + return with_ext(mnem, base_ops, 8, &ext_mnem, ext_ops, 8); + } + base(mnem, base_ops, 8) +} + +// D-form ALU +fn fmt_addi(instr: &DecodedInstr) -> DisasmText { + let rt = instr.rd(); + let ra = instr.ra(); + let imm = instr.simm16() as i32; + let base_ops = format!("{}, {}, {imm}", gpr(rt), gpr(ra)); + if ra == 0 { + with_ext("addi", base_ops, 8, "li", format!("{}, {imm}", gpr(rt)), 8) + } else if imm < 0 { + with_ext("addi", base_ops, 8, "subi", format!("{}, {}, {}", gpr(rt), gpr(ra), -imm), 8) + } else { + base("addi", base_ops, 8) + } +} + +fn fmt_addis(instr: &DecodedInstr) -> DisasmText { + let rt = instr.rd(); + let ra = instr.ra(); + let imm = instr.simm16() as i32; + let imm_u = imm as u16 as u32; + let base_ops = format!("{}, {}, 0x{imm_u:X}", gpr(rt), gpr(ra)); + if ra == 0 { + with_ext("addis", base_ops, 8, "lis", format!("{}, 0x{imm_u:X}", gpr(rt)), 8) + } else if imm < 0 { + let neg = (-imm) as u16 as u32; + with_ext("addis", base_ops, 8, "subis", format!("{}, {}, 0x{neg:X}", gpr(rt), gpr(ra)), 8) + } else { + base("addis", base_ops, 8) + } +} + +fn fmt_d_add(instr: &DecodedInstr, mnem: &str) -> DisasmText { + let rt = instr.rd(); + let ra = instr.ra(); + let imm = instr.simm16() as i32; + let base_ops = format!("{}, {}, {imm}", gpr(rt), gpr(ra)); + if imm < 0 { + let ext_mnem = mnem.replace("addic", "subic"); + with_ext(mnem, base_ops, 8, &ext_mnem, format!("{}, {}, {}", gpr(rt), gpr(ra), -imm), 8) + } else { + base(mnem, base_ops, 8) + } +} + +fn fmt_d_imm_simple(instr: &DecodedInstr, mnem: &str) -> DisasmText { + let rt = instr.rd(); + let ra = instr.ra(); + let imm = instr.simm16() as i32; + base(mnem, format!("{}, {}, {imm}", gpr(rt), gpr(ra)), 8) +} + +fn fmt_cmp_imm(instr: &DecodedInstr, mnem: &str, signed: bool) -> DisasmText { + let bf = instr.crfd(); + let l_bit = if instr.l() { 1 } else { 0 }; + let ra = instr.ra(); + let imm_str = if signed { + format!("{}", instr.simm16() as i32) + } else { + format!("0x{:X}", instr.uimm16()) + }; + let cr = if bf == 0 { String::new() } else { format!("cr{bf}, ") }; + let base_ops = format!("{cr}{l_bit}, {}, {imm_str}", gpr(ra)); + + let size = if l_bit == 0 { "w" } else { "d" }; + let ext_mnem = if mnem == "cmpi" { + format!("cmp{size}i") + } else { + format!("cmpl{size}i") + }; + let ext_ops = format!("{cr}{}, {imm_str}", gpr(ra)); + with_ext(mnem, base_ops, 8, &ext_mnem, ext_ops, 8) +} + +fn fmt_cmp_reg(instr: &DecodedInstr, mnem: &str) -> DisasmText { + let bf = instr.crfd(); + let l_bit = if instr.l() { 1 } else { 0 }; + let ra = instr.ra(); + let rb = instr.rb(); + let cr = if bf == 0 { String::new() } else { format!("cr{bf}, ") }; + let base_ops = format!("{cr}{l_bit}, {}, {}", gpr(ra), gpr(rb)); + let size = if l_bit == 0 { "w" } else { "d" }; + let ext_mnem = format!("{mnem}{size}"); + let ext_ops = format!("{cr}{}, {}", gpr(ra), gpr(rb)); + with_ext(mnem, base_ops, 8, &ext_mnem, ext_ops, 8) +} + +fn fmt_ori(instr: &DecodedInstr) -> DisasmText { + let rs = instr.rs(); + let ra = instr.ra(); + let uimm = instr.uimm16() as u32; + let base_ops = format!("{}, {}, 0x{uimm:X}", gpr(ra), gpr(rs)); + if rs == 0 && ra == 0 && uimm == 0 { + with_ext("ori", base_ops, 8, "nop", String::new(), 0) + } else { + base("ori", base_ops, 8) + } +} + +fn fmt_d_logic(instr: &DecodedInstr, mnem: &str) -> DisasmText { + let rs = instr.rs(); + let ra = instr.ra(); + let uimm = instr.uimm16() as u32; + base(mnem, format!("{}, {}, 0x{uimm:X}", gpr(ra), gpr(rs)), 8) +} + +// D-form load/store. `is_fpr` selects between fX and rX for the data register. +fn fmt_ld(instr: &DecodedInstr, mnem: &str, is_fpr: bool) -> DisasmText { + let rt = instr.rd(); + let ra = instr.ra(); + let d = instr.d(); + let rn = if is_fpr { fpr(rt) } else { gpr(rt) }; + base(mnem, format!("{rn}, {d}({})", gpr(ra)), 8) +} + +fn fmt_st(instr: &DecodedInstr, mnem: &str, is_fpr: bool) -> DisasmText { + let rs = instr.rs(); + let ra = instr.ra(); + let d = instr.d(); + let rn = if is_fpr { fpr(rs) } else { gpr(rs) }; + base(mnem, format!("{rn}, {d}({})", gpr(ra)), 8) +} + +// DS-form load/store. +fn fmt_ds(instr: &DecodedInstr, mnem: &str) -> DisasmText { + let r = instr.rd(); + let ra = instr.ra(); + let ds = instr.ds(); + base(mnem, format!("{}, {ds}({})", gpr(r), gpr(ra)), 8) +} + +// Rotate (32-bit). +fn fmt_rlwimi(instr: &DecodedInstr) -> DisasmText { + let rs = instr.rs(); + let ra = instr.ra(); + let sh = instr.sh(); + let mb = instr.mb(); + let me = instr.me(); + let rc = rc_dot(instr); + let mnem = format!("rlwimi{rc}"); + let base_ops = format!("{}, {}, {sh}, {mb}, {me}", gpr(ra), gpr(rs)); + // inslwi rA, rS, n, b = rlwimi rA, rS, 32-b, b, b+n-1 + if mb <= me && sh == (32u32.wrapping_sub(mb)) % 32 && sh != 31u32.wrapping_sub(me) { + let n = me - mb + 1; + let b = mb; + let ext_mnem = format!("inslwi{rc}"); + return with_ext(&mnem, base_ops, 8, &ext_mnem, format!("{}, {}, {n}, {b}", gpr(ra), gpr(rs)), 8); + } + // insrwi rA, rS, n, b = rlwimi rA, rS, 32-(b+n), b, b+n-1 + if mb <= me && sh == 31u32.wrapping_sub(me) % 32 { + let n = me - mb + 1; + let b = mb; + let ext_mnem = format!("insrwi{rc}"); + return with_ext(&mnem, base_ops, 8, &ext_mnem, format!("{}, {}, {n}, {b}", gpr(ra), gpr(rs)), 8); + } + base(&mnem, base_ops, 8) +} + +fn fmt_rlwinm(instr: &DecodedInstr) -> DisasmText { + let rs = instr.rs(); + let ra = instr.ra(); + let sh = instr.sh(); + let mb = instr.mb(); + let me = instr.me(); + let rc = rc_dot(instr); + let mnem = format!("rlwinm{rc}"); + let base_ops = format!("{}, {}, {sh}, {mb}, {me}", gpr(ra), gpr(rs)); + + // Priority-ordered simplified forms. + if sh > 0 && mb == 0 && me == 31 - sh { + let ext = format!("slwi{rc}"); + return with_ext(&mnem, base_ops, 8, &ext, format!("{}, {}, {sh}", gpr(ra), gpr(rs)), 8); + } + if sh > 0 && me == 31 && sh + mb == 32 { + let ext = format!("srwi{rc}"); + return with_ext(&mnem, base_ops, 8, &ext, format!("{}, {}, {}", gpr(ra), gpr(rs), 32 - sh), 8); + } + if sh > 0 && mb == 0 && me == 31 { + let ext = format!("rotlwi{rc}"); + return with_ext(&mnem, base_ops, 8, &ext, format!("{}, {}, {sh}", gpr(ra), gpr(rs)), 8); + } + if sh == 0 && me == 31 && mb > 0 { + let ext = format!("clrlwi{rc}"); + return with_ext(&mnem, base_ops, 8, &ext, format!("{}, {}, {mb}", gpr(ra), gpr(rs)), 8); + } + if sh == 0 && mb == 0 && me < 31 { + let ext = format!("clrrwi{rc}"); + return with_ext(&mnem, base_ops, 8, &ext, format!("{}, {}, {}", gpr(ra), gpr(rs), 31 - me), 8); + } + if mb == 0 && sh > 0 && me < 31 { + let n = me + 1; + let ext = format!("extlwi{rc}"); + return with_ext(&mnem, base_ops, 8, &ext, format!("{}, {}, {n}, {sh}", gpr(ra), gpr(rs)), 8); + } + if me == 31 && mb > 0 && sh > 0 { + let n = 32 - mb; + let b = sh.wrapping_sub(n) % 32; + let ext = format!("extrwi{rc}"); + return with_ext(&mnem, base_ops, 8, &ext, format!("{}, {}, {n}, {b}", gpr(ra), gpr(rs)), 8); + } + base(&mnem, base_ops, 8) +} + +fn fmt_rlwnm(instr: &DecodedInstr) -> DisasmText { + let rs = instr.rs(); + let ra = instr.ra(); + let rb = instr.rb(); + let mb = instr.mb(); + let me = instr.me(); + let rc = rc_dot(instr); + let mnem = format!("rlwnm{rc}"); + let base_ops = format!("{}, {}, {}, {mb}, {me}", gpr(ra), gpr(rs), gpr(rb)); + if mb == 0 && me == 31 { + let ext = format!("rotlw{rc}"); + return with_ext(&mnem, base_ops, 8, &ext, format!("{}, {}, {}", gpr(ra), gpr(rs), gpr(rb)), 8); + } + base(&mnem, base_ops, 8) +} + +// 64-bit MD/MDS-form rotate. +fn fmt_rldicl(instr: &DecodedInstr) -> DisasmText { + let rs = instr.rs(); + let ra = instr.ra(); + let rc = rc_dot(instr); + let sh = instr.sh64(); + let mb = mb_md(instr.raw); + let mnem = format!("rldicl{rc}"); + let base_ops = format!("{}, {}, {sh}, {mb}", gpr(ra), gpr(rs)); + if sh == 0 && mb > 0 { + let ext = format!("clrldi{rc}"); + return with_ext(&mnem, base_ops, 8, &ext, format!("{}, {}, {mb}", gpr(ra), gpr(rs)), 8); + } + if mb > 0 && sh == (64u32.wrapping_sub(mb)) & 63 { + let ext = format!("srdi{rc}"); + return with_ext(&mnem, base_ops, 8, &ext, format!("{}, {}, {mb}", gpr(ra), gpr(rs)), 8); + } + if sh > 0 && mb == 0 { + let ext = format!("rotldi{rc}"); + return with_ext(&mnem, base_ops, 8, &ext, format!("{}, {}, {sh}", gpr(ra), gpr(rs)), 8); + } + base(&mnem, base_ops, 8) +} + +fn fmt_rldicr(instr: &DecodedInstr) -> DisasmText { + let rs = instr.rs(); + let ra = instr.ra(); + let rc = rc_dot(instr); + let sh = instr.sh64(); + let me = mb_md(instr.raw); + let mnem = format!("rldicr{rc}"); + let base_ops = format!("{}, {}, {sh}, {me}", gpr(ra), gpr(rs)); + if sh > 0 && me == (63u32.wrapping_sub(sh)) & 63 { + let ext = format!("sldi{rc}"); + return with_ext(&mnem, base_ops, 8, &ext, format!("{}, {}, {sh}", gpr(ra), gpr(rs)), 8); + } + if sh == 0 && me < 63 { + let ext = format!("clrrdi{rc}"); + return with_ext(&mnem, base_ops, 8, &ext, format!("{}, {}, {}", gpr(ra), gpr(rs), 63 - me), 8); + } + base(&mnem, base_ops, 8) +} + +fn fmt_rldic(instr: &DecodedInstr) -> DisasmText { + let rs = instr.rs(); + let ra = instr.ra(); + let rc = rc_dot(instr); + let sh = instr.sh64(); + let mb = mb_md(instr.raw); + base(&format!("rldic{rc}"), format!("{}, {}, {sh}, {mb}", gpr(ra), gpr(rs)), 8) +} + +fn fmt_rldimi(instr: &DecodedInstr) -> DisasmText { + let rs = instr.rs(); + let ra = instr.ra(); + let rc = rc_dot(instr); + let sh = instr.sh64(); + let mb = mb_md(instr.raw); + let mnem = format!("rldimi{rc}"); + let base_ops = format!("{}, {}, {sh}, {mb}", gpr(ra), gpr(rs)); + if mb > 0 { + let n = (64u32.wrapping_sub(sh).wrapping_sub(mb)) & 63; + if n > 0 { + let ext = format!("insrdi{rc}"); + return with_ext(&mnem, base_ops, 8, &ext, format!("{}, {}, {n}, {mb}", gpr(ra), gpr(rs)), 8); + } + } + base(&mnem, base_ops, 8) +} + +fn fmt_rldcl(instr: &DecodedInstr) -> DisasmText { + let rs = instr.rs(); + let ra = instr.ra(); + let rb = instr.rb(); + let rc = rc_dot(instr); + let mb = mb_md(instr.raw); + let mnem = format!("rldcl{rc}"); + let base_ops = format!("{}, {}, {}, {mb}", gpr(ra), gpr(rs), gpr(rb)); + if mb == 0 { + let ext = format!("rotld{rc}"); + return with_ext(&mnem, base_ops, 8, &ext, format!("{}, {}, {}", gpr(ra), gpr(rs), gpr(rb)), 8); + } + base(&mnem, base_ops, 8) +} + +fn fmt_rldcr(instr: &DecodedInstr) -> DisasmText { + let rs = instr.rs(); + let ra = instr.ra(); + let rb = instr.rb(); + let rc = rc_dot(instr); + let me = mb_md(instr.raw); + base(&format!("rldcr{rc}"), format!("{}, {}, {}, {me}", gpr(ra), gpr(rs), gpr(rb)), 8) +} + +/// MD/MDS-form mb/me field: 6 bits packed as bits 21-25 + bit 26 (low bit). +#[inline] +fn mb_md(raw: u32) -> u32 { + let lo5 = (raw >> 6) & 0x1F; // bits 21-25 + let hi = (raw >> 5) & 0x1; // bit 26 + lo5 | (hi << 5) +} + +// XO-form ALU +fn fmt_xo_3op(instr: &DecodedInstr, mnem: &str) -> DisasmText { + let rt = instr.rd(); + let ra = instr.ra(); + let rb = instr.rb(); + let rc = rc_dot(instr); + let oe = if instr.oe() { "o" } else { "" }; + let full = format!("{mnem}{oe}{rc}"); + base(&full, format!("{}, {}, {}", gpr(rt), gpr(ra), gpr(rb)), 8) +} + +fn fmt_xo_2op(instr: &DecodedInstr, mnem: &str) -> DisasmText { + let rt = instr.rd(); + let ra = instr.ra(); + let rc = rc_dot(instr); + let oe = if instr.oe() { "o" } else { "" }; + let full = format!("{mnem}{oe}{rc}"); + base(&full, format!("{}, {}", gpr(rt), gpr(ra)), 8) +} + +fn fmt_xo_3op_no_oe(instr: &DecodedInstr, mnem: &str) -> DisasmText { + let rt = instr.rd(); + let ra = instr.ra(); + let rb = instr.rb(); + let rc = rc_dot(instr); + let oe = if instr.oe() { "o" } else { "" }; + let full = format!("{mnem}{oe}{rc}"); + base(&full, format!("{}, {}, {}", gpr(rt), gpr(ra), gpr(rb)), 8) +} + +fn fmt_xo_3op_rc_only(instr: &DecodedInstr, mnem: &str) -> DisasmText { + let rt = instr.rd(); + let ra = instr.ra(); + let rb = instr.rb(); + let rc = rc_dot(instr); + let full = format!("{mnem}{rc}"); + base(&full, format!("{}, {}, {}", gpr(rt), gpr(ra), gpr(rb)), 8) +} + +fn fmt_subf(instr: &DecodedInstr, base_mnem: &str, ext_mnem: &str) -> DisasmText { + let rt = instr.rd(); + let ra = instr.ra(); + let rb = instr.rb(); + let rc = rc_dot(instr); + let oe = if instr.oe() { "o" } else { "" }; + let bm = format!("{base_mnem}{oe}{rc}"); + let em = format!("{ext_mnem}{oe}{rc}"); + let bo = format!("{}, {}, {}", gpr(rt), gpr(ra), gpr(rb)); + let eo = format!("{}, {}, {}", gpr(rt), gpr(rb), gpr(ra)); + with_ext(&bm, bo, 8, &em, eo, 8) +} + +// X-form logical +fn fmt_x_logic(instr: &DecodedInstr, mnem: &str) -> DisasmText { + let rs = instr.rs(); + let ra = instr.ra(); + let rb = instr.rb(); + let rc = rc_dot(instr); + let full = format!("{mnem}{rc}"); + base(&full, format!("{}, {}, {}", gpr(ra), gpr(rs), gpr(rb)), 8) +} + +fn fmt_x_unary_rc(instr: &DecodedInstr, mnem: &str) -> DisasmText { + let rs = instr.rs(); + let ra = instr.ra(); + let rc = rc_dot(instr); + let full = format!("{mnem}{rc}"); + base(&full, format!("{}, {}", gpr(ra), gpr(rs)), 8) +} + +fn fmt_logic_and(instr: &DecodedInstr) -> DisasmText { + let rs = instr.rs(); + let ra = instr.ra(); + let rb = instr.rb(); + let rc = rc_dot(instr); + let bm = format!("and{rc}"); + let bo = format!("{}, {}, {}", gpr(ra), gpr(rs), gpr(rb)); + if rs == rb { + let em = format!("mr{rc}"); + with_ext(&bm, bo, 8, &em, format!("{}, {}", gpr(ra), gpr(rs)), 8) + } else { + base(&bm, bo, 8) + } +} + +fn fmt_logic_or(instr: &DecodedInstr) -> DisasmText { + let rs = instr.rs(); + let ra = instr.ra(); + let rb = instr.rb(); + let rc = rc_dot(instr); + let bm = format!("or{rc}"); + let bo = format!("{}, {}, {}", gpr(ra), gpr(rs), gpr(rb)); + if rs == rb { + let em = format!("mr{rc}"); + with_ext(&bm, bo, 8, &em, format!("{}, {}", gpr(ra), gpr(rs)), 8) + } else { + base(&bm, bo, 8) + } +} + +fn fmt_logic_nor(instr: &DecodedInstr) -> DisasmText { + let rs = instr.rs(); + let ra = instr.ra(); + let rb = instr.rb(); + let rc = rc_dot(instr); + let bm = format!("nor{rc}"); + let bo = format!("{}, {}, {}", gpr(ra), gpr(rs), gpr(rb)); + if rs == rb { + let em = format!("not{rc}"); + with_ext(&bm, bo, 8, &em, format!("{}, {}", gpr(ra), gpr(rs)), 8) + } else { + base(&bm, bo, 8) + } +} + +// Shift immediate +fn fmt_srawi(instr: &DecodedInstr) -> DisasmText { + let rs = instr.rs(); + let ra = instr.ra(); + let sh = instr.sh(); + let rc = rc_dot(instr); + base(&format!("srawi{rc}"), format!("{}, {}, {sh}", gpr(ra), gpr(rs)), 8) +} + +fn fmt_sradi(instr: &DecodedInstr) -> DisasmText { + let rs = instr.rs(); + let ra = instr.ra(); + let sh = instr.sh64(); + let rc = rc_dot(instr); + base(&format!("sradi{rc}"), format!("{}, {}, {sh}", gpr(ra), gpr(rs)), 8) +} + +// Special-purpose register moves +fn fmt_mfspr(instr: &DecodedInstr) -> DisasmText { + let rd = instr.rd(); + let spr = instr.spr(); + let base_ops = format!("{}, {}", gpr(rd), spr_name(spr)); + let ext = match spr { + 8 => Some(("mflr", format!("{}", gpr(rd)))), + 9 => Some(("mfctr", format!("{}", gpr(rd)))), + 1 => Some(("mfxer", format!("{}", gpr(rd)))), + _ => None, + }; + match ext { + Some((em, eo)) => with_ext("mfspr", base_ops, 8, em, eo, 8), + None => base("mfspr", base_ops, 8), + } +} + +fn fmt_mtspr(instr: &DecodedInstr) -> DisasmText { + let rs = instr.rs(); + let spr = instr.spr(); + let base_ops = format!("{}, {}", spr_name(spr), gpr(rs)); + let ext = match spr { + 8 => Some(("mtlr", format!("{}", gpr(rs)))), + 9 => Some(("mtctr", format!("{}", gpr(rs)))), + 1 => Some(("mtxer", format!("{}", gpr(rs)))), + _ => None, + }; + match ext { + Some((em, eo)) => with_ext("mtspr", base_ops, 8, em, eo, 8), + None => base("mtspr", base_ops, 8), + } +} + +fn fmt_mtcrf(instr: &DecodedInstr) -> DisasmText { + let rs = instr.rs(); + let fxm = (instr.raw >> 12) & 0xFF; + let bo = format!("0x{fxm:02X}, {}", gpr(rs)); + if fxm == 0xFF { + with_ext("mtcrf", bo, 8, "mtcr", gpr(rs), 8) + } else { + base("mtcrf", bo, 8) + } +} + +fn fmt_mftb(instr: &DecodedInstr) -> DisasmText { + let rd = instr.rd(); + let tbr = instr.spr(); + let base_ops = format!("{}, {tbr}", gpr(rd)); + match tbr { + 268 => with_ext("mftb", base_ops, 8, "mftb", gpr(rd), 8), + 269 => with_ext("mftb", base_ops, 8, "mftbu", gpr(rd), 8), + _ => base("mftb", base_ops, 8), + } +} + +// X-form indexed load/store. +fn fmt_x_load(instr: &DecodedInstr, mnem: &str, is_fpr: bool) -> DisasmText { + let rt = instr.rd(); + let ra = instr.ra(); + let rb = instr.rb(); + let rn = if is_fpr { fpr(rt) } else { gpr(rt) }; + base(mnem, format!("{rn}, {}, {}", gpr(ra), gpr(rb)), 8) +} + +fn fmt_x_store(instr: &DecodedInstr, mnem: &str, is_fpr: bool) -> DisasmText { + let rs = instr.rs(); + let ra = instr.ra(); + let rb = instr.rb(); + let rn = if is_fpr { fpr(rs) } else { gpr(rs) }; + base(mnem, format!("{rn}, {}, {}", gpr(ra), gpr(rb)), 8) +} + +fn fmt_lswi_stswi(instr: &DecodedInstr, mnem: &str) -> DisasmText { + let rt = instr.rd(); + let ra = instr.ra(); + let nb = instr.nb(); + base(mnem, format!("{}, {}, {nb}", gpr(rt), gpr(ra)), 8) +} + +fn fmt_cache(instr: &DecodedInstr, mnem: &str) -> DisasmText { + let ra = instr.ra(); + let rb = instr.rb(); + base(mnem, format!("{}, {}", gpr(ra), gpr(rb)), 8) +} + +// CR logical +fn fmt_cr_logic(instr: &DecodedInstr, mnem: &str) -> DisasmText { + let bt = instr.crbd(); + let ba = instr.crba(); + let bb = instr.crbb(); + base(mnem, format!("{}, {}, {}", crb(bt), crb(ba), crb(bb)), 8) +} + +fn fmt_crnor(instr: &DecodedInstr) -> DisasmText { + let bt = instr.crbd(); + let ba = instr.crba(); + let bb = instr.crbb(); + let bo = format!("{}, {}, {}", crb(bt), crb(ba), crb(bb)); + if ba == bb { + with_ext("crnor", bo, 8, "crnot", format!("{}, {}", crb(bt), crb(ba)), 8) + } else { + base("crnor", bo, 8) + } +} + +fn fmt_crxor(instr: &DecodedInstr) -> DisasmText { + let bt = instr.crbd(); + let ba = instr.crba(); + let bb = instr.crbb(); + let bo = format!("{}, {}, {}", crb(bt), crb(ba), crb(bb)); + if bt == ba && ba == bb { + with_ext("crxor", bo, 8, "crclr", crb(bt), 8) + } else { + base("crxor", bo, 8) + } +} + +fn fmt_creqv(instr: &DecodedInstr) -> DisasmText { + let bt = instr.crbd(); + let ba = instr.crba(); + let bb = instr.crbb(); + let bo = format!("{}, {}, {}", crb(bt), crb(ba), crb(bb)); + if bt == ba && ba == bb { + with_ext("creqv", bo, 8, "crset", crb(bt), 8) + } else { + base("creqv", bo, 8) + } +} + +fn fmt_cror(instr: &DecodedInstr) -> DisasmText { + let bt = instr.crbd(); + let ba = instr.crba(); + let bb = instr.crbb(); + let bo = format!("{}, {}, {}", crb(bt), crb(ba), crb(bb)); + if ba == bb { + with_ext("cror", bo, 8, "crmove", format!("{}, {}", crb(bt), crb(ba)), 8) + } else { + base("cror", bo, 8) + } +} + +// FPU +fn fmt_a_3op(instr: &DecodedInstr, mnem: &str, use_frc: bool) -> DisasmText { + let frt = instr.rd(); + let fra = instr.ra(); + let frb = instr.rb(); + let frc = instr.rc(); + let rc = rc_dot(instr); + let full = format!("{mnem}{rc}"); + let ops = if use_frc { + format!("{}, {}, {}", fpr(frt), fpr(fra), fpr(frc)) + } else { + format!("{}, {}, {}", fpr(frt), fpr(fra), fpr(frb)) + }; + base(&full, ops, 8) +} + +fn fmt_a_unary(instr: &DecodedInstr, mnem: &str) -> DisasmText { + let frt = instr.rd(); + let frb = instr.rb(); + let rc = rc_dot(instr); + base(&format!("{mnem}{rc}"), format!("{}, {}", fpr(frt), fpr(frb)), 8) +} + +fn fmt_a_4op(instr: &DecodedInstr, mnem: &str) -> DisasmText { + let frt = instr.rd(); + let fra = instr.ra(); + let frb = instr.rb(); + let frc = instr.rc(); + let rc = rc_dot(instr); + base(&format!("{mnem}{rc}"), + format!("{}, {}, {}, {}", fpr(frt), fpr(fra), fpr(frc), fpr(frb)), 8) +} + +fn fmt_fcmp(instr: &DecodedInstr, mnem: &str) -> DisasmText { + let bf = instr.crfd(); + let fra = instr.ra(); + let frb = instr.rb(); + base(mnem, format!("cr{bf}, {}, {}", fpr(fra), fpr(frb)), 8) +} + +fn fmt_x_fpu_unary(instr: &DecodedInstr, mnem: &str) -> DisasmText { + let frt = instr.rd(); + let frb = instr.rb(); + let rc = rc_dot(instr); + base(&format!("{mnem}{rc}"), format!("{}, {}", fpr(frt), fpr(frb)), 8) +} + +fn fmt_mtfsb(instr: &DecodedInstr, mnem: &str) -> DisasmText { + let bt = instr.crbd(); + let rc = rc_dot(instr); + base(&format!("{mnem}{rc}"), format!("{bt}"), 8) +} + +// VMX (5-bit registers). +fn fmt_vmx_3op(instr: &DecodedInstr, mnem: &str) -> DisasmText { + let vd = instr.rd(); + let va = instr.ra(); + let vb = instr.rb(); + base(mnem, format!("{}, {}, {}", vr(vd), vr(va), vr(vb)), 8) +} + +fn fmt_vmx_unary(instr: &DecodedInstr, mnem: &str) -> DisasmText { + let vd = instr.rd(); + let vb = instr.rb(); + base(mnem, format!("{}, {}", vr(vd), vr(vb)), 8) +} + +fn fmt_vmx_uimm(instr: &DecodedInstr, mnem: &str) -> DisasmText { + let vd = instr.rd(); + let vb = instr.rb(); + let uimm = instr.ra() as u32; + base(mnem, format!("{}, {}, {uimm}", vr(vd), vr(vb)), 8) +} + +fn fmt_vmx_simm(instr: &DecodedInstr, mnem: &str) -> DisasmText { + let vd = instr.rd(); + let simm = sign_ext(instr.ra() as u32, 5); + base(mnem, format!("{}, {simm}", vr(vd)), 9) +} + +fn fmt_vmx_cmp(instr: &DecodedInstr, mnem: &str) -> DisasmText { + let vd = instr.rd(); + let va = instr.ra(); + let vb = instr.rb(); + // Rc bit at position 22 (0-indexed from MSB) + let rc = if (instr.raw >> 10) & 1 != 0 { "." } else { "" }; + let full = format!("{mnem}{rc}"); + base(&full, format!("{}, {}, {}", vr(vd), vr(va), vr(vb)), 12) +} + +fn fmt_vmx_4op(instr: &DecodedInstr, mnem: &str) -> DisasmText { + let vd = instr.rd(); + let va = instr.ra(); + let vb = instr.rb(); + let vc = instr.rc(); + base(mnem, format!("{}, {}, {}, {}", vr(vd), vr(va), vr(vb), vr(vc)), 12) +} + +fn fmt_vmx_4op_swap(instr: &DecodedInstr, mnem: &str) -> DisasmText { + let vd = instr.rd(); + let va = instr.ra(); + let vb = instr.rb(); + let vc = instr.rc(); + base(mnem, format!("{}, {}, {}, {}", vr(vd), vr(va), vr(vc), vr(vb)), 9) +} + +fn fmt_vsldoi(instr: &DecodedInstr) -> DisasmText { + let vd = instr.rd(); + let va = instr.ra(); + let vb = instr.rb(); + let sh = (instr.raw >> 6) & 0xF; + base("vsldoi", format!("{}, {}, {}, {sh}", vr(vd), vr(va), vr(vb)), 8) +} + +fn fmt_vmx_ls(instr: &DecodedInstr, mnem: &str) -> DisasmText { + let vd = instr.rd(); + let ra = instr.ra(); + let rb = instr.rb(); + base(mnem, format!("{}, {}, {}", vr(vd), gpr(ra), gpr(rb)), 8) +} + +// VMX128 — uses canonical va128/vb128/vd128 accessors from decoder.rs. +// (Silently fixes the prior ppc.rs bug where these used wrong bit positions.) +fn fmt_vmx128_ls(instr: &DecodedInstr, mnem: &str) -> DisasmText { + let vd = instr.vd128(); + let ra = instr.ra(); + let rb = instr.rb(); + base(mnem, format!("{}, {}, {}", vr(vd), gpr(ra), gpr(rb)), 12) +} + +fn fmt_vmx128_3op(instr: &DecodedInstr, mnem: &str) -> DisasmText { + let vd = instr.vd128(); + let va = instr.va128(); + let vb = instr.vb128(); + base(mnem, format!("{}, {}, {}", vr(vd), vr(va), vr(vb)), 12) +} + +// VMX128 multiply-add forms (VX128_2): the addend is the VD register +// re-used, not a separate VC field. Operand order differs between +// `vmaddfp128` (VD, VA, VB, VD) and the `vmaddcfp128`/`vnmsubfp128` +// pair (VD, VA, VD, VB), per canary's authoritative formatters in +// xenia-canary/src/xenia/cpu/ppc/ppc_opcode_disasm_gen.cc. +fn fmt_vmaddfp128(instr: &DecodedInstr) -> DisasmText { + let vd = instr.vd128(); + let va = instr.va128(); + let vb = instr.vb128(); + base( + "vmaddfp128", + format!("{}, {}, {}, {}", vr(vd), vr(va), vr(vb), vr(vd)), + 12, + ) +} + +fn fmt_vmx128_madd_vd_vb(instr: &DecodedInstr, mnem: &str) -> DisasmText { + let vd = instr.vd128(); + let va = instr.va128(); + let vb = instr.vb128(); + base( + mnem, + format!("{}, {}, {}, {}", vr(vd), vr(va), vr(vd), vr(vb)), + 12, + ) +} + +fn fmt_vperm128(instr: &DecodedInstr) -> DisasmText { + let vd = instr.vd128(); + let va = instr.va128(); + let vb = instr.vb128(); + let vc = (instr.raw >> 6) & 0x7; + base("vperm128", format!("{}, {}, {}, {vc}", vr(vd), vr(va), vr(vb)), 9) +} + +fn fmt_vsldoi128(instr: &DecodedInstr) -> DisasmText { + let vd = instr.vd128(); + let va = instr.va128(); + let vb = instr.vb128(); + let sh = (instr.raw >> 6) & 0xF; + base("vsldoi128", format!("{}, {}, {}, {sh}", vr(vd), vr(va), vr(vb)), 10) +} + +fn fmt_vpermwi128(instr: &DecodedInstr) -> DisasmText { + let vd = instr.vd128(); + let vb = instr.vb128(); + // UIMM combines bits 11-15 (low 5) with bits 23-25 (upper 3). + let lo = (instr.raw >> 16) & 0x1F; + let hi = (instr.raw >> 6) & 0x7; + let uimm = lo | (hi << 5); + base("vpermwi128", format!("{}, {}, 0x{uimm:X}", vr(vd), vr(vb)), 11) +} + +fn fmt_vmx128_pack_d3d(instr: &DecodedInstr, mnem: &str) -> DisasmText { + let vd = instr.vd128(); + let vb = instr.vb128(); + let imm = (instr.raw >> 16) & 0x1F; + let z = (instr.raw >> 6) & 0x3; + base(mnem, format!("{}, {}, {imm}, {z}", vr(vd), vr(vb)), 10) +} + +fn fmt_vmx128_unary(instr: &DecodedInstr, mnem: &str) -> DisasmText { + let vd = instr.vd128(); + let vb = instr.vb128(); + base(mnem, format!("{}, {}", vr(vd), vr(vb)), 12) +} + +fn fmt_vmx128_uimm(instr: &DecodedInstr, mnem: &str) -> DisasmText { + let vd = instr.vd128(); + let vb = instr.vb128(); + let uimm = extract_vx128_uimm5(instr.raw); + base(mnem, format!("{}, {}, {uimm}", vr(vd), vr(vb)), 12) +} + +fn fmt_vmx128_cmp(instr: &DecodedInstr, mnem: &str) -> DisasmText { + let vd = instr.vd128(); + let va = instr.va128(); + let vb = instr.vb128(); + // Rc bit at position 25 in VMX128 cmp form. + let rc = if (instr.raw >> 6) & 1 != 0 { "." } else { "" }; + let full = format!("{mnem}{rc}"); + base(&full, format!("{}, {}, {}", vr(vd), vr(va), vr(vb)), 14) +} + #[cfg(test)] mod tests { use super::*; use crate::decoder::decode; #[test] - fn test_disasm_nop() { - // ori r0, r0, 0 = NOP + fn nop_collapses_via_extended() { let instr = decode(0x60000000, 0); - let text = disassemble(&instr); - assert!(text.contains("ori"), "Expected 'ori', got: {}", text); + let t = format(&instr); + assert_eq!(t.mnemonic, "ori"); + assert_eq!(t.ext_mnemonic.as_deref(), Some("nop")); + assert_eq!(t.display(), "nop"); } #[test] - fn test_disasm_addi() { - let raw = (14u32 << 26) | (3 << 21) | (1 << 16) | 16; + fn addi_to_li_when_ra_zero() { + // addi r3, r0, 16 + let raw = (14u32 << 26) | (3 << 21) | (0 << 16) | 16; let instr = decode(raw, 0); - let text = disassemble(&instr); - assert!(text.contains("addi"), "Got: {}", text); - assert!(text.contains("r3"), "Got: {}", text); + let t = format(&instr); + assert_eq!(t.mnemonic, "addi"); + assert_eq!(t.ext_mnemonic.as_deref(), Some("li")); + assert_eq!(t.ext_operands.as_deref(), Some("r3, 16")); + } + + #[test] + fn rlwinm_dot_preserves_record_bit() { + // Same pattern as the Sylpheed graphics-callback test: + // rlwinm. r11, r11, 0, 31, 31 with Rc=1 + let raw = (21u32 << 26) | (11 << 21) | (11 << 16) + | (0 << 11) | (31 << 6) | (31 << 1) | 1; + let instr = decode(raw, 0); + let t = format(&instr); + assert!(t.disasm.starts_with("rlwinm."), "got: {}", t.disasm); + } + + #[test] + fn rlwinm_no_dot_when_rc_unset() { + let raw = (21u32 << 26) | (11 << 21) | (11 << 16) + | (0 << 11) | (31 << 6) | (31 << 1); + let instr = decode(raw, 0); + let t = format(&instr); + assert_eq!(t.mnemonic, "rlwinm"); + assert!(!t.mnemonic.ends_with('.')); + } + + #[test] + fn or_with_same_source_is_mr() { + // or r3, r4, r4 → mr r3, r4 + let raw = (31u32 << 26) | (4 << 21) | (3 << 16) | (4 << 11) | (444 << 1); + let instr = decode(raw, 0); + let t = format(&instr); + assert_eq!(t.ext_mnemonic.as_deref(), Some("mr")); + assert_eq!(t.ext_operands.as_deref(), Some("r3, r4")); + } + + #[test] + fn unconditional_branch_resolves_target() { + // b +0x100 with addr=0x82000000 + let raw = (18u32 << 26) | (0x40 << 2); + let instr = decode(raw, 0x82000000); + let t = format(&instr); + assert_eq!(t.mnemonic, "b"); + assert_eq!(t.branch_target, Some(0x82000100)); + assert_eq!(t.operands, "0x82000100"); + } + + #[test] + fn bclr_unconditional_is_blr() { + // bclr 20, 0 + let raw = (19u32 << 26) | (20 << 21) | (0 << 16) | (16 << 1); + let instr = decode(raw, 0); + let t = format(&instr); + assert_eq!(t.ext_mnemonic.as_deref(), Some("blr")); + } + + #[test] + fn back_compat_disassemble_returns_display() { + let instr = decode(0x60000000, 0); + assert_eq!(disassemble(&instr), "nop"); + } + + #[test] + fn iter_disasm_walks_byte_slice_in_order() { + // Three instructions at 0x82000000: nop, addi r3,r0,16, b +0x100. + let mut bytes = Vec::new(); + bytes.extend_from_slice(&0x60000000u32.to_be_bytes()); // nop + bytes.extend_from_slice(&((14u32 << 26) | (3 << 21) | (0 << 16) | 16).to_be_bytes()); // addi + bytes.extend_from_slice(&((18u32 << 26) | (0x40 << 2)).to_be_bytes()); // b +0x100 + + let items: Vec<_> = super::iter_disasm(&bytes, 0x82000000, 0x82000000, 0x82000000 + 12) + .collect(); + assert_eq!(items.len(), 3); + assert_eq!(items[0].addr, 0x82000000); + assert_eq!(items[0].text.ext_mnemonic.as_deref(), Some("nop")); + assert_eq!(items[1].addr, 0x82000004); + assert_eq!(items[1].text.ext_mnemonic.as_deref(), Some("li")); + assert_eq!(items[2].addr, 0x82000008); + assert_eq!(items[2].text.branch_target, Some(0x82000108)); + } + + #[test] + fn iter_disasm_stops_on_truncated_tail() { + // 6 bytes — one full instruction + 2 dangling. Iterator must yield exactly 1. + let mut bytes = Vec::new(); + bytes.extend_from_slice(&0x60000000u32.to_be_bytes()); + bytes.push(0x60); bytes.push(0x00); + + let items: Vec<_> = super::iter_disasm(&bytes, 0, 0, 6).collect(); + assert_eq!(items.len(), 1); } } diff --git a/crates/xenia-cpu/src/fpscr.rs b/crates/xenia-cpu/src/fpscr.rs new file mode 100644 index 0000000..1d05e67 --- /dev/null +++ b/crates/xenia-cpu/src/fpscr.rs @@ -0,0 +1,384 @@ +//! FPSCR (Floating-Point Status and Control Register) maintenance. +//! +//! Scope per project plan: rounding modes honoured, plus the exception bits +//! games actually read (FX, FEX, VX, OX, UX, ZX, XX, FI, FPRF). Enabled- +//! exception dispatch (FE[0,1], VE/OE/UE/ZE/XE) is *not* modelled — games +//! running on Xenon almost never take FP traps. +//! +//! Bit layout (PowerISA, MSB-0 numbering; stored in a u32 with bit 31 = MSB): +//! +//! | PPC bit | u32 mask | Name | +//! |---------|-------------------------|-------------| +//! | 0 | `1<<31` | FX | +//! | 1 | `1<<30` | FEX | +//! | 2 | `1<<29` | VX (summary)| +//! | 3 | `1<<28` | OX | +//! | 4 | `1<<27` | UX | +//! | 5 | `1<<26` | ZX | +//! | 6 | `1<<25` | XX | +//! | 7 | `1<<24` | VXSNAN | +//! | 8 | `1<<23` | VXISI | +//! | 9 | `1<<22` | VXIDI | +//! | 10 | `1<<21` | VXZDZ | +//! | 11 | `1<<20` | VXIMZ | +//! | 12 | `1<<19` | VXVC | +//! | 13 | `1<<18` | FR | +//! | 14 | `1<<17` | FI | +//! | 15..19 | `0xF8000 >> 15` @ 15..19 | FPRF (5 bits)| +//! | 21 | `1<<10` | VXSOFT | +//! | 22 | `1<<9` | VXSQRT | +//! | 23 | `1<<8` | VXCVI | +//! | 30..31 | `0x3` | RN (2 bits) | + +use crate::context::PpcContext; + +pub const FX: u32 = 1 << 31; +pub const FEX: u32 = 1 << 30; +pub const VX: u32 = 1 << 29; +pub const OX: u32 = 1 << 28; +pub const UX: u32 = 1 << 27; +pub const ZX: u32 = 1 << 26; +pub const XX: u32 = 1 << 25; +pub const VXSNAN: u32 = 1 << 24; +pub const VXISI: u32 = 1 << 23; +pub const VXIDI: u32 = 1 << 22; +pub const VXZDZ: u32 = 1 << 21; +pub const VXIMZ: u32 = 1 << 20; +pub const VXVC: u32 = 1 << 19; +pub const FR: u32 = 1 << 18; +pub const FI: u32 = 1 << 17; +pub const FPRF_MASK: u32 = 0x1F << 12; // bits 15..19 +pub const VXSOFT: u32 = 1 << 10; +pub const VXSQRT: u32 = 1 << 9; +pub const VXCVI: u32 = 1 << 8; +pub const RN_MASK: u32 = 0x3; + +/// Union of all VX* bits (used for the VX summary recomputation). +pub const VX_ALL: u32 = VXSNAN | VXISI | VXIDI | VXZDZ | VXIMZ | VXVC | VXSOFT | VXSQRT | VXCVI; + +/// FPRF classification codes (5-bit, placed in FPSCR bits 15..19). +/// The high bit ("C" in PowerISA) distinguishes ±zero/±denormal/QNaN from +/// ±normal/±inf. The next 4 bits are (FL, FG, FE, FU) = (less, greater, equal, unordered). +pub mod fprf { + pub const QNAN: u8 = 0b1_0001; + pub const NEG_INF: u8 = 0b0_1001; + pub const NEG_NORMAL: u8 = 0b0_1000; + pub const NEG_DENORMAL: u8 = 0b1_1000; + pub const NEG_ZERO: u8 = 0b1_0010; + pub const POS_ZERO: u8 = 0b0_0010; + pub const POS_DENORMAL: u8 = 0b1_0100; + pub const POS_NORMAL: u8 = 0b0_0100; + pub const POS_INF: u8 = 0b0_0101; +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum RoundingMode { + NearestEven, // RN=00 + TowardZero, // RN=01 + TowardPosInf, // RN=10 + TowardNegInf, // RN=11 +} + +pub fn rounding_mode(ctx: &PpcContext) -> RoundingMode { + match ctx.fpscr & RN_MASK { + 0 => RoundingMode::NearestEven, + 1 => RoundingMode::TowardZero, + 2 => RoundingMode::TowardPosInf, + _ => RoundingMode::TowardNegInf, + } +} + +/// Classify a finite f64 into its FPRF 5-bit code. +pub fn classify_fprf(v: f64) -> u8 { + if v.is_nan() { + fprf::QNAN + } else if v.is_infinite() { + if v.is_sign_negative() { fprf::NEG_INF } else { fprf::POS_INF } + } else if v == 0.0 { + if v.is_sign_negative() { fprf::NEG_ZERO } else { fprf::POS_ZERO } + } else if v.is_subnormal() { + if v.is_sign_negative() { fprf::NEG_DENORMAL } else { fprf::POS_DENORMAL } + } else if v.is_sign_negative() { fprf::NEG_NORMAL } else { fprf::POS_NORMAL } +} + +/// Write FPRF into FPSCR, preserving other bits. +pub fn set_fprf(ctx: &mut PpcContext, code: u8) { + ctx.fpscr = (ctx.fpscr & !FPRF_MASK) | ((code as u32 & 0x1F) << 12); +} + +/// Set one or more exception bits on FPSCR, maintaining FX (sticky set on any +/// new exception) and VX (summary of VX* bits). +pub fn set_exception(ctx: &mut PpcContext, bits: u32) { + let prev = ctx.fpscr; + let new = prev | bits; + // FX is sticky-set if any new non-sticky bit transitions to 1. PPC defines + // FX as "any of OX, UX, ZX, XX, VX* newly set". Compute the transition set. + let transition = (new & !prev) & (OX | UX | ZX | XX | VX_ALL); + let mut updated = new; + if transition != 0 { + updated |= FX; + } + // Recompute VX summary from any VX* bits currently set. + if (updated & VX_ALL) != 0 { updated |= VX; } + ctx.fpscr = updated; +} + +/// Classify the inputs of a floating-point arithmetic op and set appropriate +/// VX* bits. Returns true if any invalid-operation was detected (caller may +/// want to write a default QNaN result). +/// +/// Detected cases: +/// * any SNaN input → VXSNAN +/// * infinity - infinity (same sign) → VXISI +/// * 0 / 0 → VXZDZ +/// * infinity / infinity → VXIDI +/// * 0 * infinity → VXIMZ +pub fn check_invalid_add(ctx: &mut PpcContext, a: f64, b: f64, sub: bool) -> bool { + let mut bits = 0u32; + if is_snan(a) || is_snan(b) { bits |= VXSNAN; } + if a.is_infinite() && b.is_infinite() { + // For add: VXISI iff same-sign(a,b) negated — inf - inf + // For sub: VXISI iff same-sign(a,b) — (+inf) - (+inf) or (-inf) - (-inf) + let both_pos = a.is_sign_positive() && b.is_sign_positive(); + let both_neg = a.is_sign_negative() && b.is_sign_negative(); + if sub { + if both_pos || both_neg { bits |= VXISI; } + } else { + // add: opposite signs cancel to inf-inf + if a.is_sign_positive() != b.is_sign_positive() { bits |= VXISI; } + } + } + if bits != 0 { set_exception(ctx, bits); return true; } + false +} + +pub fn check_invalid_mul(ctx: &mut PpcContext, a: f64, b: f64) -> bool { + let mut bits = 0u32; + if is_snan(a) || is_snan(b) { bits |= VXSNAN; } + let zero_times_inf = + (a == 0.0 && b.is_infinite()) || (b == 0.0 && a.is_infinite()); + if zero_times_inf { bits |= VXIMZ; } + if bits != 0 { set_exception(ctx, bits); return true; } + false +} + +pub fn check_invalid_div(ctx: &mut PpcContext, a: f64, b: f64) -> bool { + let mut bits = 0u32; + if is_snan(a) || is_snan(b) { bits |= VXSNAN; } + if a == 0.0 && b == 0.0 { bits |= VXZDZ; } + if a.is_infinite() && b.is_infinite() { bits |= VXIDI; } + if bits != 0 { set_exception(ctx, bits); return true; } + false +} + +/// Divide-by-zero (finite nonzero / 0) — sets ZX but not VX. +pub fn check_zero_divide(ctx: &mut PpcContext, a: f64, b: f64) { + if b == 0.0 && a != 0.0 && !a.is_nan() && !a.is_infinite() { + set_exception(ctx, ZX); + } +} + +/// Post-op: classify the result and update FPRF + detect overflow/underflow/inexact. +/// `inputs_finite` lets us suppress OX for ops whose output is infinite because +/// an input already was. +pub fn update_after_op(ctx: &mut PpcContext, result: f64, inputs_were_finite: bool) { + let mut bits = 0u32; + if result.is_infinite() && inputs_were_finite { + bits |= OX; + } + if result.is_subnormal() { + bits |= UX; + } + if bits != 0 { set_exception(ctx, bits); } + set_fprf(ctx, classify_fprf(result)); +} + +/// Test whether an f64 is a signalling NaN. +/// In IEEE 754-2008 (binary64), the signalling bit is the high bit of the +/// mantissa. SNaN has it clear, QNaN has it set. NaN with high mantissa bit +/// clear (and mantissa nonzero) is an SNaN. +pub fn is_snan(x: f64) -> bool { + if !x.is_nan() { return false; } + let bits = x.to_bits(); + // Highest mantissa bit (bit 51) clear ⇒ SNaN. Mantissa nonzero always true for NaN. + (bits & (1u64 << 51)) == 0 +} + +/// Round an f64 to f32 honouring FPSCR[RN]. Uses the current hardware +/// rounding mode when RN=0 (nearest-even, the PPC default), otherwise +/// emulates the directed rounding via bit-manipulation. +pub fn round_to_single(ctx: &PpcContext, v: f64) -> f64 { + match rounding_mode(ctx) { + RoundingMode::NearestEven => (v as f32) as f64, + RoundingMode::TowardZero => round_single_toward_zero(v) as f64, + RoundingMode::TowardPosInf => round_single_toward_pos_inf(v) as f64, + RoundingMode::TowardNegInf => round_single_toward_neg_inf(v) as f64, + } +} + +/// Round an f64 to an i64 integer honouring FPSCR[RN]. Used by fctidx. +pub fn round_to_i64(ctx: &PpcContext, v: f64) -> i64 { + match rounding_mode(ctx) { + RoundingMode::NearestEven => { + // Round-half-to-even (banker's rounding). + let r = v.round(); + // Rust's f64::round is round-half-away-from-zero. Correct ties to even: + let diff = (v - v.trunc()).abs(); + if (diff - 0.5).abs() < f64::EPSILON { + let floor = v.floor(); + if (floor as i64) & 1 == 0 { floor as i64 } else { v.ceil() as i64 } + } else { + r as i64 + } + } + RoundingMode::TowardZero => v.trunc() as i64, + RoundingMode::TowardPosInf => v.ceil() as i64, + RoundingMode::TowardNegInf => v.floor() as i64, + } +} + +/// Round an f64 to an i32 integer honouring FPSCR[RN]. Used by fctiwx. +pub fn round_to_i32(ctx: &PpcContext, v: f64) -> i32 { + round_to_i64(ctx, v).clamp(i32::MIN as i64, i32::MAX as i64) as i32 +} + +// ------ directed rounding helpers (f64 → f32) ------ + +fn round_single_toward_zero(v: f64) -> f32 { + // Default f64→f32 is round-to-nearest-even. Emulate truncation: + // take the default rounded value; if the absolute rounded magnitude + // exceeds |v|, bump down by one ULP toward zero. + let rn = v as f32; + if rn.is_nan() || rn.is_infinite() || rn == 0.0 { return rn; } + if rn.abs() as f64 <= v.abs() { return rn; } + let adj_bits = rn.to_bits(); + let lower = if rn.is_sign_positive() { adj_bits - 1 } else { adj_bits - 1 }; + f32::from_bits(lower) +} + +fn round_single_toward_pos_inf(v: f64) -> f32 { + let rn = v as f32; + if rn.is_nan() || rn.is_infinite() { return rn; } + if (rn as f64) >= v { return rn; } + // rn < v — bump up by one ULP in the +direction. + let b = rn.to_bits(); + let nb = if rn.is_sign_negative() { b - 1 } else { b + 1 }; + f32::from_bits(nb) +} + +fn round_single_toward_neg_inf(v: f64) -> f32 { + let rn = v as f32; + if rn.is_nan() || rn.is_infinite() { return rn; } + if (rn as f64) <= v { return rn; } + // rn > v — bump down. + let b = rn.to_bits(); + let nb = if rn.is_sign_negative() { b + 1 } else { b - 1 }; + f32::from_bits(nb) +} + +/// Drop-in replacement for the old `update_cr1_from_fpscr`. Reads the +/// currently-maintained FPSCR bits (FX, FEX, VX, OX) into CR1. +pub fn update_cr1(ctx: &mut PpcContext) { + ctx.cr[1].lt = (ctx.fpscr & FX) != 0; + ctx.cr[1].gt = (ctx.fpscr & FEX) != 0; + ctx.cr[1].eq = (ctx.fpscr & VX) != 0; + ctx.cr[1].so = (ctx.fpscr & OX) != 0; +} + +#[cfg(test)] +mod tests { + use super::*; + + fn ctx() -> PpcContext { PpcContext::new() } + + #[test] + fn rn_default_is_nearest() { + assert_eq!(rounding_mode(&ctx()), RoundingMode::NearestEven); + } + + #[test] + fn rn_bits_decode() { + let mut c = ctx(); + c.fpscr = 0x1; + assert_eq!(rounding_mode(&c), RoundingMode::TowardZero); + c.fpscr = 0x2; + assert_eq!(rounding_mode(&c), RoundingMode::TowardPosInf); + c.fpscr = 0x3; + assert_eq!(rounding_mode(&c), RoundingMode::TowardNegInf); + } + + #[test] + fn fprf_classifies_correctly() { + assert_eq!(classify_fprf(1.0), fprf::POS_NORMAL); + assert_eq!(classify_fprf(-1.0), fprf::NEG_NORMAL); + assert_eq!(classify_fprf(0.0), fprf::POS_ZERO); + assert_eq!(classify_fprf(-0.0), fprf::NEG_ZERO); + assert_eq!(classify_fprf(f64::INFINITY), fprf::POS_INF); + assert_eq!(classify_fprf(f64::NEG_INFINITY), fprf::NEG_INF); + assert_eq!(classify_fprf(f64::NAN), fprf::QNAN); + assert_eq!(classify_fprf(f64::from_bits(1)), fprf::POS_DENORMAL); + } + + #[test] + fn fx_is_sticky_on_new_exception() { + let mut c = ctx(); + set_exception(&mut c, OX); + assert_ne!(c.fpscr & FX, 0); + // Clear FX/OX manually. + c.fpscr &= !(FX | OX); + // Re-set OX; FX should re-latch. + set_exception(&mut c, OX); + assert_ne!(c.fpscr & FX, 0); + } + + #[test] + fn vx_summary_set_on_any_vx_bit() { + let mut c = ctx(); + set_exception(&mut c, VXSNAN); + assert_ne!(c.fpscr & VX, 0); + assert_ne!(c.fpscr & VXSNAN, 0); + } + + #[test] + fn round_to_single_nearest_is_identity_on_representable() { + let c = ctx(); + assert_eq!(round_to_single(&c, 1.0_f64), 1.0_f64); + } + + #[test] + fn round_to_i32_clamps_out_of_range() { + let c = ctx(); + assert_eq!(round_to_i32(&c, 1e20_f64), i32::MAX); + assert_eq!(round_to_i32(&c, -1e20_f64), i32::MIN); + } + + #[test] + fn round_to_i64_nearest_even_on_tie() { + let c = ctx(); + assert_eq!(round_to_i64(&c, 2.5_f64), 2); + assert_eq!(round_to_i64(&c, 3.5_f64), 4); + assert_eq!(round_to_i64(&c, -2.5_f64), -2); + } + + #[test] + fn check_invalid_add_detects_inf_minus_inf() { + let mut c = ctx(); + assert!(check_invalid_add(&mut c, f64::INFINITY, f64::INFINITY, true)); + assert_ne!(c.fpscr & VXISI, 0); + } + + #[test] + fn check_invalid_div_detects_zero_over_zero() { + let mut c = ctx(); + assert!(check_invalid_div(&mut c, 0.0, 0.0)); + assert_ne!(c.fpscr & VXZDZ, 0); + } + + #[test] + fn snan_detection() { + // SNaN in binary64: sign=0, exp=all-ones, mantissa nonzero with bit 51 clear. + let snan = f64::from_bits(0x7FF0_0000_0000_0001); + assert!(is_snan(snan)); + assert!(!is_snan(f64::NAN)); + } +} diff --git a/crates/xenia-cpu/src/interpreter.rs b/crates/xenia-cpu/src/interpreter.rs index 02c4bd7..4d26a97 100644 --- a/crates/xenia-cpu/src/interpreter.rs +++ b/crates/xenia-cpu/src/interpreter.rs @@ -4,9 +4,17 @@ use crate::context::PpcContext; use crate::decoder::{decode, DecodedInstr}; +use crate::fpscr; use crate::opcode::PpcOpcode; +use crate::overflow; +use crate::trap; +use crate::vmx; use xenia_memory::MemoryAccess; +/// Xenon reservation granule: one L2 cache line (128 bytes). +/// `reserved_line = ea & !RESERVATION_MASK` in [context::PpcContext]. +pub const RESERVATION_MASK: u32 = 0x7F; + /// Result of executing a single instruction. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum StepResult { @@ -23,7 +31,7 @@ pub enum StepResult { } /// Execute a single PPC instruction. -pub fn step(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess) -> StepResult { +pub fn step(ctx: &mut PpcContext, mem: &dyn MemoryAccess) -> StepResult { let raw = mem.read_u32(ctx.pc); let instr = decode(raw, ctx.pc); @@ -35,8 +43,72 @@ pub fn step(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess) -> StepResult { result } +/// Tier-2 perf — same semantics as [`step`], but looks the decoded +/// instruction up in a PC-keyed cache first. Misses fill the cache from +/// a fresh [`decode`] call; writes to the containing guest page bump +/// `page_version` and naturally invalidate the entry. +/// +/// The cache is shared across all HW threads — PC is thread-independent +/// and `DecodeCacheEntry` stays put after fill. `current_page_version` +/// is wired through the caller since memory is touched just above anyway +/// (the `read_u32` + the version read amortize to one touch of the page +/// table). Use `GuestMemory::page_version(pc)` to source it. +pub fn step_cached( + ctx: &mut PpcContext, + mem: &dyn MemoryAccess, + cache: &mut crate::decoder::DecodeCache, + current_page_version: u64, +) -> StepResult { + let raw = mem.read_u32(ctx.pc); + let instr = cache.lookup(ctx.pc, raw, current_page_version); + + let result = execute(ctx, mem, &instr); + + ctx.cycle_count += 1; + ctx.timebase += 1; + + result +} + +/// Tier-4 perf — execute every instruction in a pre-decoded +/// [`crate::block_cache::DecodedBlock`], bumping `cycle_count` and +/// `timebase` once per executed instruction. Bails out as soon as a +/// non-`Continue` step result fires (system call, trap, halt, or +/// unimplemented opcode), or when an instruction unexpectedly changes +/// the PC mid-block (defensive — only the terminator at the tail of +/// the block is allowed to do that). +/// +/// Caller (in `xenia-app/src/main.rs`) is responsible for choosing this +/// path only when **no per-instruction observation is requested** — +/// i.e., `Debugger::wants_hooks() == false` and no `--trace-*` flag is +/// active. Once those gates flip, the caller falls back to +/// [`step_cached`] so every PC remains observable. +pub fn step_block( + ctx: &mut PpcContext, + mem: &dyn MemoryAccess, + block: &crate::block_cache::DecodedBlock, +) -> StepResult { + let mut result = StepResult::Continue; + for instr in &block.instrs { + let expected_next = instr.addr.wrapping_add(4); + result = execute(ctx, mem, instr); + ctx.cycle_count += 1; + ctx.timebase += 1; + if !matches!(result, StepResult::Continue) { + return result; + } + // PC discontinuity within a block. By construction only the + // terminator (last instruction) can branch — and when it does, + // we want to stop here, not continue executing past it. + if ctx.pc != expected_next { + break; + } + } + result +} + /// Execute a decoded instruction, updating context and memory. -fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInstr) -> StepResult { +fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -> StepResult { match instr.opcode { // ===== ALU: Immediate ===== PpcOpcode::addi => { @@ -45,8 +117,17 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst ctx.pc += 4; } PpcOpcode::addis => { - let ra_val = if instr.ra() == 0 { 0 } else { ctx.gpr[instr.ra()] }; - ctx.gpr[instr.rd()] = ra_val.wrapping_add((instr.simm16() as i64 as u64) << 16); + // Xbox 360 user mode is 32-bit ABI (MSR.SF=0), so addis must + // produce a value whose upper 32 bits don't pollute downstream + // 64-bit arithmetic. The PPC ISA in 64-bit mode sign-extends + // simm16 before the shift, producing 0xFFFFFFFF_xxxx0000 for + // negative simm16 (high bit set). When this value flows into + // a 64-bit subfc against a zero-extended lwz value, the unsigned + // 64-bit comparison yields wrong CA. Truncate to 32 bits to + // simulate 32-bit ABI behavior. + let ra_val = if instr.ra() == 0 { 0u64 } else { ctx.gpr[instr.ra()] }; + let result = ra_val.wrapping_add((instr.simm16() as i64 as u64) << 16); + ctx.gpr[instr.rd()] = result as u32 as u64; ctx.pc += 4; } PpcOpcode::addic => { @@ -64,7 +145,7 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst ctx.xer_ca = if result < ra { 1 } else { 0 }; ctx.gpr[instr.rd()] = result; // Update CR0 - ctx.update_cr_signed(0, result as i32 as i64); + ctx.update_cr_signed(0, result as i64); ctx.pc += 4; } PpcOpcode::subficx => { @@ -89,10 +170,10 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst let result = ra.wrapping_add(rb); ctx.gpr[instr.rd()] = result; if instr.oe() { - // TODO: overflow detection + overflow::apply(ctx, overflow::add_ov_64(ra, rb, result)); } if instr.rc_bit() { - ctx.update_cr_signed(0, result as i32 as i64); + ctx.update_cr_signed(0, result as i64); } ctx.pc += 4; } @@ -102,8 +183,11 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst let result = ra.wrapping_add(rb); ctx.xer_ca = if result < ra { 1 } else { 0 }; ctx.gpr[instr.rd()] = result; + if instr.oe() { + overflow::apply(ctx, overflow::add_ov_64(ra, rb, result)); + } if instr.rc_bit() { - ctx.update_cr_signed(0, result as i32 as i64); + ctx.update_cr_signed(0, result as i64); } ctx.pc += 4; } @@ -114,8 +198,12 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst let result = ra.wrapping_add(rb).wrapping_add(ca); ctx.xer_ca = if result < ra || (ca != 0 && result == ra) { 1 } else { 0 }; ctx.gpr[instr.rd()] = result; + if instr.oe() { + let true_sum = (ra as i64 as i128) + (rb as i64 as i128) + (ca as i128); + overflow::apply(ctx, overflow::sum_overflow_64(true_sum, result)); + } if instr.rc_bit() { - ctx.update_cr_signed(0, result as i32 as i64); + ctx.update_cr_signed(0, result as i64); } ctx.pc += 4; } @@ -125,8 +213,12 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst let result = ra.wrapping_add(ca); ctx.xer_ca = if result < ra { 1 } else { 0 }; ctx.gpr[instr.rd()] = result; + if instr.oe() { + let true_sum = (ra as i64 as i128) + (ca as i128); + overflow::apply(ctx, overflow::sum_overflow_64(true_sum, result)); + } if instr.rc_bit() { - ctx.update_cr_signed(0, result as i32 as i64); + ctx.update_cr_signed(0, result as i64); } ctx.pc += 4; } @@ -136,8 +228,13 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst let result = ra.wrapping_add(ca).wrapping_sub(1); ctx.xer_ca = if ra != 0 || ca != 0 { 1 } else { 0 }; ctx.gpr[instr.rd()] = result; + if instr.oe() { + // RT <- RA + CA + (-1) + let true_sum = (ra as i64 as i128) + (ca as i128) - 1; + overflow::apply(ctx, overflow::sum_overflow_64(true_sum, result)); + } if instr.rc_bit() { - ctx.update_cr_signed(0, result as i32 as i64); + ctx.update_cr_signed(0, result as i64); } ctx.pc += 4; } @@ -146,8 +243,11 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst let rb = ctx.gpr[instr.rb()]; let result = rb.wrapping_sub(ra); ctx.gpr[instr.rd()] = result; + if instr.oe() { + overflow::apply(ctx, overflow::sub_ov_64(ra, rb, result)); + } if instr.rc_bit() { - ctx.update_cr_signed(0, result as i32 as i64); + ctx.update_cr_signed(0, result as i64); } ctx.pc += 4; } @@ -157,8 +257,11 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst let result = rb.wrapping_sub(ra); ctx.xer_ca = if rb >= ra { 1 } else { 0 }; ctx.gpr[instr.rd()] = result; + if instr.oe() { + overflow::apply(ctx, overflow::sub_ov_64(ra, rb, result)); + } if instr.rc_bit() { - ctx.update_cr_signed(0, result as i32 as i64); + ctx.update_cr_signed(0, result as i64); } ctx.pc += 4; } @@ -169,8 +272,13 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst let result = (!ra).wrapping_add(rb).wrapping_add(ca); ctx.xer_ca = if rb > ra || (rb == ra && ca != 0) { 1 } else { 0 }; ctx.gpr[instr.rd()] = result; + if instr.oe() { + // RT <- !RA + RB + CA == RB - RA - 1 + CA + let true_sum = (rb as i64 as i128) - (ra as i64 as i128) - 1 + (ca as i128); + overflow::apply(ctx, overflow::sum_overflow_64(true_sum, result)); + } if instr.rc_bit() { - ctx.update_cr_signed(0, result as i32 as i64); + ctx.update_cr_signed(0, result as i64); } ctx.pc += 4; } @@ -178,10 +286,17 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst let ra = ctx.gpr[instr.ra()]; let ca = ctx.xer_ca as u64; let result = (!ra).wrapping_add(ca); - ctx.xer_ca = if !ra != 0 || ca != 0 { 1 } else { 0 }; + // RT <- !RA + CA (no -1 term). 64-bit carry-out only when + // !RA = u64::MAX (i.e. RA = 0) AND CA = 1. + ctx.xer_ca = if ra == 0 && ca != 0 { 1 } else { 0 }; ctx.gpr[instr.rd()] = result; + if instr.oe() { + // RT <- !RA + CA == -RA - 1 + CA + let true_sum = -(ra as i64 as i128) - 1 + (ca as i128); + overflow::apply(ctx, overflow::sum_overflow_64(true_sum, result)); + } if instr.rc_bit() { - ctx.update_cr_signed(0, result as i32 as i64); + ctx.update_cr_signed(0, result as i64); } ctx.pc += 4; } @@ -191,25 +306,39 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst let result = (!ra).wrapping_add(ca).wrapping_sub(1); ctx.xer_ca = if (!ra) != 0 || ca != 0 { 1 } else { 0 }; ctx.gpr[instr.rd()] = result; + if instr.oe() { + // RT <- !RA + CA + (-1) == -RA - 2 + CA + let true_sum = -(ra as i64 as i128) - 2 + (ca as i128); + overflow::apply(ctx, overflow::sum_overflow_64(true_sum, result)); + } if instr.rc_bit() { - ctx.update_cr_signed(0, result as i32 as i64); + ctx.update_cr_signed(0, result as i64); } ctx.pc += 4; } PpcOpcode::negx => { let ra = ctx.gpr[instr.ra()]; - ctx.gpr[instr.rd()] = (!ra).wrapping_add(1); + let result = (!ra).wrapping_add(1); + ctx.gpr[instr.rd()] = result; + if instr.oe() { + overflow::apply(ctx, overflow::neg_ov_64(ra)); + } if instr.rc_bit() { - ctx.update_cr_signed(0, ctx.gpr[instr.rd()] as i32 as i64); + ctx.update_cr_signed(0, result as i64); } ctx.pc += 4; } PpcOpcode::mullwx => { let ra = ctx.gpr[instr.ra()] as i32 as i64; let rb = ctx.gpr[instr.rb()] as i32 as i64; - ctx.gpr[instr.rd()] = ra.wrapping_mul(rb) as u64; + let product = ra.wrapping_mul(rb); + ctx.gpr[instr.rd()] = product as u64; + if instr.oe() { + // OV iff the 64-bit product can't fit into 32-bit signed. + overflow::apply(ctx, overflow::mullw_ov(product)); + } if instr.rc_bit() { - ctx.update_cr_signed(0, ctx.gpr[instr.rd()] as i32 as i64); + ctx.update_cr_signed(0, ctx.gpr[instr.rd()] as i64); } ctx.pc += 4; } @@ -219,7 +348,7 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst let result = ra.wrapping_mul(rb); ctx.gpr[instr.rd()] = ((result >> 32) as i32 as i64 as u64) & 0xFFFF_FFFF; if instr.rc_bit() { - ctx.update_cr_signed(0, ctx.gpr[instr.rd()] as i32 as i64); + ctx.update_cr_signed(0, ctx.gpr[instr.rd()] as i64); } ctx.pc += 4; } @@ -229,33 +358,42 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst let result = ra.wrapping_mul(rb); ctx.gpr[instr.rd()] = (result >> 32) & 0xFFFF_FFFF; if instr.rc_bit() { - ctx.update_cr_signed(0, ctx.gpr[instr.rd()] as i32 as i64); + ctx.update_cr_signed(0, ctx.gpr[instr.rd()] as i64); } ctx.pc += 4; } PpcOpcode::divwx => { let ra = ctx.gpr[instr.ra()] as i32; let rb = ctx.gpr[instr.rb()] as i32; - if rb == 0 || (ra == i32::MIN && rb == -1) { + let ov = overflow::divw_ov_signed(ra, rb); + if ov { + // PPC: RT undefined on div-by-zero / INT_MIN/-1. Canary uses 0. ctx.gpr[instr.rd()] = 0; } else { ctx.gpr[instr.rd()] = (ra / rb) as i64 as u64; } + if instr.oe() { + overflow::apply(ctx, ov); + } if instr.rc_bit() { - ctx.update_cr_signed(0, ctx.gpr[instr.rd()] as i32 as i64); + ctx.update_cr_signed(0, ctx.gpr[instr.rd()] as i64); } ctx.pc += 4; } PpcOpcode::divwux => { let ra = ctx.gpr[instr.ra()] as u32; let rb = ctx.gpr[instr.rb()] as u32; - if rb == 0 { + let ov = overflow::divw_ov_unsigned(rb); + if ov { ctx.gpr[instr.rd()] = 0; } else { ctx.gpr[instr.rd()] = (ra / rb) as u64; } + if instr.oe() { + overflow::apply(ctx, ov); + } if instr.rc_bit() { - ctx.update_cr_signed(0, ctx.gpr[instr.rd()] as i32 as i64); + ctx.update_cr_signed(0, ctx.gpr[instr.rd()] as i64); } ctx.pc += 4; } @@ -265,6 +403,9 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst let ra = ctx.gpr[instr.ra()] as i64; let rb = ctx.gpr[instr.rb()] as i64; ctx.gpr[instr.rd()] = ra.wrapping_mul(rb) as u64; + if instr.oe() { + overflow::apply(ctx, overflow::mulld_ov(ra, rb)); + } if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.rd()] as i64); } @@ -291,11 +432,15 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst PpcOpcode::divdx => { let ra = ctx.gpr[instr.ra()] as i64; let rb = ctx.gpr[instr.rb()] as i64; - if rb == 0 || (ra == i64::MIN && rb == -1) { + let ov = overflow::divd_ov_signed(ra, rb); + if ov { ctx.gpr[instr.rd()] = 0; } else { ctx.gpr[instr.rd()] = (ra / rb) as u64; } + if instr.oe() { + overflow::apply(ctx, ov); + } if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.rd()] as i64); } @@ -304,11 +449,15 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst PpcOpcode::divdux => { let ra = ctx.gpr[instr.ra()]; let rb = ctx.gpr[instr.rb()]; - if rb == 0 { + let ov = overflow::divd_ov_unsigned(rb); + if ov { ctx.gpr[instr.rd()] = 0; } else { ctx.gpr[instr.rd()] = ra / rb; } + if instr.oe() { + overflow::apply(ctx, ov); + } if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.rd()] as i64); } @@ -318,12 +467,12 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst // ===== Logical ===== PpcOpcode::andix => { ctx.gpr[instr.ra()] = ctx.gpr[instr.rs()] & (instr.uimm16() as u64); - ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i32 as i64); + ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i64); ctx.pc += 4; } PpcOpcode::andisx => { ctx.gpr[instr.ra()] = ctx.gpr[instr.rs()] & ((instr.uimm16() as u64) << 16); - ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i32 as i64); + ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i64); ctx.pc += 4; } PpcOpcode::ori => { @@ -344,54 +493,54 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst } PpcOpcode::andx => { ctx.gpr[instr.ra()] = ctx.gpr[instr.rs()] & ctx.gpr[instr.rb()]; - if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i32 as i64); } + if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i64); } ctx.pc += 4; } PpcOpcode::andcx => { ctx.gpr[instr.ra()] = ctx.gpr[instr.rs()] & !ctx.gpr[instr.rb()]; - if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i32 as i64); } + if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i64); } ctx.pc += 4; } PpcOpcode::orx => { ctx.gpr[instr.ra()] = ctx.gpr[instr.rs()] | ctx.gpr[instr.rb()]; - if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i32 as i64); } + if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i64); } ctx.pc += 4; } PpcOpcode::orcx => { ctx.gpr[instr.ra()] = ctx.gpr[instr.rs()] | !ctx.gpr[instr.rb()]; - if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i32 as i64); } + if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i64); } ctx.pc += 4; } PpcOpcode::xorx => { ctx.gpr[instr.ra()] = ctx.gpr[instr.rs()] ^ ctx.gpr[instr.rb()]; - if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i32 as i64); } + if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i64); } ctx.pc += 4; } PpcOpcode::norx => { ctx.gpr[instr.ra()] = !(ctx.gpr[instr.rs()] | ctx.gpr[instr.rb()]); - if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i32 as i64); } + if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i64); } ctx.pc += 4; } PpcOpcode::nandx => { ctx.gpr[instr.ra()] = !(ctx.gpr[instr.rs()] & ctx.gpr[instr.rb()]); - if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i32 as i64); } + if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i64); } ctx.pc += 4; } PpcOpcode::eqvx => { ctx.gpr[instr.ra()] = !(ctx.gpr[instr.rs()] ^ ctx.gpr[instr.rb()]); - if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i32 as i64); } + if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i64); } ctx.pc += 4; } // ===== Extend/Count ===== PpcOpcode::extsbx => { ctx.gpr[instr.ra()] = ctx.gpr[instr.rs()] as i8 as i64 as u64; - if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i32 as i64); } + if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i64); } ctx.pc += 4; } PpcOpcode::extshx => { ctx.gpr[instr.ra()] = ctx.gpr[instr.rs()] as i16 as i64 as u64; - if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i32 as i64); } + if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i64); } ctx.pc += 4; } PpcOpcode::extswx => { @@ -401,7 +550,7 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst } PpcOpcode::cntlzwx => { ctx.gpr[instr.ra()] = (ctx.gpr[instr.rs()] as u32).leading_zeros() as u64; - if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i32 as i64); } + if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i64); } ctx.pc += 4; } PpcOpcode::cntlzdx => { @@ -416,7 +565,7 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst ctx.gpr[instr.ra()] = if sh < 32 { ((ctx.gpr[instr.rs()] as u32) << sh) as u64 } else { 0 }; - if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i32 as i64); } + if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i64); } ctx.pc += 4; } PpcOpcode::srwx => { @@ -424,7 +573,7 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst ctx.gpr[instr.ra()] = if sh < 32 { ((ctx.gpr[instr.rs()] as u32) >> sh) as u64 } else { 0 }; - if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i32 as i64); } + if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i64); } ctx.pc += 4; } PpcOpcode::srawx => { @@ -441,7 +590,7 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst ctx.gpr[instr.ra()] = if rs < 0 { u64::MAX } else { 0 }; ctx.xer_ca = if rs < 0 { 1 } else { 0 }; } - if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i32 as i64); } + if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i64); } ctx.pc += 4; } PpcOpcode::srawix => { @@ -455,7 +604,7 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst ctx.xer_ca = if rs < 0 && (rs as u32) << (32 - sh) != 0 { 1 } else { 0 }; ctx.gpr[instr.ra()] = result as i64 as u64; } - if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i32 as i64); } + if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i64); } ctx.pc += 4; } PpcOpcode::sldx => { @@ -515,7 +664,7 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst let rotated = rs.rotate_left(sh); let mask = rlw_mask(mb, me); ctx.gpr[instr.ra()] = (rotated & mask) as u64; - if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i32 as i64); } + if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i64); } ctx.pc += 4; } PpcOpcode::rlwimix => { @@ -527,7 +676,7 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst let mask = rlw_mask(mb, me); let ra = ctx.gpr[instr.ra()] as u32; ctx.gpr[instr.ra()] = ((rotated & mask) | (ra & !mask)) as u64; - if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i32 as i64); } + if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i64); } ctx.pc += 4; } PpcOpcode::rlwnmx => { @@ -538,7 +687,7 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst let rotated = rs.rotate_left(sh); let mask = rlw_mask(mb, me); ctx.gpr[instr.ra()] = (rotated & mask) as u64; - if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i32 as i64); } + if instr.rc_bit() { ctx.update_cr_signed(0, ctx.gpr[instr.ra()] as i64); } ctx.pc += 4; } PpcOpcode::rldiclx => { @@ -606,16 +755,26 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst PpcOpcode::cmpi => { let bf = instr.crfd(); if instr.l() { - // 64-bit compare + // 64-bit compare. Compare directly so boundary i64 values + // (e.g. ra=i64::MIN, imm=1) don't mis-sign through a + // wrapped subtract. let ra = ctx.gpr[instr.ra()] as i64; let imm = instr.simm16() as i64; - ctx.update_cr_signed(bf, ra - imm); - if ra == imm { ctx.cr[bf].eq = true; } + ctx.cr[bf] = crate::context::CrField { + lt: ra < imm, + gt: ra > imm, + eq: ra == imm, + so: ctx.xer_so != 0, + }; } else { let ra = ctx.gpr[instr.ra()] as i32; let imm = instr.simm16() as i32; - ctx.update_cr_signed(bf, (ra as i64) - (imm as i64)); - if ra == imm { ctx.cr[bf].eq = true; } + ctx.cr[bf] = crate::context::CrField { + lt: ra < imm, + gt: ra > imm, + eq: ra == imm, + so: ctx.xer_so != 0, + }; } ctx.pc += 4; } @@ -637,13 +796,21 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst if instr.l() { let ra = ctx.gpr[instr.ra()] as i64; let rb = ctx.gpr[instr.rb()] as i64; - ctx.update_cr_signed(bf, ra.wrapping_sub(rb)); - if ra == rb { ctx.cr[bf].eq = true; } + ctx.cr[bf] = crate::context::CrField { + lt: ra < rb, + gt: ra > rb, + eq: ra == rb, + so: ctx.xer_so != 0, + }; } else { let ra = ctx.gpr[instr.ra()] as i32; let rb = ctx.gpr[instr.rb()] as i32; - ctx.update_cr_signed(bf, (ra as i64).wrapping_sub(rb as i64)); - if ra == rb { ctx.cr[bf].eq = true; } + ctx.cr[bf] = crate::context::CrField { + lt: ra < rb, + gt: ra > rb, + eq: ra == rb, + so: ctx.xer_so != 0, + }; } ctx.pc += 4; } @@ -941,24 +1108,72 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst } // Reservation (lwarx/stwcx) + // + // M3.7 — when `ctx.reservation_table` is `Some` and the table is + // enabled, route reservations through the inter-thread table so + // concurrent host threads can mediate reservation conflicts. + // Otherwise (the default in lockstep mode), use the legacy + // per-`PpcContext` fields. Both paths leave the per-ctx fields + // in a coherent state so a flag flip mid-run doesn't corrupt + // outstanding reservations. PpcOpcode::lwarx => { let ea = if instr.ra() == 0 { 0u64 } else { ctx.gpr[instr.ra()] }; let ea = ea.wrapping_add(ctx.gpr[instr.rb()]) as u32; let val = mem.read_u32(ea); ctx.gpr[instr.rd()] = val as u64; - ctx.reserved_addr = ea; + ctx.reserved_line = ea & !RESERVATION_MASK; ctx.reserved_val = val as u64; ctx.has_reservation = true; + if let Some(t) = &ctx.reservation_table { + if t.is_enabled() { + ctx.reserved_generation = t.reserve(ea, ctx.hw_id); + } + } ctx.pc += 4; } PpcOpcode::stwcx => { let ea = if instr.ra() == 0 { 0u64 } else { ctx.gpr[instr.ra()] }; let ea = ea.wrapping_add(ctx.gpr[instr.rb()]) as u32; - if ctx.has_reservation && ctx.reserved_addr == ea { - mem.write_u32(ea, ctx.gpr[instr.rs()] as u32); - ctx.cr[0] = crate::context::CrField { lt: false, gt: false, eq: true, so: ctx.xer_so != 0 }; + let line = ea & !RESERVATION_MASK; + let table_route = ctx + .reservation_table + .as_ref() + .filter(|t| t.is_enabled()) + .cloned(); + let success = if let Some(t) = &table_route { + // Table-routed: success iff the slot still holds our + // reservation AND the per-ctx flag agrees (the per-ctx + // flag would be cleared by an intervening write or + // context switch). + ctx.has_reservation + && ctx.reserved_line == line + && t.try_commit(ea, ctx.reserved_generation, ctx.hw_id) } else { - ctx.cr[0] = crate::context::CrField { lt: false, gt: false, eq: false, so: ctx.xer_so != 0 }; + // Legacy per-ctx path (M2 default). + ctx.has_reservation && ctx.reserved_line == line + }; + if success { + mem.write_u32(ea, ctx.gpr[instr.rs()] as u32); + ctx.cr[0] = crate::context::CrField { + lt: false, + gt: false, + eq: true, + so: ctx.xer_so != 0, + }; + } else { + ctx.cr[0] = crate::context::CrField { + lt: false, + gt: false, + eq: false, + so: ctx.xer_so != 0, + }; + // Failed stwcx: if we held the reservation in the table + // (someone else displaced our gen), release it from the + // counter so `has_active_reservers` returns to zero + // when no real reserver exists. + if let Some(t) = &table_route { + t.release(ea, ctx.reserved_generation, ctx.hw_id); + } } ctx.has_reservation = false; ctx.pc += 4; @@ -1164,8 +1379,22 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst crate::context::spr::XER => ctx.xer() as u64, crate::context::spr::LR => ctx.lr, crate::context::spr::CTR => ctx.ctr, + crate::context::spr::DEC => ctx.dec as u64, crate::context::spr::TBL => ctx.timebase & 0xFFFF_FFFF, crate::context::spr::TBU => ctx.timebase >> 32, + crate::context::spr::VRSAVE => ctx.vrsave as u64, + // Xbox 360 Xenon processor signature (from canary). + crate::context::spr::PVR => 0x0071_0800, + // Benign SPRs — titles read these but we don't model them. + crate::context::spr::SPRG0 + | crate::context::spr::SPRG1 + | crate::context::spr::SPRG2 + | crate::context::spr::SPRG3 + | crate::context::spr::HID0 + | crate::context::spr::HID1 + | crate::context::spr::DAR + | crate::context::spr::DSISR + | crate::context::spr::PIR => 0, _ => { tracing::warn!("mfspr: unimplemented SPR {}", spr); 0 @@ -1180,6 +1409,24 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst crate::context::spr::XER => ctx.set_xer(val as u32), crate::context::spr::LR => ctx.lr = val, crate::context::spr::CTR => ctx.ctr = val, + crate::context::spr::DEC => ctx.dec = val as u32, + crate::context::spr::TBL_WRITE => { + ctx.timebase = (ctx.timebase & 0xFFFF_FFFF_0000_0000) | (val & 0xFFFF_FFFF); + } + crate::context::spr::TBU_WRITE => { + ctx.timebase = (ctx.timebase & 0x0000_0000_FFFF_FFFF) | ((val & 0xFFFF_FFFF) << 32); + } + crate::context::spr::VRSAVE => ctx.vrsave = val as u32, + // Benign writes — swallow silently to avoid false Unimplemented + // warnings on SPRs that have no observable effect in userspace. + crate::context::spr::SPRG0 + | crate::context::spr::SPRG1 + | crate::context::spr::SPRG2 + | crate::context::spr::SPRG3 + | crate::context::spr::HID0 + | crate::context::spr::HID1 + | crate::context::spr::DAR + | crate::context::spr::DSISR => {} _ => { tracing::warn!("mtspr: unimplemented SPR {}", spr); } @@ -1282,11 +1529,25 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst } // ===== Trap ===== - PpcOpcode::tdi | PpcOpcode::twi | PpcOpcode::td | PpcOpcode::tw => { - // For now, just trace and continue - tracing::warn!("Trap instruction at {:#010x}: {:?}", ctx.pc, instr.opcode); + PpcOpcode::tw | PpcOpcode::twi | PpcOpcode::td | PpcOpcode::tdi => { + let a = ctx.gpr[instr.ra()]; + let b = match instr.opcode { + PpcOpcode::twi | PpcOpcode::tdi => instr.simm16() as i64 as u64, + _ => ctx.gpr[instr.rb()], + }; + let width = match instr.opcode { + PpcOpcode::tw | PpcOpcode::twi => trap::TrapWidth::Word, + _ => trap::TrapWidth::Doubleword, + }; + let fired = trap::evaluate(instr.to(), a, b, width); ctx.pc += 4; - return StepResult::Trap; + if fired { + tracing::warn!( + "Trap fired at {:#010x}: {:?} TO={} a={:#x} b={:#x}", + ctx.pc.wrapping_sub(4), instr.opcode, instr.to(), a, b + ); + return StepResult::Trap; + } } // ===== Byte-reverse loads ===== @@ -1349,19 +1610,80 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst ctx.pc += 4; } // lvewx, lvebx, lvehx all load aligned 16 bytes (per xenia reference) - PpcOpcode::lvewx | PpcOpcode::lvebx | PpcOpcode::lvehx => { - let ea = if instr.ra() == 0 { 0u64 } else { ctx.gpr[instr.ra()] }; - let ea = (ea.wrapping_add(ctx.gpr[instr.rb()]) & !0xF) as u32; - let mut bytes = [0u8; 16]; - for i in 0..16 { bytes[i] = mem.read_u8(ea + i as u32); } + PpcOpcode::lvebx => { + // Load 1 byte from EA into vD[EA & 0xF]. PowerISA marks the + // other lanes as "undefined" but real Xenon (and Canary) + // preserve their prior contents, so seed from vD. + let base = if instr.ra() == 0 { 0u64 } else { ctx.gpr[instr.ra()] }; + let ea = base.wrapping_add(ctx.gpr[instr.rb()]) as u32; + let slot = (ea & 0xF) as usize; + let mut bytes = ctx.vr[instr.rd()].as_bytes(); + bytes[slot] = mem.read_u8(ea); ctx.vr[instr.rd()] = xenia_types::Vec128::from_bytes(bytes); ctx.pc += 4; } - PpcOpcode::stvewx | PpcOpcode::stvebx | PpcOpcode::stvehx => { - let ea = if instr.ra() == 0 { 0u64 } else { ctx.gpr[instr.ra()] }; - let ea = (ea.wrapping_add(ctx.gpr[instr.rb()]) & !0xF) as u32; + PpcOpcode::lvehx => { + // Load a halfword from (EA & ~1) into vD at halfword slot + // (EA & 0xF) >> 1. Other halfword lanes preserved (see lvebx). + let base = if instr.ra() == 0 { 0u64 } else { ctx.gpr[instr.ra()] }; + let ea_unaligned = base.wrapping_add(ctx.gpr[instr.rb()]) as u32; + let ea = ea_unaligned & !0x1u32; + let slot = ((ea_unaligned & 0xF) >> 1) as usize; + let mut bytes = ctx.vr[instr.rd()].as_bytes(); + let h = mem.read_u16(ea); + bytes[slot * 2] = (h >> 8) as u8; + bytes[slot * 2 + 1] = (h & 0xFF) as u8; + ctx.vr[instr.rd()] = xenia_types::Vec128::from_bytes(bytes); + ctx.pc += 4; + } + PpcOpcode::lvewx => { + // Load a word from (EA & ~3) into vD at word slot + // (EA & 0xF) >> 2. Other word lanes preserved (see lvebx). + let base = if instr.ra() == 0 { 0u64 } else { ctx.gpr[instr.ra()] }; + let ea_unaligned = base.wrapping_add(ctx.gpr[instr.rb()]) as u32; + let ea = ea_unaligned & !0x3u32; + let slot = ((ea_unaligned & 0xF) >> 2) as usize; + let mut bytes = ctx.vr[instr.rd()].as_bytes(); + let w = mem.read_u32(ea); + bytes[slot * 4] = (w >> 24) as u8; + bytes[slot * 4 + 1] = (w >> 16) as u8; + bytes[slot * 4 + 2] = (w >> 8) as u8; + bytes[slot * 4 + 3] = (w & 0xFF) as u8; + ctx.vr[instr.rd()] = xenia_types::Vec128::from_bytes(bytes); + ctx.pc += 4; + } + PpcOpcode::stvebx => { + // Store vS[EA & 0xF] (1 byte) to memory at EA. + let base = if instr.ra() == 0 { 0u64 } else { ctx.gpr[instr.ra()] }; + let ea = base.wrapping_add(ctx.gpr[instr.rb()]) as u32; + let slot = (ea & 0xF) as usize; let bytes = ctx.vr[instr.rs()].as_bytes(); - for i in 0..16 { mem.write_u8(ea + i as u32, bytes[i]); } + mem.write_u8(ea, bytes[slot]); + ctx.pc += 4; + } + PpcOpcode::stvehx => { + // Store vS[slot] (1 halfword) at EA & ~1. slot = (EA & 0xF) >> 1. + let base = if instr.ra() == 0 { 0u64 } else { ctx.gpr[instr.ra()] }; + let ea_unaligned = base.wrapping_add(ctx.gpr[instr.rb()]) as u32; + let ea = ea_unaligned & !0x1u32; + let slot = ((ea_unaligned & 0xF) >> 1) as usize; + let bytes = ctx.vr[instr.rs()].as_bytes(); + let h = ((bytes[slot * 2] as u16) << 8) | (bytes[slot * 2 + 1] as u16); + mem.write_u16(ea, h); + ctx.pc += 4; + } + PpcOpcode::stvewx => { + // Store vS[slot] (1 word) at EA & ~3. slot = (EA & 0xF) >> 2. + let base = if instr.ra() == 0 { 0u64 } else { ctx.gpr[instr.ra()] }; + let ea_unaligned = base.wrapping_add(ctx.gpr[instr.rb()]) as u32; + let ea = ea_unaligned & !0x3u32; + let slot = ((ea_unaligned & 0xF) >> 2) as usize; + let bytes = ctx.vr[instr.rs()].as_bytes(); + let w = ((bytes[slot * 4] as u32) << 24) + | ((bytes[slot * 4 + 1] as u32) << 16) + | ((bytes[slot * 4 + 2] as u32) << 8) + | (bytes[slot * 4 + 3] as u32); + mem.write_u32(ea, w); ctx.pc += 4; } PpcOpcode::lvxl | PpcOpcode::lvxl128 => { @@ -1417,40 +1739,69 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst ctx.pc += 4; } PpcOpcode::vmaddfp => { - // vD = (vA * vC) + vB + // vD = (vA * vC) + vB. AltiVec unconditionally flushes denormal + // *inputs* to 0 regardless of VSCR[NJ] (confirmed on POWER8 hw). let a = ctx.vr[instr.ra()].as_f32x4(); let b = ctx.vr[instr.rb()].as_f32x4(); let c = ctx.vr[instr.rc()].as_f32x4(); let mut r = [0f32; 4]; - for i in 0..4 { r[i] = a[i].mul_add(c[i], b[i]); } + for i in 0..4 { + let ai = vmx::flush_denorm(a[i]); + let bi = vmx::flush_denorm(b[i]); + let ci = vmx::flush_denorm(c[i]); + r[i] = ai.mul_add(ci, bi); + } ctx.vr[instr.rd()] = xenia_types::Vec128::from_f32x4_array(r); ctx.pc += 4; } PpcOpcode::vmaddfp128 => { - let a = ctx.vr[instr.va128()].as_f32x4(); - let b = ctx.vr[instr.vb128()].as_f32x4(); - let d = ctx.vr[instr.vd128()].as_f32x4(); // vD is also source (accumulator) - let mut r = [0f32; 4]; - for i in 0..4 { r[i] = a[i].mul_add(b[i], d[i]); } - ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r); - ctx.pc += 4; - } - PpcOpcode::vnmsubfp => { - // vD = -(vA * vC - vB) = vB - vA * vC - let a = ctx.vr[instr.ra()].as_f32x4(); - let b = ctx.vr[instr.rb()].as_f32x4(); - let c = ctx.vr[instr.rc()].as_f32x4(); - let mut r = [0f32; 4]; - for i in 0..4 { r[i] = b[i] - a[i] * c[i]; } - ctx.vr[instr.rd()] = xenia_types::Vec128::from_f32x4_array(r); - ctx.pc += 4; - } - PpcOpcode::vnmsubfp128 => { + // VMX128 form: vD <- (vA * vB) + vD (vD reused as accumulator; + // Canary `InstrEmit_vmaddfp128` routes guest VA/VB/VD through + // `InstrEmit_vmaddfp_` with arg order swapped so the resulting + // HIR computation is `VA * VB + VD`). Same unconditional denorm + // flush of all three inputs as scalar `vmaddfp`. let a = ctx.vr[instr.va128()].as_f32x4(); let b = ctx.vr[instr.vb128()].as_f32x4(); let d = ctx.vr[instr.vd128()].as_f32x4(); let mut r = [0f32; 4]; - for i in 0..4 { r[i] = d[i] - a[i] * b[i]; } + for i in 0..4 { + let ai = vmx::flush_denorm(a[i]); + let bi = vmx::flush_denorm(b[i]); + let di = vmx::flush_denorm(d[i]); + r[i] = ai.mul_add(bi, di); + } + ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r); + ctx.pc += 4; + } + PpcOpcode::vnmsubfp => { + // vD = -(vA * vC - vB) = vB - vA * vC. Same denorm-flush rule as vmaddfp. + let a = ctx.vr[instr.ra()].as_f32x4(); + let b = ctx.vr[instr.rb()].as_f32x4(); + let c = ctx.vr[instr.rc()].as_f32x4(); + let mut r = [0f32; 4]; + for i in 0..4 { + let ai = vmx::flush_denorm(a[i]); + let bi = vmx::flush_denorm(b[i]); + let ci = vmx::flush_denorm(c[i]); + r[i] = bi - ai * ci; + } + ctx.vr[instr.rd()] = xenia_types::Vec128::from_f32x4_array(r); + ctx.pc += 4; + } + PpcOpcode::vnmsubfp128 => { + // VMX128 form: vD <- -((vA * vB) - vD) = vD - (vA * vB). Canary + // routes through `InstrEmit_vnmsubfp_` with the same arg-swap, + // which flushes all inputs unconditionally. + let a = ctx.vr[instr.va128()].as_f32x4(); + let b = ctx.vr[instr.vb128()].as_f32x4(); + let d = ctx.vr[instr.vd128()].as_f32x4(); + let mut r = [0f32; 4]; + for i in 0..4 { + let ai = vmx::flush_denorm(a[i]); + let bi = vmx::flush_denorm(b[i]); + let di = vmx::flush_denorm(d[i]); + r[i] = di - ai * bi; + } ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r); ctx.pc += 4; } @@ -1466,7 +1817,7 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst let a = ctx.vr[instr.ra()].as_f32x4(); let b = ctx.vr[instr.rb()].as_f32x4(); let mut r = [0f32; 4]; - for i in 0..4 { r[i] = if a[i] > b[i] { a[i] } else { b[i] }; } + for i in 0..4 { r[i] = vmx::max_nan(a[i], b[i]); } ctx.vr[instr.rd()] = xenia_types::Vec128::from_f32x4_array(r); ctx.pc += 4; } @@ -1474,7 +1825,7 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst let a = ctx.vr[instr.va128()].as_f32x4(); let b = ctx.vr[instr.vb128()].as_f32x4(); let mut r = [0f32; 4]; - for i in 0..4 { r[i] = if a[i] > b[i] { a[i] } else { b[i] }; } + for i in 0..4 { r[i] = vmx::max_nan(a[i], b[i]); } ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r); ctx.pc += 4; } @@ -1482,7 +1833,7 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst let a = ctx.vr[instr.ra()].as_f32x4(); let b = ctx.vr[instr.rb()].as_f32x4(); let mut r = [0f32; 4]; - for i in 0..4 { r[i] = if a[i] < b[i] { a[i] } else { b[i] }; } + for i in 0..4 { r[i] = vmx::min_nan(a[i], b[i]); } ctx.vr[instr.rd()] = xenia_types::Vec128::from_f32x4_array(r); ctx.pc += 4; } @@ -1490,7 +1841,7 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst let a = ctx.vr[instr.va128()].as_f32x4(); let b = ctx.vr[instr.vb128()].as_f32x4(); let mut r = [0f32; 4]; - for i in 0..4 { r[i] = if a[i] < b[i] { a[i] } else { b[i] }; } + for i in 0..4 { r[i] = vmx::min_nan(a[i], b[i]); } ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r); ctx.pc += 4; } @@ -1841,14 +2192,15 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst ctx.pc += 4; } - // VMX: MFVSCR/MTVSCR + // VMX: MFVSCR/MTVSCR — VSCR lives in word 3; only NJ (bit 16) and + // SAT (bit 31) are defined. Canary stores the full Vec128 so we do + // the same: mfvscr copies the register, mtvscr overwrites it. PpcOpcode::mfvscr => { - ctx.vr[instr.rd()] = xenia_types::Vec128::from_u32x4(0, 0, 0, ctx.vscr_sat as u32); + ctx.vr[instr.rd()] = ctx.vscr; ctx.pc += 4; } PpcOpcode::mtvscr => { - let val = ctx.vr[instr.rb()].as_u32x4(); - ctx.vscr_sat = (val[3] & 1) as u8; + ctx.vscr = ctx.vr[instr.rb()]; ctx.pc += 4; } @@ -1888,89 +2240,176 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst // ===== FPU: Arithmetic ===== PpcOpcode::faddx => { - ctx.fpr[instr.rd()] = ctx.fpr[instr.ra()] + ctx.fpr[instr.rb()]; + let a = ctx.fpr[instr.ra()]; + let b = ctx.fpr[instr.rb()]; + fpscr::check_invalid_add(ctx, a, b, false); + let result = a + b; + ctx.fpr[instr.rd()] = result; + fpscr::update_after_op(ctx, result, a.is_finite() && b.is_finite()); if instr.rc_bit() { update_cr1_from_fpscr(ctx); } ctx.pc += 4; } PpcOpcode::faddsx => { - ctx.fpr[instr.rd()] = to_single(ctx.fpr[instr.ra()] + ctx.fpr[instr.rb()]); + let a = ctx.fpr[instr.ra()]; + let b = ctx.fpr[instr.rb()]; + fpscr::check_invalid_add(ctx, a, b, false); + let result = to_single(ctx, a + b); + ctx.fpr[instr.rd()] = result; + fpscr::update_after_op(ctx, result, a.is_finite() && b.is_finite()); if instr.rc_bit() { update_cr1_from_fpscr(ctx); } ctx.pc += 4; } PpcOpcode::fsubx => { - ctx.fpr[instr.rd()] = ctx.fpr[instr.ra()] - ctx.fpr[instr.rb()]; + let a = ctx.fpr[instr.ra()]; + let b = ctx.fpr[instr.rb()]; + fpscr::check_invalid_add(ctx, a, b, true); + let result = a - b; + ctx.fpr[instr.rd()] = result; + fpscr::update_after_op(ctx, result, a.is_finite() && b.is_finite()); if instr.rc_bit() { update_cr1_from_fpscr(ctx); } ctx.pc += 4; } PpcOpcode::fsubsx => { - ctx.fpr[instr.rd()] = to_single(ctx.fpr[instr.ra()] - ctx.fpr[instr.rb()]); + let a = ctx.fpr[instr.ra()]; + let b = ctx.fpr[instr.rb()]; + fpscr::check_invalid_add(ctx, a, b, true); + let result = to_single(ctx, a - b); + ctx.fpr[instr.rd()] = result; + fpscr::update_after_op(ctx, result, a.is_finite() && b.is_finite()); if instr.rc_bit() { update_cr1_from_fpscr(ctx); } ctx.pc += 4; } PpcOpcode::fmulx => { // A-form: frD = frA * frC (frC is at rc() field, bits 21-25) - ctx.fpr[instr.rd()] = ctx.fpr[instr.ra()] * ctx.fpr[instr.rc()]; + let a = ctx.fpr[instr.ra()]; + let c = ctx.fpr[instr.rc()]; + fpscr::check_invalid_mul(ctx, a, c); + let result = a * c; + ctx.fpr[instr.rd()] = result; + fpscr::update_after_op(ctx, result, a.is_finite() && c.is_finite()); if instr.rc_bit() { update_cr1_from_fpscr(ctx); } ctx.pc += 4; } PpcOpcode::fmulsx => { - ctx.fpr[instr.rd()] = to_single(ctx.fpr[instr.ra()] * ctx.fpr[instr.rc()]); + let a = ctx.fpr[instr.ra()]; + let c = ctx.fpr[instr.rc()]; + fpscr::check_invalid_mul(ctx, a, c); + let result = to_single(ctx, a * c); + ctx.fpr[instr.rd()] = result; + fpscr::update_after_op(ctx, result, a.is_finite() && c.is_finite()); if instr.rc_bit() { update_cr1_from_fpscr(ctx); } ctx.pc += 4; } PpcOpcode::fdivx => { - ctx.fpr[instr.rd()] = ctx.fpr[instr.ra()] / ctx.fpr[instr.rb()]; + let a = ctx.fpr[instr.ra()]; + let b = ctx.fpr[instr.rb()]; + fpscr::check_invalid_div(ctx, a, b); + fpscr::check_zero_divide(ctx, a, b); + let result = a / b; + ctx.fpr[instr.rd()] = result; + fpscr::update_after_op(ctx, result, a.is_finite() && b.is_finite() && b != 0.0); if instr.rc_bit() { update_cr1_from_fpscr(ctx); } ctx.pc += 4; } PpcOpcode::fdivsx => { - ctx.fpr[instr.rd()] = to_single(ctx.fpr[instr.ra()] / ctx.fpr[instr.rb()]); + let a = ctx.fpr[instr.ra()]; + let b = ctx.fpr[instr.rb()]; + fpscr::check_invalid_div(ctx, a, b); + fpscr::check_zero_divide(ctx, a, b); + let result = to_single(ctx, a / b); + ctx.fpr[instr.rd()] = result; + fpscr::update_after_op(ctx, result, a.is_finite() && b.is_finite() && b != 0.0); if instr.rc_bit() { update_cr1_from_fpscr(ctx); } ctx.pc += 4; } // ===== FPU: Multiply-Add ===== PpcOpcode::fmaddx => { - // frD = (frA * frC) + frB - ctx.fpr[instr.rd()] = ctx.fpr[instr.ra()].mul_add(ctx.fpr[instr.rc()], ctx.fpr[instr.rb()]); + let a = ctx.fpr[instr.ra()]; + let c = ctx.fpr[instr.rc()]; + let b = ctx.fpr[instr.rb()]; + fpscr::check_invalid_mul(ctx, a, c); + fpscr::check_invalid_add(ctx, a * c, b, false); + let result = a.mul_add(c, b); + ctx.fpr[instr.rd()] = result; + fpscr::update_after_op(ctx, result, a.is_finite() && b.is_finite() && c.is_finite()); if instr.rc_bit() { update_cr1_from_fpscr(ctx); } ctx.pc += 4; } PpcOpcode::fmaddsx => { - ctx.fpr[instr.rd()] = to_single(ctx.fpr[instr.ra()].mul_add(ctx.fpr[instr.rc()], ctx.fpr[instr.rb()])); + let a = ctx.fpr[instr.ra()]; + let c = ctx.fpr[instr.rc()]; + let b = ctx.fpr[instr.rb()]; + fpscr::check_invalid_mul(ctx, a, c); + let result = to_single(ctx, a.mul_add(c, b)); + ctx.fpr[instr.rd()] = result; + fpscr::update_after_op(ctx, result, a.is_finite() && b.is_finite() && c.is_finite()); if instr.rc_bit() { update_cr1_from_fpscr(ctx); } ctx.pc += 4; } PpcOpcode::fmsubx => { - // frD = (frA * frC) - frB - ctx.fpr[instr.rd()] = ctx.fpr[instr.ra()].mul_add(ctx.fpr[instr.rc()], -ctx.fpr[instr.rb()]); + let a = ctx.fpr[instr.ra()]; + let c = ctx.fpr[instr.rc()]; + let b = ctx.fpr[instr.rb()]; + fpscr::check_invalid_mul(ctx, a, c); + let result = a.mul_add(c, -b); + ctx.fpr[instr.rd()] = result; + fpscr::update_after_op(ctx, result, a.is_finite() && b.is_finite() && c.is_finite()); if instr.rc_bit() { update_cr1_from_fpscr(ctx); } ctx.pc += 4; } PpcOpcode::fmsubsx => { - ctx.fpr[instr.rd()] = to_single(ctx.fpr[instr.ra()].mul_add(ctx.fpr[instr.rc()], -ctx.fpr[instr.rb()])); + let a = ctx.fpr[instr.ra()]; + let c = ctx.fpr[instr.rc()]; + let b = ctx.fpr[instr.rb()]; + fpscr::check_invalid_mul(ctx, a, c); + let result = to_single(ctx, a.mul_add(c, -b)); + ctx.fpr[instr.rd()] = result; + fpscr::update_after_op(ctx, result, a.is_finite() && b.is_finite() && c.is_finite()); if instr.rc_bit() { update_cr1_from_fpscr(ctx); } ctx.pc += 4; } PpcOpcode::fnmaddx => { - // frD = -((frA * frC) + frB) - ctx.fpr[instr.rd()] = -(ctx.fpr[instr.ra()].mul_add(ctx.fpr[instr.rc()], ctx.fpr[instr.rb()])); + let a = ctx.fpr[instr.ra()]; + let c = ctx.fpr[instr.rc()]; + let b = ctx.fpr[instr.rb()]; + fpscr::check_invalid_mul(ctx, a, c); + let result = -(a.mul_add(c, b)); + ctx.fpr[instr.rd()] = result; + fpscr::update_after_op(ctx, result, a.is_finite() && b.is_finite() && c.is_finite()); if instr.rc_bit() { update_cr1_from_fpscr(ctx); } ctx.pc += 4; } PpcOpcode::fnmaddsx => { - ctx.fpr[instr.rd()] = to_single(-(ctx.fpr[instr.ra()].mul_add(ctx.fpr[instr.rc()], ctx.fpr[instr.rb()]))); + let a = ctx.fpr[instr.ra()]; + let c = ctx.fpr[instr.rc()]; + let b = ctx.fpr[instr.rb()]; + fpscr::check_invalid_mul(ctx, a, c); + let result = to_single(ctx, -(a.mul_add(c, b))); + ctx.fpr[instr.rd()] = result; + fpscr::update_after_op(ctx, result, a.is_finite() && b.is_finite() && c.is_finite()); if instr.rc_bit() { update_cr1_from_fpscr(ctx); } ctx.pc += 4; } PpcOpcode::fnmsubx => { - // frD = -((frA * frC) - frB) - ctx.fpr[instr.rd()] = -(ctx.fpr[instr.ra()].mul_add(ctx.fpr[instr.rc()], -ctx.fpr[instr.rb()])); + let a = ctx.fpr[instr.ra()]; + let c = ctx.fpr[instr.rc()]; + let b = ctx.fpr[instr.rb()]; + fpscr::check_invalid_mul(ctx, a, c); + let result = -(a.mul_add(c, -b)); + ctx.fpr[instr.rd()] = result; + fpscr::update_after_op(ctx, result, a.is_finite() && b.is_finite() && c.is_finite()); if instr.rc_bit() { update_cr1_from_fpscr(ctx); } ctx.pc += 4; } PpcOpcode::fnmsubsx => { - ctx.fpr[instr.rd()] = to_single(-(ctx.fpr[instr.ra()].mul_add(ctx.fpr[instr.rc()], -ctx.fpr[instr.rb()]))); + let a = ctx.fpr[instr.ra()]; + let c = ctx.fpr[instr.rc()]; + let b = ctx.fpr[instr.rb()]; + fpscr::check_invalid_mul(ctx, a, c); + let result = to_single(ctx, -(a.mul_add(c, -b))); + ctx.fpr[instr.rd()] = result; + fpscr::update_after_op(ctx, result, a.is_finite() && b.is_finite() && c.is_finite()); if instr.rc_bit() { update_cr1_from_fpscr(ctx); } ctx.pc += 4; } @@ -2011,39 +2450,87 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst // ===== FPU: Square root / Reciprocal ===== PpcOpcode::fsqrtx => { - ctx.fpr[instr.rd()] = ctx.fpr[instr.rb()].sqrt(); + let b = ctx.fpr[instr.rb()]; + // sqrt of negative (non-zero) is invalid operation → VXSQRT. + if b.is_sign_negative() && b != 0.0 && !b.is_nan() { + fpscr::set_exception(ctx, fpscr::VXSQRT); + } + if fpscr::is_snan(b) { + fpscr::set_exception(ctx, fpscr::VXSNAN); + } + let result = b.sqrt(); + ctx.fpr[instr.rd()] = result; + fpscr::update_after_op(ctx, result, b.is_finite()); if instr.rc_bit() { update_cr1_from_fpscr(ctx); } ctx.pc += 4; } PpcOpcode::fsqrtsx => { - ctx.fpr[instr.rd()] = to_single(ctx.fpr[instr.rb()].sqrt()); + let b = ctx.fpr[instr.rb()]; + if b.is_sign_negative() && b != 0.0 && !b.is_nan() { + fpscr::set_exception(ctx, fpscr::VXSQRT); + } + if fpscr::is_snan(b) { + fpscr::set_exception(ctx, fpscr::VXSNAN); + } + let result = to_single(ctx, b.sqrt()); + ctx.fpr[instr.rd()] = result; + fpscr::update_after_op(ctx, result, b.is_finite()); if instr.rc_bit() { update_cr1_from_fpscr(ctx); } ctx.pc += 4; } PpcOpcode::fresx => { // Single-precision reciprocal estimate: frD = 1.0 / frB - ctx.fpr[instr.rd()] = to_single(1.0 / ctx.fpr[instr.rb()]); + let b = ctx.fpr[instr.rb()]; + if b == 0.0 { + fpscr::set_exception(ctx, fpscr::ZX); + } + if fpscr::is_snan(b) { + fpscr::set_exception(ctx, fpscr::VXSNAN); + } + let result = to_single(ctx, 1.0 / b); + ctx.fpr[instr.rd()] = result; + fpscr::update_after_op(ctx, result, b.is_finite() && b != 0.0); if instr.rc_bit() { update_cr1_from_fpscr(ctx); } ctx.pc += 4; } PpcOpcode::frsqrtex => { // Reciprocal square root estimate: frD = 1.0 / sqrt(frB) - ctx.fpr[instr.rd()] = 1.0 / ctx.fpr[instr.rb()].sqrt(); + let b = ctx.fpr[instr.rb()]; + if b == 0.0 { + fpscr::set_exception(ctx, fpscr::ZX); + } + if b.is_sign_negative() && b != 0.0 && !b.is_nan() { + fpscr::set_exception(ctx, fpscr::VXSQRT); + } + if fpscr::is_snan(b) { + fpscr::set_exception(ctx, fpscr::VXSNAN); + } + let result = 1.0 / b.sqrt(); + ctx.fpr[instr.rd()] = result; + fpscr::update_after_op(ctx, result, b.is_finite() && b > 0.0); if instr.rc_bit() { update_cr1_from_fpscr(ctx); } ctx.pc += 4; } // ===== FPU: Rounding/Conversion ===== PpcOpcode::frspx => { - // Round to single precision - ctx.fpr[instr.rd()] = to_single(ctx.fpr[instr.rb()]); + // Round to single precision honouring FPSCR[RN] + let b = ctx.fpr[instr.rb()]; + if fpscr::is_snan(b) { + fpscr::set_exception(ctx, fpscr::VXSNAN); + } + let result = to_single(ctx, b); + ctx.fpr[instr.rd()] = result; + fpscr::update_after_op(ctx, result, b.is_finite()); if instr.rc_bit() { update_cr1_from_fpscr(ctx); } ctx.pc += 4; } PpcOpcode::fcfidx => { - // Convert from integer doubleword: frD = (double)(int64_t)frD_as_bits + // Convert from integer doubleword: frD = (double)(int64_t)frB_as_bits let bits = ctx.fpr[instr.rb()].to_bits(); - ctx.fpr[instr.rd()] = bits as i64 as f64; + let result = (bits as i64) as f64; + ctx.fpr[instr.rd()] = result; + fpscr::set_fprf(ctx, fpscr::classify_fprf(result)); if instr.rc_bit() { update_cr1_from_fpscr(ctx); } ctx.pc += 4; } @@ -2051,10 +2538,16 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst // Convert to integer doubleword (round per FPSCR[RN]) let val = ctx.fpr[instr.rb()]; let result = if val.is_nan() { + fpscr::set_exception(ctx, fpscr::VXCVI | if fpscr::is_snan(val) { fpscr::VXSNAN } else { 0 }); + 0x8000_0000_0000_0000u64 + } else if val >= (i64::MAX as f64) { + fpscr::set_exception(ctx, fpscr::VXCVI); + 0x7FFF_FFFF_FFFF_FFFFu64 + } else if val < (i64::MIN as f64) { + fpscr::set_exception(ctx, fpscr::VXCVI); 0x8000_0000_0000_0000u64 } else { - let rounded = val.round(); - (rounded as i64) as u64 + fpscr::round_to_i64(ctx, val) as u64 }; ctx.fpr[instr.rd()] = f64::from_bits(result); if instr.rc_bit() { update_cr1_from_fpscr(ctx); } @@ -2064,9 +2557,16 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst // Convert to integer doubleword (round toward zero) let val = ctx.fpr[instr.rb()]; let result = if val.is_nan() { + fpscr::set_exception(ctx, fpscr::VXCVI | if fpscr::is_snan(val) { fpscr::VXSNAN } else { 0 }); + 0x8000_0000_0000_0000u64 + } else if val >= (i64::MAX as f64) { + fpscr::set_exception(ctx, fpscr::VXCVI); + 0x7FFF_FFFF_FFFF_FFFFu64 + } else if val < (i64::MIN as f64) { + fpscr::set_exception(ctx, fpscr::VXCVI); 0x8000_0000_0000_0000u64 } else { - (val as i64) as u64 + (val.trunc() as i64) as u64 }; ctx.fpr[instr.rd()] = f64::from_bits(result); if instr.rc_bit() { update_cr1_from_fpscr(ctx); } @@ -2075,27 +2575,38 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst PpcOpcode::fctiwx => { // Convert to integer word (round per FPSCR[RN]) let val = ctx.fpr[instr.rb()]; - let result = if val.is_nan() { - 0x8000_0000u64 + let result_u32: u32 = if val.is_nan() { + fpscr::set_exception(ctx, fpscr::VXCVI | if fpscr::is_snan(val) { fpscr::VXSNAN } else { 0 }); + 0x8000_0000 + } else if val > (i32::MAX as f64) { + fpscr::set_exception(ctx, fpscr::VXCVI); + 0x7FFF_FFFF + } else if val < (i32::MIN as f64) { + fpscr::set_exception(ctx, fpscr::VXCVI); + 0x8000_0000 } else { - let rounded = val.round(); - let clamped = rounded.clamp(i32::MIN as f64, i32::MAX as f64); - (clamped as i32 as u32) as u64 + fpscr::round_to_i32(ctx, val) as u32 }; - ctx.fpr[instr.rd()] = f64::from_bits(result); + ctx.fpr[instr.rd()] = f64::from_bits(result_u32 as u64); if instr.rc_bit() { update_cr1_from_fpscr(ctx); } ctx.pc += 4; } PpcOpcode::fctiwzx => { - // Convert to integer word (round toward zero) -- most common + // Convert to integer word (round toward zero) let val = ctx.fpr[instr.rb()]; - let result = if val.is_nan() { - 0x8000_0000u64 + let result_u32: u32 = if val.is_nan() { + fpscr::set_exception(ctx, fpscr::VXCVI | if fpscr::is_snan(val) { fpscr::VXSNAN } else { 0 }); + 0x8000_0000 + } else if val > (i32::MAX as f64) { + fpscr::set_exception(ctx, fpscr::VXCVI); + 0x7FFF_FFFF + } else if val < (i32::MIN as f64) { + fpscr::set_exception(ctx, fpscr::VXCVI); + 0x8000_0000 } else { - let clamped = val.clamp(i32::MIN as f64, i32::MAX as f64); - (clamped as i32 as u32) as u64 + val.trunc() as i32 as u32 }; - ctx.fpr[instr.rd()] = f64::from_bits(result); + ctx.fpr[instr.rd()] = f64::from_bits(result_u32 as u64); if instr.rc_bit() { update_cr1_from_fpscr(ctx); } ctx.pc += 4; } @@ -2106,54 +2617,60 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst let frb = ctx.fpr[instr.rb()]; let crfd = instr.crfd(); if fra.is_nan() || frb.is_nan() { - ctx.cr[crfd].lt = false; - ctx.cr[crfd].gt = false; - ctx.cr[crfd].eq = false; - ctx.cr[crfd].so = true; + ctx.cr[crfd] = crate::context::CrField { lt: false, gt: false, eq: false, so: true }; + // fcmpu: VXSNAN on SNaN input; no VXVC even on QNaN. + if fpscr::is_snan(fra) || fpscr::is_snan(frb) { + fpscr::set_exception(ctx, fpscr::VXSNAN); + } } else if fra < frb { - ctx.cr[crfd].lt = true; - ctx.cr[crfd].gt = false; - ctx.cr[crfd].eq = false; - ctx.cr[crfd].so = false; + ctx.cr[crfd] = crate::context::CrField { lt: true, gt: false, eq: false, so: false }; } else if fra > frb { - ctx.cr[crfd].lt = false; - ctx.cr[crfd].gt = true; - ctx.cr[crfd].eq = false; - ctx.cr[crfd].so = false; + ctx.cr[crfd] = crate::context::CrField { lt: false, gt: true, eq: false, so: false }; } else { - ctx.cr[crfd].lt = false; - ctx.cr[crfd].gt = false; - ctx.cr[crfd].eq = true; - ctx.cr[crfd].so = false; + ctx.cr[crfd] = crate::context::CrField { lt: false, gt: false, eq: true, so: false }; } + // Also mirror the comparison result into FPSCR[FPRF (FL/FG/FE/FU)]. + let fprf = if fra.is_nan() || frb.is_nan() { + 0b0_0001 + } else if fra < frb { + 0b0_1000 + } else if fra > frb { + 0b0_0100 + } else { + 0b0_0010 + }; + fpscr::set_fprf(ctx, fprf); ctx.pc += 4; } PpcOpcode::fcmpo => { - // Same as fcmpu but sets FPSCR exception bits for QNaN (not modeled yet) + // Ordered compare: like fcmpu but also sets VXVC on QNaN (or VXSNAN on SNaN). let fra = ctx.fpr[instr.ra()]; let frb = ctx.fpr[instr.rb()]; let crfd = instr.crfd(); if fra.is_nan() || frb.is_nan() { - ctx.cr[crfd].lt = false; - ctx.cr[crfd].gt = false; - ctx.cr[crfd].eq = false; - ctx.cr[crfd].so = true; + ctx.cr[crfd] = crate::context::CrField { lt: false, gt: false, eq: false, so: true }; + if fpscr::is_snan(fra) || fpscr::is_snan(frb) { + fpscr::set_exception(ctx, fpscr::VXSNAN | fpscr::VXVC); + } else { + fpscr::set_exception(ctx, fpscr::VXVC); + } } else if fra < frb { - ctx.cr[crfd].lt = true; - ctx.cr[crfd].gt = false; - ctx.cr[crfd].eq = false; - ctx.cr[crfd].so = false; + ctx.cr[crfd] = crate::context::CrField { lt: true, gt: false, eq: false, so: false }; } else if fra > frb { - ctx.cr[crfd].lt = false; - ctx.cr[crfd].gt = true; - ctx.cr[crfd].eq = false; - ctx.cr[crfd].so = false; + ctx.cr[crfd] = crate::context::CrField { lt: false, gt: true, eq: false, so: false }; } else { - ctx.cr[crfd].lt = false; - ctx.cr[crfd].gt = false; - ctx.cr[crfd].eq = true; - ctx.cr[crfd].so = false; + ctx.cr[crfd] = crate::context::CrField { lt: false, gt: false, eq: true, so: false }; } + let fprf = if fra.is_nan() || frb.is_nan() { + 0b0_0001 + } else if fra < frb { + 0b0_1000 + } else if fra > frb { + 0b0_0100 + } else { + 0b0_0010 + }; + fpscr::set_fprf(ctx, fprf); ctx.pc += 4; } @@ -2166,7 +2683,7 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst } PpcOpcode::mtfsfx => { // Move to FPSCR fields: fm mask in bits 7-14, frB value - let fm = ((instr.raw >> 17) & 0xFF) as u32; + let fm = (instr.raw >> 17) & 0xFF; let val = ctx.fpr[instr.rb()].to_bits() as u32; let mut mask = 0u32; for i in 0..8 { @@ -2181,27 +2698,1559 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst PpcOpcode::mtfsb0x => { // Clear FPSCR bit crbd let bit = instr.crbd(); - ctx.fpscr &= !(1 << (31 - bit as u32)); + ctx.fpscr &= !(1 << (31 - bit)); if instr.rc_bit() { update_cr1_from_fpscr(ctx); } ctx.pc += 4; } PpcOpcode::mtfsb1x => { // Set FPSCR bit crbd let bit = instr.crbd(); - ctx.fpscr |= 1 << (31 - bit as u32); + ctx.fpscr |= 1 << (31 - bit); if instr.rc_bit() { update_cr1_from_fpscr(ctx); } ctx.pc += 4; } PpcOpcode::mtfsfix => { // Move to FPSCR field immediate: crfD = IMM (4 bits) let crfd = instr.crfd(); - let imm = ((instr.raw >> 12) & 0xF) as u32; + let imm = (instr.raw >> 12) & 0xF; let shift = 28 - crfd as u32 * 4; ctx.fpscr = (ctx.fpscr & !(0xF << shift)) | (imm << shift); if instr.rc_bit() { update_cr1_from_fpscr(ctx); } ctx.pc += 4; } + // ═════════════════════════════════════════════════════════════════ + // §4b — Unaligned vector load/store + // ═════════════════════════════════════════════════════════════════ + // lvlx / lvlx128 / lvlxl / lvlxl128: load left-aligned from EA. + PpcOpcode::lvlx | PpcOpcode::lvlxl => { + let ea = ea_indexed(ctx, instr); + ctx.vr[instr.rd()] = crate::vmx::load_vector_left(mem, ea); + ctx.pc += 4; + } + PpcOpcode::lvlx128 | PpcOpcode::lvlxl128 => { + let ea = ea_indexed(ctx, instr); + ctx.vr[instr.vd128()] = crate::vmx::load_vector_left(mem, ea); + ctx.pc += 4; + } + PpcOpcode::lvrx | PpcOpcode::lvrxl => { + let ea = ea_indexed(ctx, instr); + ctx.vr[instr.rd()] = crate::vmx::load_vector_right(mem, ea); + ctx.pc += 4; + } + PpcOpcode::lvrx128 | PpcOpcode::lvrxl128 => { + let ea = ea_indexed(ctx, instr); + ctx.vr[instr.vd128()] = crate::vmx::load_vector_right(mem, ea); + ctx.pc += 4; + } + PpcOpcode::stvlx | PpcOpcode::stvlxl => { + let ea = ea_indexed(ctx, instr); + crate::vmx::store_vector_left(mem, ea, ctx.vr[instr.rs()]); + ctx.pc += 4; + } + PpcOpcode::stvlx128 | PpcOpcode::stvlxl128 => { + let ea = ea_indexed(ctx, instr); + crate::vmx::store_vector_left(mem, ea, ctx.vr[instr.vs128()]); + ctx.pc += 4; + } + PpcOpcode::stvrx | PpcOpcode::stvrxl => { + let ea = ea_indexed(ctx, instr); + crate::vmx::store_vector_right(mem, ea, ctx.vr[instr.rs()]); + ctx.pc += 4; + } + PpcOpcode::stvrx128 | PpcOpcode::stvrxl128 => { + let ea = ea_indexed(ctx, instr); + crate::vmx::store_vector_right(mem, ea, ctx.vr[instr.vs128()]); + ctx.pc += 4; + } + // lvewx128 / stvewx128: VMX128 element-indexed 32-bit load/store. + // Like lvewx the whole 16 bytes at the aligned EA go into VD; the + // element-of-interest is implied by EA's low bits. + PpcOpcode::lvewx128 => { + let ea = ea_indexed(ctx, instr) & !0xF; + let mut bytes = [0u8; 16]; + for i in 0..16 { bytes[i] = mem.read_u8(ea + i as u32); } + ctx.vr[instr.vd128()] = xenia_types::Vec128::from_bytes(bytes); + ctx.pc += 4; + } + PpcOpcode::stvewx128 => { + let ea = ea_indexed(ctx, instr) & !0xF; + let bytes = ctx.vr[instr.vs128()].as_bytes(); + for i in 0..16 { mem.write_u8(ea + i as u32, bytes[i]); } + ctx.pc += 4; + } + + // ═════════════════════════════════════════════════════════════════ + // §4a — VMX integer add/sub (modulo and saturating), mul, avg, sum + // ═════════════════════════════════════════════════════════════════ + // -------- modulo add/sub (byte/halfword/word) -------- + PpcOpcode::vaddubm => { + let a = ctx.vr[instr.ra()].as_bytes(); + let b = ctx.vr[instr.rb()].as_bytes(); + let mut r = [0u8; 16]; + for i in 0..16 { r[i] = a[i].wrapping_add(b[i]); } + ctx.vr[instr.rd()] = xenia_types::Vec128::from_bytes(r); + ctx.pc += 4; + } + PpcOpcode::vsububm => { + let a = ctx.vr[instr.ra()].as_bytes(); + let b = ctx.vr[instr.rb()].as_bytes(); + let mut r = [0u8; 16]; + for i in 0..16 { r[i] = a[i].wrapping_sub(b[i]); } + ctx.vr[instr.rd()] = xenia_types::Vec128::from_bytes(r); + ctx.pc += 4; + } + PpcOpcode::vadduhm => { + let a = ctx.vr[instr.ra()].as_u16x8(); + let b = ctx.vr[instr.rb()].as_u16x8(); + let mut r = [0u16; 8]; + for i in 0..8 { r[i] = a[i].wrapping_add(b[i]); } + ctx.vr[instr.rd()] = xenia_types::Vec128::from_u16x8_array(r); + ctx.pc += 4; + } + PpcOpcode::vsubuhm => { + let a = ctx.vr[instr.ra()].as_u16x8(); + let b = ctx.vr[instr.rb()].as_u16x8(); + let mut r = [0u16; 8]; + for i in 0..8 { r[i] = a[i].wrapping_sub(b[i]); } + ctx.vr[instr.rd()] = xenia_types::Vec128::from_u16x8_array(r); + ctx.pc += 4; + } + // vadduwm / vsubuwm are implemented above (modulo word add/sub arms). + + // -------- saturating add/sub (signed + unsigned) -------- + PpcOpcode::vaddubs => { + let a = ctx.vr[instr.ra()].as_bytes(); + let b = ctx.vr[instr.rb()].as_bytes(); + let mut r = [0u8; 16]; + let mut sat = false; + for i in 0..16 { + let (v, s) = crate::vmx::sat_add_u8(a[i], b[i]); + r[i] = v; sat |= s; + } + if sat { ctx.set_vscr_sat(true); } + ctx.vr[instr.rd()] = xenia_types::Vec128::from_bytes(r); + ctx.pc += 4; + } + PpcOpcode::vsububs => { + let a = ctx.vr[instr.ra()].as_bytes(); + let b = ctx.vr[instr.rb()].as_bytes(); + let mut r = [0u8; 16]; let mut sat = false; + for i in 0..16 { + let (v, s) = crate::vmx::sat_sub_u8(a[i], b[i]); + r[i] = v; sat |= s; + } + if sat { ctx.set_vscr_sat(true); } + ctx.vr[instr.rd()] = xenia_types::Vec128::from_bytes(r); + ctx.pc += 4; + } + PpcOpcode::vaddsbs => { + let a = crate::vmx::as_i8x16(ctx.vr[instr.ra()]); + let b = crate::vmx::as_i8x16(ctx.vr[instr.rb()]); + let mut r = [0i8; 16]; let mut sat = false; + for i in 0..16 { + let (v, s) = crate::vmx::sat_add_i8(a[i], b[i]); + r[i] = v; sat |= s; + } + if sat { ctx.set_vscr_sat(true); } + ctx.vr[instr.rd()] = crate::vmx::from_i8x16(r); + ctx.pc += 4; + } + PpcOpcode::vsubsbs => { + let a = crate::vmx::as_i8x16(ctx.vr[instr.ra()]); + let b = crate::vmx::as_i8x16(ctx.vr[instr.rb()]); + let mut r = [0i8; 16]; let mut sat = false; + for i in 0..16 { + let (v, s) = crate::vmx::sat_sub_i8(a[i], b[i]); + r[i] = v; sat |= s; + } + if sat { ctx.set_vscr_sat(true); } + ctx.vr[instr.rd()] = crate::vmx::from_i8x16(r); + ctx.pc += 4; + } + PpcOpcode::vadduhs => { + let a = ctx.vr[instr.ra()].as_u16x8(); + let b = ctx.vr[instr.rb()].as_u16x8(); + let mut r = [0u16; 8]; let mut sat = false; + for i in 0..8 { + let (v, s) = crate::vmx::sat_add_u16(a[i], b[i]); + r[i] = v; sat |= s; + } + if sat { ctx.set_vscr_sat(true); } + ctx.vr[instr.rd()] = xenia_types::Vec128::from_u16x8_array(r); + ctx.pc += 4; + } + PpcOpcode::vsubuhs => { + let a = ctx.vr[instr.ra()].as_u16x8(); + let b = ctx.vr[instr.rb()].as_u16x8(); + let mut r = [0u16; 8]; let mut sat = false; + for i in 0..8 { + let (v, s) = crate::vmx::sat_sub_u16(a[i], b[i]); + r[i] = v; sat |= s; + } + if sat { ctx.set_vscr_sat(true); } + ctx.vr[instr.rd()] = xenia_types::Vec128::from_u16x8_array(r); + ctx.pc += 4; + } + PpcOpcode::vaddshs => { + let a = crate::vmx::as_i16x8(ctx.vr[instr.ra()]); + let b = crate::vmx::as_i16x8(ctx.vr[instr.rb()]); + let mut r = [0i16; 8]; let mut sat = false; + for i in 0..8 { + let (v, s) = crate::vmx::sat_add_i16(a[i], b[i]); + r[i] = v; sat |= s; + } + if sat { ctx.set_vscr_sat(true); } + ctx.vr[instr.rd()] = crate::vmx::from_i16x8(r); + ctx.pc += 4; + } + PpcOpcode::vsubshs => { + let a = crate::vmx::as_i16x8(ctx.vr[instr.ra()]); + let b = crate::vmx::as_i16x8(ctx.vr[instr.rb()]); + let mut r = [0i16; 8]; let mut sat = false; + for i in 0..8 { + let (v, s) = crate::vmx::sat_sub_i16(a[i], b[i]); + r[i] = v; sat |= s; + } + if sat { ctx.set_vscr_sat(true); } + ctx.vr[instr.rd()] = crate::vmx::from_i16x8(r); + ctx.pc += 4; + } + PpcOpcode::vadduws => { + let a = ctx.vr[instr.ra()].as_u32x4(); + let b = ctx.vr[instr.rb()].as_u32x4(); + let mut r = [0u32; 4]; let mut sat = false; + for i in 0..4 { + let (v, s) = crate::vmx::sat_add_u32(a[i], b[i]); + r[i] = v; sat |= s; + } + if sat { ctx.set_vscr_sat(true); } + ctx.vr[instr.rd()] = xenia_types::Vec128::from_u32x4_array(r); + ctx.pc += 4; + } + PpcOpcode::vsubuws => { + let a = ctx.vr[instr.ra()].as_u32x4(); + let b = ctx.vr[instr.rb()].as_u32x4(); + let mut r = [0u32; 4]; let mut sat = false; + for i in 0..4 { + let (v, s) = crate::vmx::sat_sub_u32(a[i], b[i]); + r[i] = v; sat |= s; + } + if sat { ctx.set_vscr_sat(true); } + ctx.vr[instr.rd()] = xenia_types::Vec128::from_u32x4_array(r); + ctx.pc += 4; + } + PpcOpcode::vaddsws => { + let a = crate::vmx::as_i32x4(ctx.vr[instr.ra()]); + let b = crate::vmx::as_i32x4(ctx.vr[instr.rb()]); + let mut r = [0i32; 4]; let mut sat = false; + for i in 0..4 { + let (v, s) = crate::vmx::sat_add_i32(a[i], b[i]); + r[i] = v; sat |= s; + } + if sat { ctx.set_vscr_sat(true); } + ctx.vr[instr.rd()] = crate::vmx::from_i32x4(r); + ctx.pc += 4; + } + PpcOpcode::vsubsws => { + let a = crate::vmx::as_i32x4(ctx.vr[instr.ra()]); + let b = crate::vmx::as_i32x4(ctx.vr[instr.rb()]); + let mut r = [0i32; 4]; let mut sat = false; + for i in 0..4 { + let (v, s) = crate::vmx::sat_sub_i32(a[i], b[i]); + r[i] = v; sat |= s; + } + if sat { ctx.set_vscr_sat(true); } + ctx.vr[instr.rd()] = crate::vmx::from_i32x4(r); + ctx.pc += 4; + } + + // -------- vaddcuw / vsubcuw: per-lane carry / borrow out -------- + PpcOpcode::vaddcuw => { + let a = ctx.vr[instr.ra()].as_u32x4(); + let b = ctx.vr[instr.rb()].as_u32x4(); + let mut r = [0u32; 4]; + for i in 0..4 { + let (_, c) = a[i].overflowing_add(b[i]); + r[i] = if c { 1 } else { 0 }; + } + ctx.vr[instr.rd()] = xenia_types::Vec128::from_u32x4_array(r); + ctx.pc += 4; + } + PpcOpcode::vsubcuw => { + // "Subtract Carryout": r = 1 if a >= b (no borrow), 0 otherwise. + let a = ctx.vr[instr.ra()].as_u32x4(); + let b = ctx.vr[instr.rb()].as_u32x4(); + let mut r = [0u32; 4]; + for i in 0..4 { r[i] = if a[i] >= b[i] { 1 } else { 0 }; } + ctx.vr[instr.rd()] = xenia_types::Vec128::from_u32x4_array(r); + ctx.pc += 4; + } + + // -------- averages -------- + PpcOpcode::vavgub => { + let a = ctx.vr[instr.ra()].as_bytes(); + let b = ctx.vr[instr.rb()].as_bytes(); + let mut r = [0u8; 16]; + for i in 0..16 { r[i] = crate::vmx::avg_u8(a[i], b[i]); } + ctx.vr[instr.rd()] = xenia_types::Vec128::from_bytes(r); + ctx.pc += 4; + } + PpcOpcode::vavgsb => { + let a = crate::vmx::as_i8x16(ctx.vr[instr.ra()]); + let b = crate::vmx::as_i8x16(ctx.vr[instr.rb()]); + let mut r = [0i8; 16]; + for i in 0..16 { r[i] = crate::vmx::avg_i8(a[i], b[i]); } + ctx.vr[instr.rd()] = crate::vmx::from_i8x16(r); + ctx.pc += 4; + } + PpcOpcode::vavguh => { + let a = ctx.vr[instr.ra()].as_u16x8(); + let b = ctx.vr[instr.rb()].as_u16x8(); + let mut r = [0u16; 8]; + for i in 0..8 { r[i] = crate::vmx::avg_u16(a[i], b[i]); } + ctx.vr[instr.rd()] = xenia_types::Vec128::from_u16x8_array(r); + ctx.pc += 4; + } + PpcOpcode::vavgsh => { + let a = crate::vmx::as_i16x8(ctx.vr[instr.ra()]); + let b = crate::vmx::as_i16x8(ctx.vr[instr.rb()]); + let mut r = [0i16; 8]; + for i in 0..8 { r[i] = crate::vmx::avg_i16(a[i], b[i]); } + ctx.vr[instr.rd()] = crate::vmx::from_i16x8(r); + ctx.pc += 4; + } + PpcOpcode::vavguw => { + let a = ctx.vr[instr.ra()].as_u32x4(); + let b = ctx.vr[instr.rb()].as_u32x4(); + let mut r = [0u32; 4]; + for i in 0..4 { r[i] = crate::vmx::avg_u32(a[i], b[i]); } + ctx.vr[instr.rd()] = xenia_types::Vec128::from_u32x4_array(r); + ctx.pc += 4; + } + PpcOpcode::vavgsw => { + let a = crate::vmx::as_i32x4(ctx.vr[instr.ra()]); + let b = crate::vmx::as_i32x4(ctx.vr[instr.rb()]); + let mut r = [0i32; 4]; + for i in 0..4 { r[i] = crate::vmx::avg_i32(a[i], b[i]); } + ctx.vr[instr.rd()] = crate::vmx::from_i32x4(r); + ctx.pc += 4; + } + + // -------- multiplies (even / odd lanes — see §5 hazard note) -------- + // vmuleub: even u8 lanes (BE index 0,2,4,...,14) → u16 lanes. + PpcOpcode::vmuleub => { + let a = ctx.vr[instr.ra()].as_bytes(); + let b = ctx.vr[instr.rb()].as_bytes(); + let mut r = [0u16; 8]; + for i in 0..8 { r[i] = a[2 * i] as u16 * b[2 * i] as u16; } + ctx.vr[instr.rd()] = xenia_types::Vec128::from_u16x8_array(r); + ctx.pc += 4; + } + PpcOpcode::vmuloub => { + let a = ctx.vr[instr.ra()].as_bytes(); + let b = ctx.vr[instr.rb()].as_bytes(); + let mut r = [0u16; 8]; + for i in 0..8 { r[i] = a[2 * i + 1] as u16 * b[2 * i + 1] as u16; } + ctx.vr[instr.rd()] = xenia_types::Vec128::from_u16x8_array(r); + ctx.pc += 4; + } + PpcOpcode::vmulesb => { + let a = crate::vmx::as_i8x16(ctx.vr[instr.ra()]); + let b = crate::vmx::as_i8x16(ctx.vr[instr.rb()]); + let mut r = [0i16; 8]; + for i in 0..8 { r[i] = a[2 * i] as i16 * b[2 * i] as i16; } + ctx.vr[instr.rd()] = crate::vmx::from_i16x8(r); + ctx.pc += 4; + } + PpcOpcode::vmulosb => { + let a = crate::vmx::as_i8x16(ctx.vr[instr.ra()]); + let b = crate::vmx::as_i8x16(ctx.vr[instr.rb()]); + let mut r = [0i16; 8]; + for i in 0..8 { r[i] = a[2 * i + 1] as i16 * b[2 * i + 1] as i16; } + ctx.vr[instr.rd()] = crate::vmx::from_i16x8(r); + ctx.pc += 4; + } + PpcOpcode::vmuleuh => { + let a = ctx.vr[instr.ra()].as_u16x8(); + let b = ctx.vr[instr.rb()].as_u16x8(); + let mut r = [0u32; 4]; + for i in 0..4 { r[i] = a[2 * i] as u32 * b[2 * i] as u32; } + ctx.vr[instr.rd()] = xenia_types::Vec128::from_u32x4_array(r); + ctx.pc += 4; + } + PpcOpcode::vmulouh => { + let a = ctx.vr[instr.ra()].as_u16x8(); + let b = ctx.vr[instr.rb()].as_u16x8(); + let mut r = [0u32; 4]; + for i in 0..4 { r[i] = a[2 * i + 1] as u32 * b[2 * i + 1] as u32; } + ctx.vr[instr.rd()] = xenia_types::Vec128::from_u32x4_array(r); + ctx.pc += 4; + } + PpcOpcode::vmulesh => { + let a = crate::vmx::as_i16x8(ctx.vr[instr.ra()]); + let b = crate::vmx::as_i16x8(ctx.vr[instr.rb()]); + let mut r = [0i32; 4]; + for i in 0..4 { r[i] = a[2 * i] as i32 * b[2 * i] as i32; } + ctx.vr[instr.rd()] = crate::vmx::from_i32x4(r); + ctx.pc += 4; + } + PpcOpcode::vmulosh => { + let a = crate::vmx::as_i16x8(ctx.vr[instr.ra()]); + let b = crate::vmx::as_i16x8(ctx.vr[instr.rb()]); + let mut r = [0i32; 4]; + for i in 0..4 { r[i] = a[2 * i + 1] as i32 * b[2 * i + 1] as i32; } + ctx.vr[instr.rd()] = crate::vmx::from_i32x4(r); + ctx.pc += 4; + } + + // -------- multiply-add halfword (saturating) -------- + PpcOpcode::vmhaddshs => { + // vD[i] = sat_i16((vA[i] * vB[i]) >> 15 + vC[i]) + let a = crate::vmx::as_i16x8(ctx.vr[instr.ra()]); + let b = crate::vmx::as_i16x8(ctx.vr[instr.rb()]); + let c = crate::vmx::as_i16x8(ctx.vr[instr.rc()]); + let mut r = [0i16; 8]; let mut sat = false; + for i in 0..8 { + let prod = (a[i] as i32 * b[i] as i32) >> 15; + let (v, s) = crate::vmx::sat_i32_to_i16(prod + c[i] as i32); + r[i] = v; sat |= s; + } + if sat { ctx.set_vscr_sat(true); } + ctx.vr[instr.rd()] = crate::vmx::from_i16x8(r); + ctx.pc += 4; + } + PpcOpcode::vmhraddshs => { + // Rounded multiply-add: (vA[i]*vB[i] + 0x4000) >> 15 + vC[i], saturating. + let a = crate::vmx::as_i16x8(ctx.vr[instr.ra()]); + let b = crate::vmx::as_i16x8(ctx.vr[instr.rb()]); + let c = crate::vmx::as_i16x8(ctx.vr[instr.rc()]); + let mut r = [0i16; 8]; let mut sat = false; + for i in 0..8 { + let prod = (a[i] as i32 * b[i] as i32 + 0x4000) >> 15; + let (v, s) = crate::vmx::sat_i32_to_i16(prod + c[i] as i32); + r[i] = v; sat |= s; + } + if sat { ctx.set_vscr_sat(true); } + ctx.vr[instr.rd()] = crate::vmx::from_i16x8(r); + ctx.pc += 4; + } + PpcOpcode::vmladduhm => { + // Multiply-low add (modulo): vD[i] = u16(vA[i] * vB[i] + vC[i]). + let a = ctx.vr[instr.ra()].as_u16x8(); + let b = ctx.vr[instr.rb()].as_u16x8(); + let c = ctx.vr[instr.rc()].as_u16x8(); + let mut r = [0u16; 8]; + for i in 0..8 { + r[i] = a[i].wrapping_mul(b[i]).wrapping_add(c[i]); + } + ctx.vr[instr.rd()] = xenia_types::Vec128::from_u16x8_array(r); + ctx.pc += 4; + } + + // -------- VMX sum-of-products -------- + // vmsumubm: vD[i:u32] = sum over j in [0..4] of vA[4i+j:u8] * vB[4i+j:u8] + vC[i]. + PpcOpcode::vmsumubm => { + let a = ctx.vr[instr.ra()].as_bytes(); + let b = ctx.vr[instr.rb()].as_bytes(); + let c = ctx.vr[instr.rc()].as_u32x4(); + let mut r = [0u32; 4]; + for i in 0..4 { + let mut s = c[i]; + for j in 0..4 { + s = s.wrapping_add(a[4*i+j] as u32 * b[4*i+j] as u32); + } + r[i] = s; + } + ctx.vr[instr.rd()] = xenia_types::Vec128::from_u32x4_array(r); + ctx.pc += 4; + } + PpcOpcode::vmsummbm => { + // signed bytes × unsigned bytes, signed accumulator + let a = crate::vmx::as_i8x16(ctx.vr[instr.ra()]); + let b = ctx.vr[instr.rb()].as_bytes(); + let c = crate::vmx::as_i32x4(ctx.vr[instr.rc()]); + let mut r = [0i32; 4]; + for i in 0..4 { + let mut s = c[i]; + for j in 0..4 { + s = s.wrapping_add(a[4*i+j] as i32 * b[4*i+j] as i32); + } + r[i] = s; + } + ctx.vr[instr.rd()] = crate::vmx::from_i32x4(r); + ctx.pc += 4; + } + PpcOpcode::vmsumuhm => { + let a = ctx.vr[instr.ra()].as_u16x8(); + let b = ctx.vr[instr.rb()].as_u16x8(); + let c = ctx.vr[instr.rc()].as_u32x4(); + let mut r = [0u32; 4]; + for i in 0..4 { + let s = (a[2*i] as u32 * b[2*i] as u32) + .wrapping_add(a[2*i+1] as u32 * b[2*i+1] as u32) + .wrapping_add(c[i]); + r[i] = s; + } + ctx.vr[instr.rd()] = xenia_types::Vec128::from_u32x4_array(r); + ctx.pc += 4; + } + PpcOpcode::vmsumuhs => { + let a = ctx.vr[instr.ra()].as_u16x8(); + let b = ctx.vr[instr.rb()].as_u16x8(); + let c = ctx.vr[instr.rc()].as_u32x4(); + let mut r = [0u32; 4]; let mut sat = false; + for i in 0..4 { + let s = (a[2*i] as u64 * b[2*i] as u64) + + (a[2*i+1] as u64 * b[2*i+1] as u64) + + c[i] as u64; + let (v, overflow) = if s > u32::MAX as u64 { (u32::MAX, true) } else { (s as u32, false) }; + r[i] = v; sat |= overflow; + } + if sat { ctx.set_vscr_sat(true); } + ctx.vr[instr.rd()] = xenia_types::Vec128::from_u32x4_array(r); + ctx.pc += 4; + } + PpcOpcode::vmsumshm => { + let a = crate::vmx::as_i16x8(ctx.vr[instr.ra()]); + let b = crate::vmx::as_i16x8(ctx.vr[instr.rb()]); + let c = crate::vmx::as_i32x4(ctx.vr[instr.rc()]); + let mut r = [0i32; 4]; + for i in 0..4 { + let s = (a[2*i] as i32 * b[2*i] as i32) + .wrapping_add(a[2*i+1] as i32 * b[2*i+1] as i32) + .wrapping_add(c[i]); + r[i] = s; + } + ctx.vr[instr.rd()] = crate::vmx::from_i32x4(r); + ctx.pc += 4; + } + PpcOpcode::vmsumshs => { + let a = crate::vmx::as_i16x8(ctx.vr[instr.ra()]); + let b = crate::vmx::as_i16x8(ctx.vr[instr.rb()]); + let c = crate::vmx::as_i32x4(ctx.vr[instr.rc()]); + let mut r = [0i32; 4]; let mut sat = false; + for i in 0..4 { + // Running-sum saturation: accumulate in i64, clamp once at end. + let s = (a[2*i] as i64 * b[2*i] as i64) + + (a[2*i+1] as i64 * b[2*i+1] as i64) + + c[i] as i64; + let (v, o) = crate::vmx::sat_i64_to_i32(s); + r[i] = v; sat |= o; + } + if sat { ctx.set_vscr_sat(true); } + ctx.vr[instr.rd()] = crate::vmx::from_i32x4(r); + ctx.pc += 4; + } + + // -------- VMX sum-across -------- + PpcOpcode::vsumsws => { + // vD[3] = sat_i32(vC[3] + sum over i in 0..4 of vA[i]) + let a = crate::vmx::as_i32x4(ctx.vr[instr.ra()]); + let c = crate::vmx::as_i32x4(ctx.vr[instr.rb()]); + let s = a.iter().map(|&x| x as i64).sum::() + c[3] as i64; + let (v, sat) = crate::vmx::sat_i64_to_i32(s); + if sat { ctx.set_vscr_sat(true); } + ctx.vr[instr.rd()] = crate::vmx::from_i32x4([0, 0, 0, v]); + ctx.pc += 4; + } + PpcOpcode::vsum2sws => { + // Two 2-word partial sums at lanes 1 and 3. + let a = crate::vmx::as_i32x4(ctx.vr[instr.ra()]); + let c = crate::vmx::as_i32x4(ctx.vr[instr.rb()]); + let s0 = a[0] as i64 + a[1] as i64 + c[1] as i64; + let s1 = a[2] as i64 + a[3] as i64 + c[3] as i64; + let (v0, sat0) = crate::vmx::sat_i64_to_i32(s0); + let (v1, sat1) = crate::vmx::sat_i64_to_i32(s1); + if sat0 | sat1 { ctx.set_vscr_sat(true); } + ctx.vr[instr.rd()] = crate::vmx::from_i32x4([0, v0, 0, v1]); + ctx.pc += 4; + } + PpcOpcode::vsum4sbs => { + let a = crate::vmx::as_i8x16(ctx.vr[instr.ra()]); + let c = crate::vmx::as_i32x4(ctx.vr[instr.rb()]); + let mut r = [0i32; 4]; let mut sat = false; + for i in 0..4 { + let s = a[4*i] as i64 + a[4*i+1] as i64 + a[4*i+2] as i64 + a[4*i+3] as i64 + c[i] as i64; + let (v, o) = crate::vmx::sat_i64_to_i32(s); + r[i] = v; sat |= o; + } + if sat { ctx.set_vscr_sat(true); } + ctx.vr[instr.rd()] = crate::vmx::from_i32x4(r); + ctx.pc += 4; + } + PpcOpcode::vsum4ubs => { + let a = ctx.vr[instr.ra()].as_bytes(); + let c = ctx.vr[instr.rb()].as_u32x4(); + let mut r = [0u32; 4]; let mut sat = false; + for i in 0..4 { + let s = a[4*i] as u64 + a[4*i+1] as u64 + a[4*i+2] as u64 + a[4*i+3] as u64 + c[i] as u64; + let (v, o) = if s > u32::MAX as u64 { (u32::MAX, true) } else { (s as u32, false) }; + r[i] = v; sat |= o; + } + if sat { ctx.set_vscr_sat(true); } + ctx.vr[instr.rd()] = xenia_types::Vec128::from_u32x4_array(r); + ctx.pc += 4; + } + PpcOpcode::vsum4shs => { + let a = crate::vmx::as_i16x8(ctx.vr[instr.ra()]); + let c = crate::vmx::as_i32x4(ctx.vr[instr.rb()]); + let mut r = [0i32; 4]; let mut sat = false; + for i in 0..4 { + let s = a[2*i] as i64 + a[2*i+1] as i64 + c[i] as i64; + let (v, o) = crate::vmx::sat_i64_to_i32(s); + r[i] = v; sat |= o; + } + if sat { ctx.set_vscr_sat(true); } + ctx.vr[instr.rd()] = crate::vmx::from_i32x4(r); + ctx.pc += 4; + } + + // ═════════════════════════════════════════════════════════════════ + // §4c — VMX integer compares (all set 0xFF/0xFFFF/0xFFFFFFFF per lane) + // ═════════════════════════════════════════════════════════════════ + PpcOpcode::vcmpequb => { + let a = ctx.vr[instr.ra()].as_bytes(); + let b = ctx.vr[instr.rb()].as_bytes(); + let mut r = [0u8; 16]; + for i in 0..16 { r[i] = if a[i] == b[i] { 0xFF } else { 0 }; } + let v = xenia_types::Vec128::from_bytes(r); + if instr.rc_bit() { + let (t, f) = crate::vmx::cr6_flags_from_mask(v); + ctx.cr[6] = crate::context::CrField { lt: t, gt: false, eq: f, so: false }; + } + ctx.vr[instr.rd()] = v; + ctx.pc += 4; + } + PpcOpcode::vcmpequh => { + let a = ctx.vr[instr.ra()].as_u16x8(); + let b = ctx.vr[instr.rb()].as_u16x8(); + let mut r = [0u16; 8]; + for i in 0..8 { r[i] = if a[i] == b[i] { 0xFFFF } else { 0 }; } + let v = xenia_types::Vec128::from_u16x8_array(r); + if instr.rc_bit() { + let (t, f) = crate::vmx::cr6_flags_from_mask(v); + ctx.cr[6] = crate::context::CrField { lt: t, gt: false, eq: f, so: false }; + } + ctx.vr[instr.rd()] = v; + ctx.pc += 4; + } + PpcOpcode::vcmpgtub => { + let a = ctx.vr[instr.ra()].as_bytes(); + let b = ctx.vr[instr.rb()].as_bytes(); + let mut r = [0u8; 16]; + for i in 0..16 { r[i] = if a[i] > b[i] { 0xFF } else { 0 }; } + let v = xenia_types::Vec128::from_bytes(r); + if instr.rc_bit() { + let (t, f) = crate::vmx::cr6_flags_from_mask(v); + ctx.cr[6] = crate::context::CrField { lt: t, gt: false, eq: f, so: false }; + } + ctx.vr[instr.rd()] = v; + ctx.pc += 4; + } + PpcOpcode::vcmpgtsb => { + let a = crate::vmx::as_i8x16(ctx.vr[instr.ra()]); + let b = crate::vmx::as_i8x16(ctx.vr[instr.rb()]); + let mut r = [0u8; 16]; + for i in 0..16 { r[i] = if a[i] > b[i] { 0xFF } else { 0 }; } + let v = xenia_types::Vec128::from_bytes(r); + if instr.rc_bit() { + let (t, f) = crate::vmx::cr6_flags_from_mask(v); + ctx.cr[6] = crate::context::CrField { lt: t, gt: false, eq: f, so: false }; + } + ctx.vr[instr.rd()] = v; + ctx.pc += 4; + } + PpcOpcode::vcmpgtuh => { + let a = ctx.vr[instr.ra()].as_u16x8(); + let b = ctx.vr[instr.rb()].as_u16x8(); + let mut r = [0u16; 8]; + for i in 0..8 { r[i] = if a[i] > b[i] { 0xFFFF } else { 0 }; } + let v = xenia_types::Vec128::from_u16x8_array(r); + if instr.rc_bit() { + let (t, f) = crate::vmx::cr6_flags_from_mask(v); + ctx.cr[6] = crate::context::CrField { lt: t, gt: false, eq: f, so: false }; + } + ctx.vr[instr.rd()] = v; + ctx.pc += 4; + } + PpcOpcode::vcmpgtsh => { + let a = crate::vmx::as_i16x8(ctx.vr[instr.ra()]); + let b = crate::vmx::as_i16x8(ctx.vr[instr.rb()]); + let mut r = [0u16; 8]; + for i in 0..8 { r[i] = if a[i] > b[i] { 0xFFFF } else { 0 }; } + let v = xenia_types::Vec128::from_u16x8_array(r); + if instr.rc_bit() { + let (t, f) = crate::vmx::cr6_flags_from_mask(v); + ctx.cr[6] = crate::context::CrField { lt: t, gt: false, eq: f, so: false }; + } + ctx.vr[instr.rd()] = v; + ctx.pc += 4; + } + PpcOpcode::vcmpgtuw => { + let a = ctx.vr[instr.ra()].as_u32x4(); + let b = ctx.vr[instr.rb()].as_u32x4(); + let mut r = [0u32; 4]; + for i in 0..4 { r[i] = if a[i] > b[i] { 0xFFFFFFFF } else { 0 }; } + let v = xenia_types::Vec128::from_u32x4_array(r); + if instr.rc_bit() { update_cr6_from_vmask(&r, ctx); } + ctx.vr[instr.rd()] = v; + ctx.pc += 4; + } + PpcOpcode::vcmpgtsw => { + let a = crate::vmx::as_i32x4(ctx.vr[instr.ra()]); + let b = crate::vmx::as_i32x4(ctx.vr[instr.rb()]); + let mut r = [0u32; 4]; + for i in 0..4 { r[i] = if a[i] > b[i] { 0xFFFFFFFF } else { 0 }; } + let v = xenia_types::Vec128::from_u32x4_array(r); + if instr.rc_bit() { update_cr6_from_vmask(&r, ctx); } + ctx.vr[instr.rd()] = v; + ctx.pc += 4; + } + // vcmpbfp(128): set upper/lower nibbles per lane based on bounds test. + PpcOpcode::vcmpbfp | PpcOpcode::vcmpbfp128 => { + let is_128 = matches!(instr.opcode, PpcOpcode::vcmpbfp128); + let (ra, rb, rd) = if is_128 { + (instr.va128(), instr.vb128(), instr.vd128()) + } else { + (instr.ra(), instr.rb(), instr.rd()) + }; + let a = ctx.vr[ra].as_f32x4(); + let b = ctx.vr[rb].as_f32x4(); + let mut r = [0u32; 4]; + let mut any_out = false; + for i in 0..4 { + let mut lane: u32 = 0; + if a[i].is_nan() || b[i].is_nan() || a[i] > b[i] { lane |= 0x8000_0000; any_out = true; } + if a[i].is_nan() || b[i].is_nan() || a[i] < -b[i] { lane |= 0x4000_0000; any_out = true; } + r[i] = lane; + } + if instr.rc_bit() { + ctx.cr[6] = crate::context::CrField { + lt: false, gt: false, eq: !any_out, so: false, + }; + } + ctx.vr[rd] = xenia_types::Vec128::from_u32x4_array(r); + ctx.pc += 4; + } + + // ═════════════════════════════════════════════════════════════════ + // §4d — VMX shifts and rotates + // ═════════════════════════════════════════════════════════════════ + PpcOpcode::vslb => { + let a = ctx.vr[instr.ra()].as_bytes(); + let b = ctx.vr[instr.rb()].as_bytes(); + let mut r = [0u8; 16]; + for i in 0..16 { r[i] = a[i] << (b[i] & 7); } + ctx.vr[instr.rd()] = xenia_types::Vec128::from_bytes(r); + ctx.pc += 4; + } + PpcOpcode::vsrb => { + let a = ctx.vr[instr.ra()].as_bytes(); + let b = ctx.vr[instr.rb()].as_bytes(); + let mut r = [0u8; 16]; + for i in 0..16 { r[i] = a[i] >> (b[i] & 7); } + ctx.vr[instr.rd()] = xenia_types::Vec128::from_bytes(r); + ctx.pc += 4; + } + PpcOpcode::vsrab => { + let a = crate::vmx::as_i8x16(ctx.vr[instr.ra()]); + let b = ctx.vr[instr.rb()].as_bytes(); + let mut r = [0i8; 16]; + for i in 0..16 { r[i] = a[i] >> (b[i] & 7); } + ctx.vr[instr.rd()] = crate::vmx::from_i8x16(r); + ctx.pc += 4; + } + PpcOpcode::vrlb => { + let a = ctx.vr[instr.ra()].as_bytes(); + let b = ctx.vr[instr.rb()].as_bytes(); + let mut r = [0u8; 16]; + for i in 0..16 { r[i] = a[i].rotate_left((b[i] & 7) as u32); } + ctx.vr[instr.rd()] = xenia_types::Vec128::from_bytes(r); + ctx.pc += 4; + } + PpcOpcode::vslh => { + let a = ctx.vr[instr.ra()].as_u16x8(); + let b = ctx.vr[instr.rb()].as_u16x8(); + let mut r = [0u16; 8]; + for i in 0..8 { r[i] = a[i] << (b[i] & 0xF); } + ctx.vr[instr.rd()] = xenia_types::Vec128::from_u16x8_array(r); + ctx.pc += 4; + } + PpcOpcode::vsrh => { + let a = ctx.vr[instr.ra()].as_u16x8(); + let b = ctx.vr[instr.rb()].as_u16x8(); + let mut r = [0u16; 8]; + for i in 0..8 { r[i] = a[i] >> (b[i] & 0xF); } + ctx.vr[instr.rd()] = xenia_types::Vec128::from_u16x8_array(r); + ctx.pc += 4; + } + PpcOpcode::vsrah => { + let a = crate::vmx::as_i16x8(ctx.vr[instr.ra()]); + let b = ctx.vr[instr.rb()].as_u16x8(); + let mut r = [0i16; 8]; + for i in 0..8 { r[i] = a[i] >> (b[i] & 0xF); } + ctx.vr[instr.rd()] = crate::vmx::from_i16x8(r); + ctx.pc += 4; + } + PpcOpcode::vrlh => { + let a = ctx.vr[instr.ra()].as_u16x8(); + let b = ctx.vr[instr.rb()].as_u16x8(); + let mut r = [0u16; 8]; + for i in 0..8 { r[i] = a[i].rotate_left((b[i] & 0xF) as u32); } + ctx.vr[instr.rd()] = xenia_types::Vec128::from_u16x8_array(r); + ctx.pc += 4; + } + // vslw / vsrw / vsraw / vrlw (word shifts) are implemented above via + // vmx_reg_triple — skip here. + + // Full 128-bit bit shifts (vsl/vsr): shift by the low 3 bits of vB[15]. + PpcOpcode::vsl => { + let a = u128::from_be_bytes(ctx.vr[instr.ra()].as_bytes()); + let shift = (ctx.vr[instr.rb()].as_bytes()[15] & 7) as u32; + let r = if shift == 0 { a } else { a << shift }; + ctx.vr[instr.rd()] = xenia_types::Vec128::from_bytes(r.to_be_bytes()); + ctx.pc += 4; + } + PpcOpcode::vsr => { + let a = u128::from_be_bytes(ctx.vr[instr.ra()].as_bytes()); + let shift = (ctx.vr[instr.rb()].as_bytes()[15] & 7) as u32; + let r = if shift == 0 { a } else { a >> shift }; + ctx.vr[instr.rd()] = xenia_types::Vec128::from_bytes(r.to_be_bytes()); + ctx.pc += 4; + } + // vslo/vsro: 128-bit octet (byte) shift. vB[15] & 0x78 gives bit count / 8 * 8. + PpcOpcode::vslo | PpcOpcode::vslo128 => { + let is_128 = matches!(instr.opcode, PpcOpcode::vslo128); + let (ra, rb, rd) = if is_128 { (instr.va128(), instr.vb128(), instr.vd128()) } + else { (instr.ra(), instr.rb(), instr.rd()) }; + let a = u128::from_be_bytes(ctx.vr[ra].as_bytes()); + let nbytes = ((ctx.vr[rb].as_bytes()[15] >> 3) & 0xF) as u32; + let r = if nbytes == 0 { a } else { a << (nbytes * 8) }; + ctx.vr[rd] = xenia_types::Vec128::from_bytes(r.to_be_bytes()); + ctx.pc += 4; + } + PpcOpcode::vsro | PpcOpcode::vsro128 => { + let is_128 = matches!(instr.opcode, PpcOpcode::vsro128); + let (ra, rb, rd) = if is_128 { (instr.va128(), instr.vb128(), instr.vd128()) } + else { (instr.ra(), instr.rb(), instr.rd()) }; + let a = u128::from_be_bytes(ctx.vr[ra].as_bytes()); + let nbytes = ((ctx.vr[rb].as_bytes()[15] >> 3) & 0xF) as u32; + let r = if nbytes == 0 { a } else { a >> (nbytes * 8) }; + ctx.vr[rd] = xenia_types::Vec128::from_bytes(r.to_be_bytes()); + ctx.pc += 4; + } + // vrlimi128: rotate-left-immediate then partial-merge into vD. + // Field layout (from canary ppc_decode_data.cc VX128_4): + // imm = bits(22..=23,28..=29) for shift, mask = bits(24..=27) + // Simplified semantics: r = vB rotated left by `shift` words, merged + // into vD using a per-word `mask` (mask bit N == 1 ⇒ use vD[N], else + // use rotated[N]). Titles generally use mask=0xF (copy-all) which + // makes this behave like a plain word rotate. + PpcOpcode::vrlimi128 => { + let shift = ((instr.raw >> 16) & 0x3) as usize; + let mask = (instr.raw >> 2) & 0xF; // VX128_4 "fmask" + let b = ctx.vr[instr.vb128()].as_u32x4(); + let d = ctx.vr[instr.vd128()].as_u32x4(); + let rot = [b[shift % 4], b[(shift + 1) % 4], b[(shift + 2) % 4], b[(shift + 3) % 4]]; + let mut r = [0u32; 4]; + for i in 0..4 { + // mask bit 3 corresponds to word 0 (BE-first). Use rot when + // the corresponding mask bit is set. + let use_rot = (mask >> (3 - i)) & 1 == 1; + r[i] = if use_rot { rot[i] } else { d[i] }; + } + ctx.vr[instr.vd128()] = xenia_types::Vec128::from_u32x4_array(r); + ctx.pc += 4; + } + + // ═════════════════════════════════════════════════════════════════ + // §4e — VMX merge (interleave high / low halves) + // ═════════════════════════════════════════════════════════════════ + PpcOpcode::vmrghb => { + let a = ctx.vr[instr.ra()].as_bytes(); + let b = ctx.vr[instr.rb()].as_bytes(); + let mut r = [0u8; 16]; + for i in 0..8 { r[2*i] = a[i]; r[2*i+1] = b[i]; } + ctx.vr[instr.rd()] = xenia_types::Vec128::from_bytes(r); + ctx.pc += 4; + } + PpcOpcode::vmrglb => { + let a = ctx.vr[instr.ra()].as_bytes(); + let b = ctx.vr[instr.rb()].as_bytes(); + let mut r = [0u8; 16]; + for i in 0..8 { r[2*i] = a[8+i]; r[2*i+1] = b[8+i]; } + ctx.vr[instr.rd()] = xenia_types::Vec128::from_bytes(r); + ctx.pc += 4; + } + PpcOpcode::vmrghh => { + let a = ctx.vr[instr.ra()].as_u16x8(); + let b = ctx.vr[instr.rb()].as_u16x8(); + let mut r = [0u16; 8]; + for i in 0..4 { r[2*i] = a[i]; r[2*i+1] = b[i]; } + ctx.vr[instr.rd()] = xenia_types::Vec128::from_u16x8_array(r); + ctx.pc += 4; + } + PpcOpcode::vmrglh => { + let a = ctx.vr[instr.ra()].as_u16x8(); + let b = ctx.vr[instr.rb()].as_u16x8(); + let mut r = [0u16; 8]; + for i in 0..4 { r[2*i] = a[4+i]; r[2*i+1] = b[4+i]; } + ctx.vr[instr.rd()] = xenia_types::Vec128::from_u16x8_array(r); + ctx.pc += 4; + } + + // ═════════════════════════════════════════════════════════════════ + // §4f — VMX pack / unpack (saturating and modulo + D3D + 5-6-5) + // ═════════════════════════════════════════════════════════════════ + // ---- Pack modulo (truncate) ---- + PpcOpcode::vpkuhum | PpcOpcode::vpkuhum128 => { + let is_128 = matches!(instr.opcode, PpcOpcode::vpkuhum128); + let (ra, rb, rd) = if is_128 { (instr.va128(), instr.vb128(), instr.vd128()) } + else { (instr.ra(), instr.rb(), instr.rd()) }; + let a = ctx.vr[ra].as_u16x8(); + let b = ctx.vr[rb].as_u16x8(); + let mut r = [0u8; 16]; + for i in 0..8 { r[i] = a[i] as u8; } + for i in 0..8 { r[8 + i] = b[i] as u8; } + ctx.vr[rd] = xenia_types::Vec128::from_bytes(r); + ctx.pc += 4; + } + PpcOpcode::vpkuwum | PpcOpcode::vpkuwum128 => { + let is_128 = matches!(instr.opcode, PpcOpcode::vpkuwum128); + let (ra, rb, rd) = if is_128 { (instr.va128(), instr.vb128(), instr.vd128()) } + else { (instr.ra(), instr.rb(), instr.rd()) }; + let a = ctx.vr[ra].as_u32x4(); + let b = ctx.vr[rb].as_u32x4(); + let mut r = [0u16; 8]; + for i in 0..4 { r[i] = a[i] as u16; } + for i in 0..4 { r[4 + i] = b[i] as u16; } + ctx.vr[rd] = xenia_types::Vec128::from_u16x8_array(r); + ctx.pc += 4; + } + // ---- Pack with saturation ---- + PpcOpcode::vpkuhus | PpcOpcode::vpkuhus128 => { + let is_128 = matches!(instr.opcode, PpcOpcode::vpkuhus128); + let (ra, rb, rd) = if is_128 { (instr.va128(), instr.vb128(), instr.vd128()) } + else { (instr.ra(), instr.rb(), instr.rd()) }; + let a = ctx.vr[ra].as_u16x8(); + let b = ctx.vr[rb].as_u16x8(); + let mut r = [0u8; 16]; let mut sat = false; + for i in 0..8 { let (v, s) = crate::vmx::sat_u16_to_u8(a[i]); r[i] = v; sat |= s; } + for i in 0..8 { let (v, s) = crate::vmx::sat_u16_to_u8(b[i]); r[8 + i] = v; sat |= s; } + if sat { ctx.set_vscr_sat(true); } + ctx.vr[rd] = xenia_types::Vec128::from_bytes(r); + ctx.pc += 4; + } + PpcOpcode::vpkshus | PpcOpcode::vpkshus128 => { + let is_128 = matches!(instr.opcode, PpcOpcode::vpkshus128); + let (ra, rb, rd) = if is_128 { (instr.va128(), instr.vb128(), instr.vd128()) } + else { (instr.ra(), instr.rb(), instr.rd()) }; + let a = crate::vmx::as_i16x8(ctx.vr[ra]); + let b = crate::vmx::as_i16x8(ctx.vr[rb]); + let mut r = [0u8; 16]; let mut sat = false; + for i in 0..8 { let (v, s) = crate::vmx::sat_i16_to_u8(a[i]); r[i] = v; sat |= s; } + for i in 0..8 { let (v, s) = crate::vmx::sat_i16_to_u8(b[i]); r[8 + i] = v; sat |= s; } + if sat { ctx.set_vscr_sat(true); } + ctx.vr[rd] = xenia_types::Vec128::from_bytes(r); + ctx.pc += 4; + } + PpcOpcode::vpkshss | PpcOpcode::vpkshss128 => { + let is_128 = matches!(instr.opcode, PpcOpcode::vpkshss128); + let (ra, rb, rd) = if is_128 { (instr.va128(), instr.vb128(), instr.vd128()) } + else { (instr.ra(), instr.rb(), instr.rd()) }; + let a = crate::vmx::as_i16x8(ctx.vr[ra]); + let b = crate::vmx::as_i16x8(ctx.vr[rb]); + let mut r = [0i8; 16]; let mut sat = false; + for i in 0..8 { let (v, s) = crate::vmx::sat_i16_to_i8(a[i]); r[i] = v; sat |= s; } + for i in 0..8 { let (v, s) = crate::vmx::sat_i16_to_i8(b[i]); r[8 + i] = v; sat |= s; } + if sat { ctx.set_vscr_sat(true); } + ctx.vr[rd] = crate::vmx::from_i8x16(r); + ctx.pc += 4; + } + PpcOpcode::vpkuwus | PpcOpcode::vpkuwus128 => { + let is_128 = matches!(instr.opcode, PpcOpcode::vpkuwus128); + let (ra, rb, rd) = if is_128 { (instr.va128(), instr.vb128(), instr.vd128()) } + else { (instr.ra(), instr.rb(), instr.rd()) }; + let a = ctx.vr[ra].as_u32x4(); + let b = ctx.vr[rb].as_u32x4(); + let mut r = [0u16; 8]; let mut sat = false; + for i in 0..4 { let (v, s) = crate::vmx::sat_u32_to_u16(a[i]); r[i] = v; sat |= s; } + for i in 0..4 { let (v, s) = crate::vmx::sat_u32_to_u16(b[i]); r[4 + i] = v; sat |= s; } + if sat { ctx.set_vscr_sat(true); } + ctx.vr[rd] = xenia_types::Vec128::from_u16x8_array(r); + ctx.pc += 4; + } + PpcOpcode::vpkswus | PpcOpcode::vpkswus128 => { + let is_128 = matches!(instr.opcode, PpcOpcode::vpkswus128); + let (ra, rb, rd) = if is_128 { (instr.va128(), instr.vb128(), instr.vd128()) } + else { (instr.ra(), instr.rb(), instr.rd()) }; + let a = crate::vmx::as_i32x4(ctx.vr[ra]); + let b = crate::vmx::as_i32x4(ctx.vr[rb]); + let mut r = [0u16; 8]; let mut sat = false; + for i in 0..4 { let (v, s) = crate::vmx::sat_i32_to_u16(a[i]); r[i] = v; sat |= s; } + for i in 0..4 { let (v, s) = crate::vmx::sat_i32_to_u16(b[i]); r[4 + i] = v; sat |= s; } + if sat { ctx.set_vscr_sat(true); } + ctx.vr[rd] = xenia_types::Vec128::from_u16x8_array(r); + ctx.pc += 4; + } + PpcOpcode::vpkswss | PpcOpcode::vpkswss128 => { + let is_128 = matches!(instr.opcode, PpcOpcode::vpkswss128); + let (ra, rb, rd) = if is_128 { (instr.va128(), instr.vb128(), instr.vd128()) } + else { (instr.ra(), instr.rb(), instr.rd()) }; + let a = crate::vmx::as_i32x4(ctx.vr[ra]); + let b = crate::vmx::as_i32x4(ctx.vr[rb]); + let mut r = [0i16; 8]; let mut sat = false; + for i in 0..4 { let (v, s) = crate::vmx::sat_i32_to_i16(a[i]); r[i] = v; sat |= s; } + for i in 0..4 { let (v, s) = crate::vmx::sat_i32_to_i16(b[i]); r[4 + i] = v; sat |= s; } + if sat { ctx.set_vscr_sat(true); } + ctx.vr[rd] = crate::vmx::from_i16x8(r); + ctx.pc += 4; + } + // vpkpx: pack two u32 vectors into one u16 (5-5-5 pixel) vector. + PpcOpcode::vpkpx => { + let a = ctx.vr[instr.ra()].as_u32x4(); + let b = ctx.vr[instr.rb()].as_u32x4(); + let mut r = [0u16; 8]; + for i in 0..4 { r[i] = crate::vmx::pack_pixel_555(a[i]); } + for i in 0..4 { r[4 + i] = crate::vmx::pack_pixel_555(b[i]); } + ctx.vr[instr.rd()] = xenia_types::Vec128::from_u16x8_array(r); + ctx.pc += 4; + } + + // ---- Unpack (sign-extend) ---- + PpcOpcode::vupkhsb | PpcOpcode::vupkhsb128 => { + let is_128 = matches!(instr.opcode, PpcOpcode::vupkhsb128); + let (rb, rd) = if is_128 { (instr.vb128(), instr.vd128()) } + else { (instr.rb(), instr.rd()) }; + let b = crate::vmx::as_i8x16(ctx.vr[rb]); + let mut r = [0i16; 8]; + for i in 0..8 { r[i] = b[i] as i16; } + ctx.vr[rd] = crate::vmx::from_i16x8(r); + ctx.pc += 4; + } + PpcOpcode::vupklsb | PpcOpcode::vupklsb128 => { + let is_128 = matches!(instr.opcode, PpcOpcode::vupklsb128); + let (rb, rd) = if is_128 { (instr.vb128(), instr.vd128()) } + else { (instr.rb(), instr.rd()) }; + let b = crate::vmx::as_i8x16(ctx.vr[rb]); + let mut r = [0i16; 8]; + for i in 0..8 { r[i] = b[8 + i] as i16; } + ctx.vr[rd] = crate::vmx::from_i16x8(r); + ctx.pc += 4; + } + PpcOpcode::vupkhsh => { + let b = crate::vmx::as_i16x8(ctx.vr[instr.rb()]); + let mut r = [0i32; 4]; + for i in 0..4 { r[i] = b[i] as i32; } + ctx.vr[instr.rd()] = crate::vmx::from_i32x4(r); + ctx.pc += 4; + } + PpcOpcode::vupklsh => { + let b = crate::vmx::as_i16x8(ctx.vr[instr.rb()]); + let mut r = [0i32; 4]; + for i in 0..4 { r[i] = b[4 + i] as i32; } + ctx.vr[instr.rd()] = crate::vmx::from_i32x4(r); + ctx.pc += 4; + } + PpcOpcode::vupkhpx => { + let b = ctx.vr[instr.rb()].as_u16x8(); + let mut r = [0u32; 4]; + for i in 0..4 { r[i] = crate::vmx::unpack_pixel_555(b[i]); } + ctx.vr[instr.rd()] = xenia_types::Vec128::from_u32x4_array(r); + ctx.pc += 4; + } + PpcOpcode::vupklpx => { + let b = ctx.vr[instr.rb()].as_u16x8(); + let mut r = [0u32; 4]; + for i in 0..4 { r[i] = crate::vmx::unpack_pixel_555(b[4 + i]); } + ctx.vr[instr.rd()] = xenia_types::Vec128::from_u32x4_array(r); + ctx.pc += 4; + } + + // ---- D3D pack / unpack (VMX128-only) ---- + // + // First-Pixels M3: fixed immediate extraction + added pack types + // 1-6. The prior `(instr.raw >> 6) & 0x7` was LSB-numbered (wrong + // position) and masked to only 3 bits. Canary extracts from the + // VX128_3/4 `IMM` field at PPC bits 16-22 (MSB) and does + // `type = IMM >> 2` to pick up the 5-bit type selector — the low + // 2 bits (`pack`) select output-slot layout for `vpkd3d128` and + // are ignored by `vupkd3d128`. Extracting the low 2 bits as + // `pack` (unused here — we hand back the codec output in its + // canonical lane position, the subsequent permute instruction + // handles placement) for completeness. + PpcOpcode::vpkd3d128 => { + use crate::vmx::D3dPackType; + let uimm = crate::decoder::extract_vx128_uimm5(instr.raw); + let ty = D3dPackType::from_immediate(uimm >> 2); + let src = ctx.vr[instr.vb128()]; + let out = match ty { + D3dPackType::D3dColor => crate::vmx::pack_d3dcolor(src), + D3dPackType::NormShort2 => crate::vmx::pack_normshort2(src), + D3dPackType::NormPacked32 => crate::vmx::pack_normpacked32(src), + D3dPackType::Float16_2 => crate::vmx::pack_float16_2(src), + D3dPackType::NormShort4 => crate::vmx::pack_normshort4(src), + D3dPackType::Float16_4 => crate::vmx::pack_float16_4(src), + D3dPackType::NormPacked64 => crate::vmx::pack_normpacked64(src), + D3dPackType::Other(t) => { + tracing::warn!( + raw = format_args!("{:#010x}", instr.raw), + uimm, + ty = t, + "vpkd3d128: unhandled pack type at {:#010x}", + ctx.pc, + ); + src + } + }; + ctx.vr[instr.vd128()] = out; + ctx.pc += 4; + } + PpcOpcode::vupkd3d128 => { + use crate::vmx::D3dPackType; + let uimm = crate::decoder::extract_vx128_uimm5(instr.raw); + let ty = D3dPackType::from_immediate(uimm >> 2); + let src = ctx.vr[instr.vb128()]; + let out = match ty { + D3dPackType::D3dColor => crate::vmx::unpack_d3dcolor(src), + D3dPackType::NormShort2 => crate::vmx::unpack_normshort2(src), + D3dPackType::NormPacked32 => crate::vmx::unpack_normpacked32(src), + D3dPackType::Float16_2 => crate::vmx::unpack_float16_2(src), + D3dPackType::NormShort4 => crate::vmx::unpack_normshort4(src), + D3dPackType::Float16_4 => crate::vmx::unpack_float16_4(src), + D3dPackType::NormPacked64 => crate::vmx::unpack_normpacked64(src), + D3dPackType::Other(t) => { + tracing::warn!( + raw = format_args!("{:#010x}", instr.raw), + uimm, + ty = t, + "vupkd3d128: unhandled pack type at {:#010x}", + ctx.pc, + ); + src + } + }; + ctx.vr[instr.vd128()] = out; + ctx.pc += 4; + } + + // ═════════════════════════════════════════════════════════════════ + // §4g — VMX convert (float ↔ fixed-point) + // ═════════════════════════════════════════════════════════════════ + // vctsxs / vctuxs: f32 → i32/u32, scaled by 2^uimm, saturating. + PpcOpcode::vctsxs => { + let uimm = (instr.raw >> 16) & 0x1F; + let b = ctx.vr[instr.rb()].as_f32x4(); + let mut r = [0i32; 4]; let mut sat = false; + for i in 0..4 { + let (v, s) = crate::vmx::cvt_f32_to_i32_sat(b[i], uimm); + r[i] = v; sat |= s; + } + if sat { ctx.set_vscr_sat(true); } + ctx.vr[instr.rd()] = crate::vmx::from_i32x4(r); + ctx.pc += 4; + } + PpcOpcode::vctuxs => { + let uimm = (instr.raw >> 16) & 0x1F; + let b = ctx.vr[instr.rb()].as_f32x4(); + let mut r = [0u32; 4]; let mut sat = false; + for i in 0..4 { + let (v, s) = crate::vmx::cvt_f32_to_u32_sat(b[i], uimm); + r[i] = v; sat |= s; + } + if sat { ctx.set_vscr_sat(true); } + ctx.vr[instr.rd()] = xenia_types::Vec128::from_u32x4_array(r); + ctx.pc += 4; + } + // vcfsx / vcfux: i32/u32 → f32, scaled by 2^-uimm. + PpcOpcode::vcfsx => { + let uimm = (instr.raw >> 16) & 0x1F; + let b = crate::vmx::as_i32x4(ctx.vr[instr.rb()]); + let mut r = [0f32; 4]; + for i in 0..4 { r[i] = crate::vmx::cvt_i32_to_f32(b[i], uimm); } + ctx.vr[instr.rd()] = xenia_types::Vec128::from_f32x4_array(r); + ctx.pc += 4; + } + PpcOpcode::vcfux => { + let uimm = (instr.raw >> 16) & 0x1F; + let b = ctx.vr[instr.rb()].as_u32x4(); + let mut r = [0f32; 4]; + for i in 0..4 { r[i] = crate::vmx::cvt_u32_to_f32(b[i], uimm); } + ctx.vr[instr.rd()] = xenia_types::Vec128::from_f32x4_array(r); + ctx.pc += 4; + } + // VMX128 convert variants. uimm lives in bits 16-20 of the encoded form. + PpcOpcode::vcfpsxws128 => { + let uimm = (instr.raw >> 16) & 0x1F; + let b = ctx.vr[instr.vb128()].as_f32x4(); + let mut r = [0i32; 4]; let mut sat = false; + for i in 0..4 { + let (v, s) = crate::vmx::cvt_f32_to_i32_sat(b[i], uimm); + r[i] = v; sat |= s; + } + if sat { ctx.set_vscr_sat(true); } + ctx.vr[instr.vd128()] = crate::vmx::from_i32x4(r); + ctx.pc += 4; + } + PpcOpcode::vcfpuxws128 => { + let uimm = (instr.raw >> 16) & 0x1F; + let b = ctx.vr[instr.vb128()].as_f32x4(); + let mut r = [0u32; 4]; let mut sat = false; + for i in 0..4 { + let (v, s) = crate::vmx::cvt_f32_to_u32_sat(b[i], uimm); + r[i] = v; sat |= s; + } + if sat { ctx.set_vscr_sat(true); } + ctx.vr[instr.vd128()] = xenia_types::Vec128::from_u32x4_array(r); + ctx.pc += 4; + } + PpcOpcode::vcsxwfp128 => { + let uimm = (instr.raw >> 16) & 0x1F; + let b = crate::vmx::as_i32x4(ctx.vr[instr.vb128()]); + let mut r = [0f32; 4]; + for i in 0..4 { r[i] = crate::vmx::cvt_i32_to_f32(b[i], uimm); } + ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r); + ctx.pc += 4; + } + PpcOpcode::vcuxwfp128 => { + let uimm = (instr.raw >> 16) & 0x1F; + let b = ctx.vr[instr.vb128()].as_u32x4(); + let mut r = [0f32; 4]; + for i in 0..4 { r[i] = crate::vmx::cvt_u32_to_f32(b[i], uimm); } + ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r); + ctx.pc += 4; + } + + // ═════════════════════════════════════════════════════════════════ + // §4h — VMX vector FPU (exp / log) + // ═════════════════════════════════════════════════════════════════ + PpcOpcode::vexptefp | PpcOpcode::vexptefp128 => { + let is_128 = matches!(instr.opcode, PpcOpcode::vexptefp128); + let (rb, rd) = if is_128 { (instr.vb128(), instr.vd128()) } + else { (instr.rb(), instr.rd()) }; + let b = ctx.vr[rb].as_f32x4(); + let mut r = [0f32; 4]; + for i in 0..4 { r[i] = b[i].exp2(); } + ctx.vr[rd] = xenia_types::Vec128::from_f32x4_array(r); + ctx.pc += 4; + } + PpcOpcode::vlogefp | PpcOpcode::vlogefp128 => { + let is_128 = matches!(instr.opcode, PpcOpcode::vlogefp128); + let (rb, rd) = if is_128 { (instr.vb128(), instr.vd128()) } + else { (instr.rb(), instr.rd()) }; + let b = ctx.vr[rb].as_f32x4(); + let mut r = [0f32; 4]; + for i in 0..4 { r[i] = b[i].log2(); } + ctx.vr[rd] = xenia_types::Vec128::from_f32x4_array(r); + ctx.pc += 4; + } + + // ═════════════════════════════════════════════════════════════════ + // §4i — VMX integer max / min + // ═════════════════════════════════════════════════════════════════ + PpcOpcode::vmaxub => { + let a = ctx.vr[instr.ra()].as_bytes(); + let b = ctx.vr[instr.rb()].as_bytes(); + let mut r = [0u8; 16]; + for i in 0..16 { r[i] = a[i].max(b[i]); } + ctx.vr[instr.rd()] = xenia_types::Vec128::from_bytes(r); + ctx.pc += 4; + } + PpcOpcode::vminub => { + let a = ctx.vr[instr.ra()].as_bytes(); + let b = ctx.vr[instr.rb()].as_bytes(); + let mut r = [0u8; 16]; + for i in 0..16 { r[i] = a[i].min(b[i]); } + ctx.vr[instr.rd()] = xenia_types::Vec128::from_bytes(r); + ctx.pc += 4; + } + PpcOpcode::vmaxsb => { + let a = crate::vmx::as_i8x16(ctx.vr[instr.ra()]); + let b = crate::vmx::as_i8x16(ctx.vr[instr.rb()]); + let mut r = [0i8; 16]; + for i in 0..16 { r[i] = a[i].max(b[i]); } + ctx.vr[instr.rd()] = crate::vmx::from_i8x16(r); + ctx.pc += 4; + } + PpcOpcode::vminsb => { + let a = crate::vmx::as_i8x16(ctx.vr[instr.ra()]); + let b = crate::vmx::as_i8x16(ctx.vr[instr.rb()]); + let mut r = [0i8; 16]; + for i in 0..16 { r[i] = a[i].min(b[i]); } + ctx.vr[instr.rd()] = crate::vmx::from_i8x16(r); + ctx.pc += 4; + } + PpcOpcode::vmaxuh => { + let a = ctx.vr[instr.ra()].as_u16x8(); + let b = ctx.vr[instr.rb()].as_u16x8(); + let mut r = [0u16; 8]; + for i in 0..8 { r[i] = a[i].max(b[i]); } + ctx.vr[instr.rd()] = xenia_types::Vec128::from_u16x8_array(r); + ctx.pc += 4; + } + PpcOpcode::vminuh => { + let a = ctx.vr[instr.ra()].as_u16x8(); + let b = ctx.vr[instr.rb()].as_u16x8(); + let mut r = [0u16; 8]; + for i in 0..8 { r[i] = a[i].min(b[i]); } + ctx.vr[instr.rd()] = xenia_types::Vec128::from_u16x8_array(r); + ctx.pc += 4; + } + PpcOpcode::vmaxsh => { + let a = crate::vmx::as_i16x8(ctx.vr[instr.ra()]); + let b = crate::vmx::as_i16x8(ctx.vr[instr.rb()]); + let mut r = [0i16; 8]; + for i in 0..8 { r[i] = a[i].max(b[i]); } + ctx.vr[instr.rd()] = crate::vmx::from_i16x8(r); + ctx.pc += 4; + } + PpcOpcode::vminsh => { + let a = crate::vmx::as_i16x8(ctx.vr[instr.ra()]); + let b = crate::vmx::as_i16x8(ctx.vr[instr.rb()]); + let mut r = [0i16; 8]; + for i in 0..8 { r[i] = a[i].min(b[i]); } + ctx.vr[instr.rd()] = crate::vmx::from_i16x8(r); + ctx.pc += 4; + } + PpcOpcode::vmaxuw => { + let a = ctx.vr[instr.ra()].as_u32x4(); + let b = ctx.vr[instr.rb()].as_u32x4(); + let mut r = [0u32; 4]; + for i in 0..4 { r[i] = a[i].max(b[i]); } + ctx.vr[instr.rd()] = xenia_types::Vec128::from_u32x4_array(r); + ctx.pc += 4; + } + PpcOpcode::vminuw => { + let a = ctx.vr[instr.ra()].as_u32x4(); + let b = ctx.vr[instr.rb()].as_u32x4(); + let mut r = [0u32; 4]; + for i in 0..4 { r[i] = a[i].min(b[i]); } + ctx.vr[instr.rd()] = xenia_types::Vec128::from_u32x4_array(r); + ctx.pc += 4; + } + PpcOpcode::vmaxsw => { + let a = crate::vmx::as_i32x4(ctx.vr[instr.ra()]); + let b = crate::vmx::as_i32x4(ctx.vr[instr.rb()]); + let mut r = [0i32; 4]; + for i in 0..4 { r[i] = a[i].max(b[i]); } + ctx.vr[instr.rd()] = crate::vmx::from_i32x4(r); + ctx.pc += 4; + } + PpcOpcode::vminsw => { + let a = crate::vmx::as_i32x4(ctx.vr[instr.ra()]); + let b = crate::vmx::as_i32x4(ctx.vr[instr.rb()]); + let mut r = [0i32; 4]; + for i in 0..4 { r[i] = a[i].min(b[i]); } + ctx.vr[instr.rd()] = crate::vmx::from_i32x4(r); + ctx.pc += 4; + } + + // ═════════════════════════════════════════════════════════════════ + // §4j — VMX128 FMA / permute + // ═════════════════════════════════════════════════════════════════ + // vmaddcfp128: vD = vD * vB + vA (using vD's current value as accumulator) + PpcOpcode::vmaddcfp128 => { + // Xbox-360-specific: vD = (vD * vB) + vA. Note the VD-reuse: VD is both + // a source operand (as multiplicand) and the destination. Canary & + // POWER8 hardware confirm denormal inputs are flushed regardless of NJ. + let a = ctx.vr[instr.va128()].as_f32x4(); + let b = ctx.vr[instr.vb128()].as_f32x4(); + let d = ctx.vr[instr.vd128()].as_f32x4(); + let mut r = [0f32; 4]; + for i in 0..4 { + let ai = vmx::flush_denorm(a[i]); + let bi = vmx::flush_denorm(b[i]); + let di = vmx::flush_denorm(d[i]); + r[i] = di.mul_add(bi, ai); + } + ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4_array(r); + ctx.pc += 4; + } + // vmsum3fp128: horizontal sum of (vA * vB) over lanes 0..3, broadcast to all 4 output lanes. + // Canary `InstrEmit_vmsum3fp128` flushes the *output* denormal + // unconditionally (not the inputs) — see ppc_emit_altivec.cc:1067-1075. + PpcOpcode::vmsum3fp128 => { + let a = ctx.vr[instr.va128()].as_f32x4(); + let b = ctx.vr[instr.vb128()].as_f32x4(); + let s = vmx::flush_denorm(a[0] * b[0] + a[1] * b[1] + a[2] * b[2]); + ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4(s, s, s, s); + ctx.pc += 4; + } + PpcOpcode::vmsum4fp128 => { + let a = ctx.vr[instr.va128()].as_f32x4(); + let b = ctx.vr[instr.vb128()].as_f32x4(); + let s = vmx::flush_denorm(a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3]); + ctx.vr[instr.vd128()] = xenia_types::Vec128::from_f32x4(s, s, s, s); + ctx.pc += 4; + } + // vpermwi128: permute words of vB using an 8-bit immediate (2 bits per output lane). + PpcOpcode::vpermwi128 => { + let imm = (instr.raw >> 16) & 0xFF; + let b = ctx.vr[instr.vb128()].as_u32x4(); + let mut r = [0u32; 4]; + // Output lane i ← b[(imm >> (2 * (3-i))) & 3] + for i in 0..4 { + let sel = ((imm >> (2 * (3 - i))) & 3) as usize; + r[i] = b[sel]; + } + ctx.vr[instr.vd128()] = xenia_types::Vec128::from_u32x4_array(r); + ctx.pc += 4; + } + + // ═════════════════════════════════════════════════════════════════ + // §4k — Scalar reservation / byte-reverse (doubleword) + // ═════════════════════════════════════════════════════════════════ + // M3.7 — same table-vs-legacy split as lwarx/stwcx. + PpcOpcode::ldarx => { + let ea = ea_indexed(ctx, instr); + let val = mem.read_u64(ea); + ctx.gpr[instr.rd()] = val; + ctx.reserved_line = ea & !RESERVATION_MASK; + ctx.reserved_val = val; + ctx.has_reservation = true; + if let Some(t) = &ctx.reservation_table { + if t.is_enabled() { + ctx.reserved_generation = t.reserve(ea, ctx.hw_id); + } + } + ctx.pc += 4; + } + PpcOpcode::stdcx => { + let ea = ea_indexed(ctx, instr); + let line = ea & !RESERVATION_MASK; + let table_route = ctx + .reservation_table + .as_ref() + .filter(|t| t.is_enabled()) + .cloned(); + let success = if let Some(t) = &table_route { + ctx.has_reservation + && ctx.reserved_line == line + && t.try_commit(ea, ctx.reserved_generation, ctx.hw_id) + } else { + ctx.has_reservation && ctx.reserved_line == line + }; + if success { + mem.write_u64(ea, ctx.gpr[instr.rs()]); + ctx.cr[0] = crate::context::CrField { + lt: false, + gt: false, + eq: true, + so: ctx.xer_so != 0, + }; + } else { + ctx.cr[0] = crate::context::CrField { + lt: false, + gt: false, + eq: false, + so: ctx.xer_so != 0, + }; + if let Some(t) = &table_route { + t.release(ea, ctx.reserved_generation, ctx.hw_id); + } + } + ctx.has_reservation = false; + ctx.pc += 4; + } + PpcOpcode::ldbrx => { + let ea = ea_indexed(ctx, instr); + ctx.gpr[instr.rd()] = mem.read_u64(ea).swap_bytes(); + ctx.pc += 4; + } + PpcOpcode::stdbrx => { + let ea = ea_indexed(ctx, instr); + mem.write_u64(ea, ctx.gpr[instr.rs()].swap_bytes()); + ctx.pc += 4; + } + + // ═════════════════════════════════════════════════════════════════ + // §4l — Scalar string load / store (register-length) + // ═════════════════════════════════════════════════════════════════ + PpcOpcode::lswx => { + let mut ea = ea_indexed(ctx, instr); + let nb = ctx.xer() & 0x7F; // XER[25..31] + let mut rd = instr.rd(); + let mut bytes_left = nb; + while bytes_left > 0 { + let mut val = 0u32; + for byte_idx in 0..4 { + if bytes_left == 0 { break; } + let b = mem.read_u8(ea) as u32; + val |= b << (24 - byte_idx * 8); + ea = ea.wrapping_add(1); + bytes_left -= 1; + } + ctx.gpr[rd] = val as u64; + rd = (rd + 1) % 32; + } + ctx.pc += 4; + } + PpcOpcode::stswx => { + let mut ea = ea_indexed(ctx, instr); + let nb = ctx.xer() & 0x7F; + let mut rs = instr.rs(); + let mut bytes_left = nb; + while bytes_left > 0 { + let val = ctx.gpr[rs] as u32; + for byte_idx in 0..4 { + if bytes_left == 0 { break; } + mem.write_u8(ea, (val >> (24 - byte_idx * 8)) as u8); + ea = ea.wrapping_add(1); + bytes_left -= 1; + } + rs = (rs + 1) % 32; + } + ctx.pc += 4; + } + + // ═════════════════════════════════════════════════════════════════ + // §4m — mcrxr: move XER condition bits to CR field, clear XER[SO/OV/CA] + // ═════════════════════════════════════════════════════════════════ + PpcOpcode::mcrxr => { + let crfd = instr.crfd(); + ctx.cr[crfd] = crate::context::CrField { + lt: ctx.xer_so != 0, + gt: ctx.xer_ov != 0, + eq: ctx.xer_ca != 0, + so: false, + }; + ctx.xer_so = 0; + ctx.xer_ov = 0; + ctx.xer_ca = 0; + ctx.pc += 4; + } + + // ═════════════════════════════════════════════════════════════════ + // mcrfs — move FPSCR field to CR field and clear corresponding + // FPSCR exception bits. CR field crfD ← FPSCR[(crfS*4)..(crfS*4+3)] + // and then FPSCR bits in that nibble that are exception bits are + // cleared (FX, OX, UX, ZX, XX, VXSNAN, VXISI, VXIDI, VXZDZ, VXIMZ, + // VXVC, VXSOFT, VXSQRT, VXCVI are cleared; FEX/VX are read-only + // summaries and are recomputed later). + // ═════════════════════════════════════════════════════════════════ + PpcOpcode::mcrfs => { + let crfd = instr.crfd(); + let crfs = instr.crfs(); + let shift = 28 - (crfs as u32 * 4); + let nibble = ((ctx.fpscr >> shift) & 0xF) as u8; + ctx.cr[crfd] = crate::context::CrField::from_u8(nibble); + // Clearable exception bits: 0 (FX), 3 (OX), 4 (UX), 5 (ZX), + // 6 (XX), 7 (VXSNAN), 8 (VXISI), 9 (VXIDI), 10 (VXZDZ), + // 11 (VXIMZ), 12 (VXVC), 21 (VXSOFT), 22 (VXSQRT), 23 (VXCVI). + // (Bit positions are PowerISA MSB-0; here 'FPSCR bit n' means + // the bit at (31-n) in our little-endian u32.) + const CLEARABLE_MASK: u32 = + (1 << 31) | (1 << (31 - 3)) | (1 << (31 - 4)) | + (1 << (31 - 5)) | (1 << (31 - 6)) | (1 << (31 - 7)) | + (1 << (31 - 8)) | (1 << (31 - 9)) | (1 << (31 - 10)) | + (1 << (31 - 11)) | (1 << (31 - 12)) | + (1 << (31 - 21)) | (1 << (31 - 22)) | (1 << (31 - 23)); + let nibble_mask = 0xFu32 << shift; + ctx.fpscr &= !(nibble_mask & CLEARABLE_MASK); + ctx.pc += 4; + } + // Anything not yet implemented _ => { tracing::warn!("Unimplemented opcode at {:#010x}: {:?} [{:08X}]", ctx.pc, instr.opcode, instr.raw); @@ -2212,6 +4261,13 @@ fn execute(ctx: &mut PpcContext, mem: &mut dyn MemoryAccess, instr: &DecodedInst StepResult::Continue } +/// Compute an X-form indexed effective address: EA = (rA==0 ? 0 : GPR[rA]) + GPR[rB]. +#[inline] +fn ea_indexed(ctx: &PpcContext, instr: &DecodedInstr) -> u32 { + let a = if instr.ra() == 0 { 0u64 } else { ctx.gpr[instr.ra()] }; + a.wrapping_add(ctx.gpr[instr.rb()]) as u32 +} + /// Helper for CR logical operations. fn cr_logical(ctx: &mut PpcContext, instr: &DecodedInstr, op: fn(bool, bool) -> bool) { let a = ctx.get_cr_bit(instr.crba()); @@ -2274,70 +4330,84 @@ fn update_cr6_from_vmask(r: &[u32; 4], ctx: &mut PpcContext) { /// Round a double to single precision and back (matches xenia's ToSingle). #[inline] -fn to_single(val: f64) -> f64 { - val as f32 as f64 +/// Round an f64 to single precision, honouring FPSCR[RN]. +fn to_single(ctx: &PpcContext, val: f64) -> f64 { + fpscr::round_to_single(ctx, val) } /// Update CR1 from FPSCR (used when Rc=1 on FPU instructions). /// CR1 = FPSCR[FX, FEX, VX, OX] (bits 0-3). #[inline] fn update_cr1_from_fpscr(ctx: &mut PpcContext) { - ctx.cr[1].lt = (ctx.fpscr >> 31) & 1 != 0; // FX - ctx.cr[1].gt = (ctx.fpscr >> 30) & 1 != 0; // FEX - ctx.cr[1].eq = (ctx.fpscr >> 29) & 1 != 0; // VX - ctx.cr[1].so = (ctx.fpscr >> 28) & 1 != 0; // OX + fpscr::update_cr1(ctx); } #[cfg(test)] mod tests { use super::*; - /// Simple test memory (64KB) + /// Simple test memory (64KB). Backed by `Box<[Cell]>` so the + /// MemoryAccess writes can take `&self`. struct TestMem { - data: Vec, + data: Box<[std::cell::Cell]>, } impl TestMem { fn new() -> Self { - Self { data: vec![0; 65536] } + Self { + data: (0..65536u32).map(|_| std::cell::Cell::new(0)).collect(), + } } } impl MemoryAccess for TestMem { - fn read_u8(&self, addr: u32) -> u8 { self.data[addr as usize] } + fn read_u8(&self, addr: u32) -> u8 { self.data[addr as usize].get() } fn read_u16(&self, addr: u32) -> u16 { let a = addr as usize; - u16::from_be_bytes([self.data[a], self.data[a+1]]) + u16::from_be_bytes([self.data[a].get(), self.data[a+1].get()]) } fn read_u32(&self, addr: u32) -> u32 { let a = addr as usize; - u32::from_be_bytes([self.data[a], self.data[a+1], self.data[a+2], self.data[a+3]]) + u32::from_be_bytes([ + self.data[a].get(), self.data[a+1].get(), + self.data[a+2].get(), self.data[a+3].get(), + ]) } fn read_u64(&self, addr: u32) -> u64 { let a = addr as usize; u64::from_be_bytes([ - self.data[a], self.data[a+1], self.data[a+2], self.data[a+3], - self.data[a+4], self.data[a+5], self.data[a+6], self.data[a+7], + self.data[a].get(), self.data[a+1].get(), + self.data[a+2].get(), self.data[a+3].get(), + self.data[a+4].get(), self.data[a+5].get(), + self.data[a+6].get(), self.data[a+7].get(), ]) } - fn write_u8(&mut self, addr: u32, val: u8) { self.data[addr as usize] = val; } - fn write_u16(&mut self, addr: u32, val: u16) { + fn write_u8(&self, addr: u32, val: u8) { self.data[addr as usize].set(val); } + fn write_u16(&self, addr: u32, val: u16) { let a = addr as usize; - self.data[a..a+2].copy_from_slice(&val.to_be_bytes()); + let bytes = val.to_be_bytes(); + self.data[a].set(bytes[0]); + self.data[a+1].set(bytes[1]); } - fn write_u32(&mut self, addr: u32, val: u32) { + fn write_u32(&self, addr: u32, val: u32) { let a = addr as usize; - self.data[a..a+4].copy_from_slice(&val.to_be_bytes()); + let bytes = val.to_be_bytes(); + for (i, b) in bytes.iter().enumerate() { + self.data[a+i].set(*b); + } } - fn write_u64(&mut self, addr: u32, val: u64) { + fn write_u64(&self, addr: u32, val: u64) { let a = addr as usize; - self.data[a..a+8].copy_from_slice(&val.to_be_bytes()); + let bytes = val.to_be_bytes(); + for (i, b) in bytes.iter().enumerate() { + self.data[a+i].set(*b); + } } fn translate(&self, _addr: u32) -> Option<*const u8> { None } - fn translate_mut(&mut self, _addr: u32) -> Option<*mut u8> { None } + fn translate_mut(&self, _addr: u32) -> Option<*mut u8> { None } } - fn write_instr(mem: &mut TestMem, addr: u32, raw: u32) { + fn write_instr(mem: &TestMem, addr: u32, raw: u32) { mem.write_u32(addr, raw); } @@ -2526,4 +4596,957 @@ mod tests { // (2.0 * 5.0) + 3.0 = 13.0 assert!((ctx.fpr[4] - 13.0).abs() < 1e-10); } + + #[test] + fn test_ctx_default_state_matches_canary() { + let ctx = PpcContext::new(); + // LR initialized to halt sentinel so a top-level blr drops out cleanly. + assert_eq!(ctx.lr, crate::context::LR_HALT_SENTINEL); + // VSCR starts with NJ bit set (denormals flush to zero). + assert!(ctx.vscr_nj()); + assert!(!ctx.vscr_sat()); + // VRSAVE defaults to "save all" per canary. + assert_eq!(ctx.vrsave, 0xFFFF_FFFF); + } + + #[test] + fn test_vaddubs_saturates_and_sets_vscr_sat() { + let mut ctx = PpcContext::new(); + let mut mem = TestMem::new(); + // Fill vA with 0xF0, vB with 0x20 → 0x110, saturates to 0xFF per lane. + ctx.vr[2] = xenia_types::Vec128::from_bytes([0xF0; 16]); + ctx.vr[3] = xenia_types::Vec128::from_bytes([0x20; 16]); + // vaddubs vD=4, vA=2, vB=3. XO=512 (PPC: opcode 4, VA-form). + let raw: u32 = (4u32 << 26) | (4u32 << 21) | (2u32 << 16) | (3u32 << 11) | 512u32; + write_instr(&mut mem, 0, raw); + ctx.pc = 0; + let r = step(&mut ctx, &mut mem); + assert_eq!(r, StepResult::Continue); + assert_eq!(ctx.vr[4].as_bytes(), [0xFFu8; 16]); + assert!(ctx.vscr_sat(), "SAT should be set after saturation"); + } + + #[test] + fn test_ldarx_stdcx_pair() { + let mut ctx = PpcContext::new(); + let mut mem = TestMem::new(); + mem.write_u64(0x1000, 0xDEADBEEF_CAFEBABE); + ctx.gpr[4] = 0x1000; + ctx.gpr[5] = 0; + // ldarx r3, r4, r5: (31 << 26) | (3<<21) | (4<<16) | (5<<11) | (84<<1) + let raw_ld: u32 = (31u32 << 26) | (3u32 << 21) | (4u32 << 16) | (5u32 << 11) | (84u32 << 1); + write_instr(&mut mem, 0, raw_ld); + ctx.pc = 0; + step(&mut ctx, &mut mem); + assert_eq!(ctx.gpr[3], 0xDEADBEEF_CAFEBABE); + assert!(ctx.has_reservation); + // stdcx. r6, r4, r5: (31 << 26) | (6<<21) | (4<<16) | (5<<11) | (214<<1) | 1 + ctx.gpr[6] = 0x1111_1111_2222_2222; + let raw_st: u32 = (31u32 << 26) | (6u32 << 21) | (4u32 << 16) | (5u32 << 11) | (214u32 << 1) | 1; + write_instr(&mut mem, 4, raw_st); + step(&mut ctx, &mut mem); + assert!(ctx.cr[0].eq, "stdcx. should succeed and set CR0.EQ"); + assert_eq!(mem.read_u64(0x1000), 0x1111_1111_2222_2222); + assert!(!ctx.has_reservation); + } + + #[test] + fn test_mcrxr_moves_xer_condition_bits_and_clears_them() { + let mut ctx = PpcContext::new(); + let mut mem = TestMem::new(); + ctx.xer_so = 1; + ctx.xer_ov = 0; + ctx.xer_ca = 1; + // mcrxr crfD=3: (31 << 26) | (3<<23) | (512<<1) + let raw: u32 = (31u32 << 26) | (3u32 << 23) | (512u32 << 1); + write_instr(&mut mem, 0, raw); + ctx.pc = 0; + step(&mut ctx, &mut mem); + assert!(ctx.cr[3].lt, "LT should mirror old XER[SO]"); + assert!(!ctx.cr[3].gt, "GT should mirror old XER[OV]"); + assert!(ctx.cr[3].eq, "EQ should mirror old XER[CA]"); + assert_eq!(ctx.xer_so, 0); + assert_eq!(ctx.xer_ov, 0); + assert_eq!(ctx.xer_ca, 0); + } + + // ---------- Phase 2 fixes: OE / overflow ---------- + + fn addx_raw(rd: u32, ra: u32, rb: u32, oe: bool, rc: bool) -> u32 { + (31 << 26) | (rd << 21) | (ra << 16) | (rb << 11) + | ((oe as u32) << 10) | (266 << 1) | (rc as u32) + } + + #[test] + fn addo_sets_xer_ov_on_signed_overflow_and_stickies_so() { + let mut ctx = PpcContext::new(); + let mut mem = TestMem::new(); + ctx.gpr[3] = i64::MAX as u64; + ctx.gpr[4] = 1; + write_instr(&mut mem, 0, addx_raw(5, 3, 4, true, false)); + ctx.pc = 0; + step(&mut ctx, &mut mem); + assert_eq!(ctx.gpr[5], i64::MIN as u64); + assert_eq!(ctx.xer_ov, 1, "OV must be set on signed overflow"); + assert_eq!(ctx.xer_so, 1, "SO must be stickied from OV"); + } + + #[test] + fn addo_clears_xer_ov_when_no_overflow_but_keeps_sticky_so() { + let mut ctx = PpcContext::new(); + let mut mem = TestMem::new(); + ctx.xer_ov = 1; // stale from a previous overflow + ctx.xer_so = 1; + ctx.gpr[3] = 1; + ctx.gpr[4] = 2; + write_instr(&mut mem, 0, addx_raw(5, 3, 4, true, false)); + ctx.pc = 0; + step(&mut ctx, &mut mem); + assert_eq!(ctx.gpr[5], 3); + assert_eq!(ctx.xer_ov, 0, "OV must clear when no overflow"); + assert_eq!(ctx.xer_so, 1, "SO is sticky; stays set"); + } + + #[test] + fn add_without_oe_does_not_touch_xer() { + let mut ctx = PpcContext::new(); + let mut mem = TestMem::new(); + ctx.gpr[3] = i64::MAX as u64; + ctx.gpr[4] = 1; + write_instr(&mut mem, 0, addx_raw(5, 3, 4, false, false)); + ctx.pc = 0; + step(&mut ctx, &mut mem); + assert_eq!(ctx.xer_ov, 0); + assert_eq!(ctx.xer_so, 0); + } + + #[test] + fn addx_rc_uses_64bit_compare_not_32bit() { + // r3 = 0x0000_0000_FFFF_FFFF, r4 = 0 → result = 0x0000_0000_FFFF_FFFF. + // As i32 this is -1 (lt). As i64 this is positive (gt). Spec says 64-bit. + let mut ctx = PpcContext::new(); + let mut mem = TestMem::new(); + ctx.gpr[3] = 0x0000_0000_FFFF_FFFF; + ctx.gpr[4] = 0; + write_instr(&mut mem, 0, addx_raw(5, 3, 4, false, true)); + ctx.pc = 0; + step(&mut ctx, &mut mem); + assert_eq!(ctx.gpr[5], 0x0000_0000_FFFF_FFFF); + assert!(!ctx.cr[0].lt, "64-bit compare: value is positive, not negative"); + assert!(ctx.cr[0].gt); + assert!(!ctx.cr[0].eq); + } + + #[test] + fn subfo_sets_xer_ov_on_min_minus_one() { + let mut ctx = PpcContext::new(); + let mut mem = TestMem::new(); + // subfo r5, r3, r4 -> r5 = r4 - r3 + // r4 = INT64_MIN, r3 = 1 -> result overflows + ctx.gpr[3] = 1; + ctx.gpr[4] = i64::MIN as u64; + let raw = (31 << 26) | (5 << 21) | (3 << 16) | (4 << 11) | (1 << 10) | (40 << 1); + write_instr(&mut mem, 0, raw); + ctx.pc = 0; + step(&mut ctx, &mut mem); + assert_eq!(ctx.xer_ov, 1); + assert_eq!(ctx.xer_so, 1); + } + + #[test] + fn mullwo_sets_xer_ov_when_product_overflows_32_bits() { + let mut ctx = PpcContext::new(); + let mut mem = TestMem::new(); + // mullwo r5, r3, r4 (XO=235, OE=1) + ctx.gpr[3] = i32::MAX as u64; + ctx.gpr[4] = 2u64; + let raw = (31 << 26) | (5 << 21) | (3 << 16) | (4 << 11) | (1 << 10) | (235 << 1); + write_instr(&mut mem, 0, raw); + ctx.pc = 0; + step(&mut ctx, &mut mem); + assert_eq!(ctx.xer_ov, 1); + assert_eq!(ctx.xer_so, 1); + } + + #[test] + fn divwo_sets_xer_ov_on_divide_by_zero() { + let mut ctx = PpcContext::new(); + let mut mem = TestMem::new(); + // divwo r5, r3, r4 (XO=491, OE=1) + ctx.gpr[3] = 10; + ctx.gpr[4] = 0; + let raw = (31 << 26) | (5 << 21) | (3 << 16) | (4 << 11) | (1 << 10) | (491 << 1); + write_instr(&mut mem, 0, raw); + ctx.pc = 0; + step(&mut ctx, &mut mem); + assert_eq!(ctx.xer_ov, 1); + assert_eq!(ctx.gpr[5], 0); // undefined in spec; canary uses 0 + } + + #[test] + fn nego_sets_ov_only_on_int_min() { + let mut ctx = PpcContext::new(); + let mut mem = TestMem::new(); + // nego r5, r3 (XO=104, OE=1) + ctx.gpr[3] = i64::MIN as u64; + let raw = (31 << 26) | (5 << 21) | (3 << 16) | (1 << 10) | (104 << 1); + write_instr(&mut mem, 0, raw); + ctx.pc = 0; + step(&mut ctx, &mut mem); + assert_eq!(ctx.xer_ov, 1); + // -INT_MIN wraps to INT_MIN + assert_eq!(ctx.gpr[5], i64::MIN as u64); + } + + // ---------- Phase 2 fixes: trap TO-field ---------- + + #[test] + fn tw_with_to_zero_never_fires() { + let mut ctx = PpcContext::new(); + let mut mem = TestMem::new(); + ctx.gpr[3] = 5; + ctx.gpr[4] = 5; + // tw 0, r3, r4 (XO=4). TO in bits 6-10. + let raw = (31 << 26) | (0 << 21) | (3 << 16) | (4 << 11) | (4 << 1); + write_instr(&mut mem, 0, raw); + ctx.pc = 0; + let r = step(&mut ctx, &mut mem); + assert_eq!(r, StepResult::Continue, "TO=0 must never trap"); + assert_eq!(ctx.pc, 4); + } + + #[test] + fn tw_eq_fires_on_equal() { + let mut ctx = PpcContext::new(); + let mut mem = TestMem::new(); + ctx.gpr[3] = 5; + ctx.gpr[4] = 5; + // TO=4 (EQ only) + let raw = (31 << 26) | (4 << 21) | (3 << 16) | (4 << 11) | (4 << 1); + write_instr(&mut mem, 0, raw); + ctx.pc = 0; + let r = step(&mut ctx, &mut mem); + assert_eq!(r, StepResult::Trap); + } + + #[test] + fn tw_eq_does_not_fire_on_unequal() { + let mut ctx = PpcContext::new(); + let mut mem = TestMem::new(); + ctx.gpr[3] = 5; + ctx.gpr[4] = 7; + // TO=4 (EQ only) + let raw = (31 << 26) | (4 << 21) | (3 << 16) | (4 << 11) | (4 << 1); + write_instr(&mut mem, 0, raw); + ctx.pc = 0; + let r = step(&mut ctx, &mut mem); + assert_eq!(r, StepResult::Continue); + } + + #[test] + fn twi_compares_low_32_bits_only() { + let mut ctx = PpcContext::new(); + let mut mem = TestMem::new(); + ctx.gpr[3] = 0xDEAD_BEEF_0000_0005; // low 32 = 5 + // twi 4, r3, 5: primary=3, TO=4, RA=3, SI=5 + let raw = (3 << 26) | (4 << 21) | (3 << 16) | (5u32 & 0xFFFF); + write_instr(&mut mem, 0, raw); + ctx.pc = 0; + let r = step(&mut ctx, &mut mem); + assert_eq!(r, StepResult::Trap, "word-width compare matches low 32"); + } + + // ---------- Phase 2 fixes: mcrfs ---------- + + // ---------- Phase 2h: FPU / FPSCR ---------- + + #[test] + fn fadd_inf_minus_inf_sets_vxisi() { + let mut ctx = PpcContext::new(); + let mut mem = TestMem::new(); + ctx.fpr[1] = f64::INFINITY; + ctx.fpr[2] = f64::NEG_INFINITY; + // fadd f3, f1, f2 → inf + (-inf) = VXISI + let raw = (63u32 << 26) | (3 << 21) | (1 << 16) | (2 << 11) | (21 << 1); + write_instr(&mut mem, 0, raw); + ctx.pc = 0; + step(&mut ctx, &mut mem); + // VXISI bit is PPC bit 8 → mask 1<<23 + assert_ne!(ctx.fpscr & fpscr::VXISI, 0); + // FX sticky is set on any new exception → mask 1<<31 + assert_ne!(ctx.fpscr & fpscr::FX, 0); + // VX summary set → 1<<29 + assert_ne!(ctx.fpscr & fpscr::VX, 0); + } + + #[test] + fn fdiv_zero_over_zero_sets_vxzdz() { + let mut ctx = PpcContext::new(); + let mut mem = TestMem::new(); + ctx.fpr[1] = 0.0; + ctx.fpr[2] = 0.0; + // fdiv f3, f1, f2 (opcode 63, subop 18) + let raw = (63u32 << 26) | (3 << 21) | (1 << 16) | (2 << 11) | (18 << 1); + write_instr(&mut mem, 0, raw); + ctx.pc = 0; + step(&mut ctx, &mut mem); + assert_ne!(ctx.fpscr & fpscr::VXZDZ, 0); + } + + #[test] + fn fdiv_finite_over_zero_sets_zx() { + let mut ctx = PpcContext::new(); + let mut mem = TestMem::new(); + ctx.fpr[1] = 1.0; + ctx.fpr[2] = 0.0; + // fdiv f3, f1, f2 + let raw = (63u32 << 26) | (3 << 21) | (1 << 16) | (2 << 11) | (18 << 1); + write_instr(&mut mem, 0, raw); + ctx.pc = 0; + step(&mut ctx, &mut mem); + assert_ne!(ctx.fpscr & fpscr::ZX, 0); + } + + #[test] + fn fadd_sets_fprf_from_result() { + let mut ctx = PpcContext::new(); + let mut mem = TestMem::new(); + ctx.fpr[1] = 2.5; + ctx.fpr[2] = 3.5; + let raw = (63u32 << 26) | (3 << 21) | (1 << 16) | (2 << 11) | (21 << 1); + write_instr(&mut mem, 0, raw); + ctx.pc = 0; + step(&mut ctx, &mut mem); + // Result = +6.0 → FPRF = POS_NORMAL = 0b0_0100 + let fprf = ((ctx.fpscr & fpscr::FPRF_MASK) >> 12) as u8; + assert_eq!(fprf, fpscr::fprf::POS_NORMAL); + } + + #[test] + fn frsp_honours_fpscr_rn_toward_zero() { + let mut ctx = PpcContext::new(); + let mut mem = TestMem::new(); + // A value whose nearest-even rounding would go up but trunc goes down. + // Choose v = 1 + 0x1.00_0001_ * 2^-24-ish so low bit is 1 in the f32 mantissa. + let v = f64::from_bits(0x3FF0_0000_0000_0001); // 1.0 + ULP at double + ctx.fpr[1] = v; + ctx.fpscr = 0x1; // RN = 01 → toward zero + // frsp f3, f1 (opcode 63, subop 12) + let raw = (63u32 << 26) | (3 << 21) | (1 << 11) | (12 << 1); + write_instr(&mut mem, 0, raw); + ctx.pc = 0; + step(&mut ctx, &mut mem); + // Result rounded toward zero should be exactly 1.0_f64 + assert_eq!(ctx.fpr[3], 1.0_f64); + } + + #[test] + fn fcmpu_sets_so_on_nan_and_fprf_unordered() { + let mut ctx = PpcContext::new(); + let mut mem = TestMem::new(); + ctx.fpr[1] = f64::NAN; + ctx.fpr[2] = 1.0; + // fcmpu crfD=4, f1, f2 : (63<<26) | (crfd<<23) | (ra<<16) | (rb<<11) | (0<<1) + let raw = (63u32 << 26) | (4 << 23) | (1 << 16) | (2 << 11); + write_instr(&mut mem, 0, raw); + ctx.pc = 0; + step(&mut ctx, &mut mem); + assert!(ctx.cr[4].so, "unordered → SO set"); + assert!(!ctx.cr[4].lt && !ctx.cr[4].gt && !ctx.cr[4].eq); + // FPRF unordered = 0b0_0001 + let fprf = ((ctx.fpscr & fpscr::FPRF_MASK) >> 12) as u8; + assert_eq!(fprf, 0b0_0001); + } + + #[test] + fn fcmpo_on_qnan_sets_vxvc() { + let mut ctx = PpcContext::new(); + let mut mem = TestMem::new(); + ctx.fpr[1] = f64::NAN; // QNaN (Rust's NAN) + ctx.fpr[2] = 1.0; + // fcmpo (opcode 63, subop 32) + let raw = (63u32 << 26) | (4 << 23) | (1 << 16) | (2 << 11) | (32 << 1); + write_instr(&mut mem, 0, raw); + ctx.pc = 0; + step(&mut ctx, &mut mem); + assert_ne!(ctx.fpscr & fpscr::VXVC, 0); + } + + // ---------- Phase 2i: VMX NaN propagation ---------- + + #[test] + fn vmaxfp_propagates_nan() { + let mut ctx = PpcContext::new(); + let mut mem = TestMem::new(); + let mut a = [1.0f32, 2.0, 3.0, 4.0]; + let b = [5.0f32, 6.0, 7.0, 8.0]; + a[1] = f32::NAN; + ctx.vr[2] = xenia_types::Vec128::from_f32x4_array(a); + ctx.vr[3] = xenia_types::Vec128::from_f32x4_array(b); + // vmaxfp vD=4, vA=2, vB=3 (opcode 4, XO=1034) + let raw = (4u32 << 26) | (4 << 21) | (2 << 16) | (3 << 11) | 1034; + write_instr(&mut mem, 0, raw); + ctx.pc = 0; + step(&mut ctx, &mut mem); + let r = ctx.vr[4].as_f32x4(); + // lane 1 must be NaN, not 6.0 (the b side) + assert!(r[1].is_nan()); + // Other lanes should pick the max correctly + assert_eq!(r[0], 5.0); + assert_eq!(r[2], 7.0); + assert_eq!(r[3], 8.0); + } + + #[test] + fn vminfp_propagates_nan() { + let mut ctx = PpcContext::new(); + let mut mem = TestMem::new(); + let a = [1.0f32, 2.0, 3.0, 4.0]; + let mut b = [5.0f32, 6.0, 7.0, 8.0]; + b[2] = f32::NAN; + ctx.vr[2] = xenia_types::Vec128::from_f32x4_array(a); + ctx.vr[3] = xenia_types::Vec128::from_f32x4_array(b); + // vminfp XO=1098 + let raw = (4u32 << 26) | (4 << 21) | (2 << 16) | (3 << 11) | 1098; + write_instr(&mut mem, 0, raw); + ctx.pc = 0; + step(&mut ctx, &mut mem); + let r = ctx.vr[4].as_f32x4(); + assert!(r[2].is_nan()); + } + + // ---------- Phase 2j: VMX denorm flush ---------- + + #[test] + fn vmaddfp_flushes_denormal_inputs() { + let mut ctx = PpcContext::new(); + let mut mem = TestMem::new(); + // Smallest positive denormal f32 = f32::from_bits(1) + let denorm = f32::from_bits(1); + let a = [denorm; 4]; + let b = [0.0f32; 4]; + let c = [1.0f32; 4]; + ctx.vr[2] = xenia_types::Vec128::from_f32x4_array(a); + ctx.vr[3] = xenia_types::Vec128::from_f32x4_array(b); + ctx.vr[4] = xenia_types::Vec128::from_f32x4_array(c); + // vmaddfp vD=5, vA=2, vB=3, vC=4 (A-form: opcode 4, XO=46, vC at rc field) + // layout: (4<<26) | (5<<21) | (2<<16) | (3<<11) | (4<<6) | 46 + let raw = (4u32 << 26) | (5 << 21) | (2 << 16) | (3 << 11) | (4 << 6) | 46; + write_instr(&mut mem, 0, raw); + ctx.pc = 0; + step(&mut ctx, &mut mem); + let r = ctx.vr[5].as_f32x4(); + // denorm*1 + 0 should be flushed: denorm→0, so result is 0. + assert_eq!(r, [0.0f32; 4]); + } + + /// VMX128 variant `vmaddfp128 vD, vA, vB` (primary op 5, key2 = 0b001101) + /// reuses vD as the accumulator: `vD <- (vA * vB) + vD`. Canary + /// `ppc_emit_altivec.cc:786-810` flushes *all three* inputs + /// unconditionally before the fused multiply-add — the 128-bit form + /// must match the scalar `vmaddfp` behaviour. Prior to this fix the + /// interpreter skipped the flush, leaving subnormal noise in math- + /// heavy game code. + #[test] + fn vmaddfp128_flushes_denormal_inputs() { + let mut ctx = PpcContext::new(); + let mut mem = TestMem::new(); + let denorm = f32::from_bits(1); + // vA=v2 carries denorms, which is also vD's accumulator input. + ctx.vr[2] = xenia_types::Vec128::from_f32x4_array([denorm; 4]); + // vB=v3 = 1.0 — denormal input survives only if not flushed. + ctx.vr[3] = xenia_types::Vec128::from_f32x4_array([1.0f32; 4]); + // vmaddfp128 vD=v2, vA=v2, vB=v3: low 5 bits 00010 shared + // between vA and vD, vB=3 at PPC bits 16-20, key2=0b001101. + let raw: u32 = 0x1440_18D0; + write_instr(&mut mem, 0, raw); + ctx.pc = 0; + step(&mut ctx, &mut mem); + // Without flush: denorm*1.0 + denorm = 2*denorm ≠ 0. + // With flush: 0*0 + 0 = 0. + assert_eq!(ctx.vr[2].as_f32x4(), [0.0f32; 4]); + } + + /// VMX128 `vnmsubfp128 vD, vA, vB` (key2 = 0b010101). Canary + /// `ppc_emit_altivec.cc:1133-1160` flushes all three inputs in the + /// helper. Semantics: `vD <- -((vA * vB) - vD) = vD - vA*vB`. + #[test] + fn vnmsubfp128_flushes_denormal_inputs() { + let mut ctx = PpcContext::new(); + let mut mem = TestMem::new(); + let denorm = f32::from_bits(1); + ctx.vr[2] = xenia_types::Vec128::from_f32x4_array([denorm; 4]); + ctx.vr[3] = xenia_types::Vec128::from_f32x4_array([1.0f32; 4]); + // vnmsubfp128 vD=v2, vA=v2, vB=v3: key2 = 0b010101 (21) encoded + // via bits 22-25 = 0101 and bit 27 = 1. + let raw: u32 = 0x1440_1950; + write_instr(&mut mem, 0, raw); + ctx.pc = 0; + step(&mut ctx, &mut mem); + // Without flush: denorm - denorm*1.0 = 0 (but the intermediate + // values propagate subnormals through the compute); with flush + // everything is 0 cleanly. + assert_eq!(ctx.vr[2].as_f32x4(), [0.0f32; 4]); + } + + /// VMX128 `vmsum4fp128 vD, vA, vB` computes the 4-lane dot product + /// and broadcasts the result. Canary + /// `ppc_emit_altivec.cc:1077-1084` flushes the *output* denormal + /// (not the inputs). A dot product that sums to a subnormal must + /// read back as 0. + #[test] + fn vmsum4fp128_flushes_denormal_output() { + let mut ctx = PpcContext::new(); + let mut mem = TestMem::new(); + let denorm = f32::from_bits(1); + // Dot product = denorm * 1.0 + 0 + 0 + 0 = denorm. + ctx.vr[2] = xenia_types::Vec128::from_f32x4_array([denorm, 0.0, 0.0, 0.0]); + ctx.vr[3] = xenia_types::Vec128::from_f32x4_array([1.0f32, 0.0, 0.0, 0.0]); + // vmsum4fp128 vD=v2, vA=v2, vB=v3: key2 = 0b011101 (29). + let raw: u32 = 0x1440_19D0; + write_instr(&mut mem, 0, raw); + ctx.pc = 0; + step(&mut ctx, &mut mem); + // Subnormal output must flush to 0 and broadcast across all lanes. + assert_eq!(ctx.vr[2].as_f32x4(), [0.0f32; 4]); + } + + // ---------- Phase 2k: lve*x / stve*x element masking ---------- + + #[test] + fn lvebx_loads_byte_into_ea_slot() { + let mut ctx = PpcContext::new(); + let mut mem = TestMem::new(); + mem.write_u8(0x1003, 0xAB); + ctx.gpr[4] = 0x1003; + ctx.gpr[5] = 0; + // lvebx v1, r4, r5 : (31<<26) | (1<<21) | (4<<16) | (5<<11) | (7<<1) + let raw = (31u32 << 26) | (1 << 21) | (4 << 16) | (5 << 11) | (7 << 1); + write_instr(&mut mem, 0, raw); + ctx.pc = 0; + step(&mut ctx, &mut mem); + let bytes = ctx.vr[1].as_bytes(); + // Byte at slot 3 (EA & 0xF = 3) + assert_eq!(bytes[3], 0xAB); + // Other bytes zero + for i in 0..16 { + if i != 3 { assert_eq!(bytes[i], 0, "byte {} should be zero", i); } + } + } + + #[test] + fn stvewx_stores_only_word_slot() { + let mut ctx = PpcContext::new(); + let mut mem = TestMem::new(); + // Prepare surrounding memory with a sentinel value so we can check non-overwrite. + for i in 0..16 { + mem.write_u8(0x1000 + i, 0x55); + } + // vS lanes: (big-endian view) word0=0xDEADBEEF, word1..3=0. + let mut src = [0u8; 16]; + src[0] = 0xDE; src[1] = 0xAD; src[2] = 0xBE; src[3] = 0xEF; + ctx.vr[1] = xenia_types::Vec128::from_bytes(src); + // EA = 0x1000 (slot 0): store word0 at 0x1000. + ctx.gpr[4] = 0x1000; + ctx.gpr[5] = 0; + // stvewx v1, r4, r5 : (31<<26) | (1<<21) | (4<<16) | (5<<11) | (199<<1) + let raw = (31u32 << 26) | (1 << 21) | (4 << 16) | (5 << 11) | (199 << 1); + write_instr(&mut mem, 0, raw); + ctx.pc = 0; + step(&mut ctx, &mut mem); + assert_eq!(mem.read_u32(0x1000), 0xDEAD_BEEF); + // Adjacent bytes untouched (still 0x55). + for i in 4..16 { + assert_eq!(mem.data[0x1000 + i as usize].get(), 0x55, "byte {} was overwritten", 0x1000+i); + } + } + + // ---------- Phase 2l: reservation cache-line granule ---------- + + #[test] + fn stwcx_succeeds_within_same_cache_line() { + let mut ctx = PpcContext::new(); + let mut mem = TestMem::new(); + mem.write_u32(0x1004, 0xAAAA_AAAA); + ctx.gpr[4] = 0x1004; + ctx.gpr[5] = 0; + // lwarx r3, r4, r5 : (31<<26)|(3<<21)|(4<<16)|(5<<11)|(20<<1) + let ld = (31u32 << 26) | (3 << 21) | (4 << 16) | (5 << 11) | (20 << 1); + write_instr(&mut mem, 0, ld); + ctx.pc = 0; + step(&mut ctx, &mut mem); + assert!(ctx.has_reservation); + // Now stwcx. to a DIFFERENT address in the SAME cache line (offset within 128 bytes). + ctx.gpr[4] = 0x1008; // 4 bytes over; same line. + ctx.gpr[6] = 0xBBBB_BBBB; + let st = (31u32 << 26) | (6 << 21) | (4 << 16) | (5 << 11) | (150 << 1) | 1; + write_instr(&mut mem, 4, st); + step(&mut ctx, &mut mem); + // Matches cache line → succeeds. + assert!(ctx.cr[0].eq); + assert_eq!(mem.read_u32(0x1008), 0xBBBB_BBBB); + } + + #[test] + fn stwcx_fails_across_cache_lines() { + let mut ctx = PpcContext::new(); + let mut mem = TestMem::new(); + ctx.gpr[4] = 0x1000; + ctx.gpr[5] = 0; + let ld = (31u32 << 26) | (3 << 21) | (4 << 16) | (5 << 11) | (20 << 1); + write_instr(&mut mem, 0, ld); + ctx.pc = 0; + step(&mut ctx, &mut mem); + // Different cache line (0x1080). + ctx.gpr[4] = 0x1080; + ctx.gpr[6] = 0xCCCC_CCCC; + let st = (31u32 << 26) | (6 << 21) | (4 << 16) | (5 << 11) | (150 << 1) | 1; + write_instr(&mut mem, 4, st); + step(&mut ctx, &mut mem); + assert!(!ctx.cr[0].eq, "should fail across cache line"); + assert_eq!(mem.read_u32(0x1080), 0, "memory not written on failure"); + } + + // ---------- Phase 2m: SPR DEC + TBL/TBU write ---------- + + #[test] + fn mfspr_dec_returns_dec_field() { + let mut ctx = PpcContext::new(); + let mut mem = TestMem::new(); + ctx.dec = 0x1234_5678; + // mfspr r3, DEC (22). SPR encoded with halves swapped: (22 & 0x1F)<<5 | (22>>5)&0x1F = 0x2C0 in bits 11..20. + // The decoder does the un-swap, so the raw SPR field stores the swapped form. + let spr_swapped = ((22u32 & 0x1F) << 5) | ((22u32 >> 5) & 0x1F); + let raw = (31u32 << 26) | (3 << 21) | (spr_swapped << 11) | (339 << 1); + write_instr(&mut mem, 0, raw); + ctx.pc = 0; + step(&mut ctx, &mut mem); + assert_eq!(ctx.gpr[3], 0x1234_5678); + } + + #[test] + fn mtspr_tbl_write_updates_low_half() { + let mut ctx = PpcContext::new(); + let mut mem = TestMem::new(); + ctx.timebase = 0xAAAA_BBBB_CCCC_DDDD; + ctx.gpr[3] = 0x1111_2222; + // mtspr TBL_WRITE (284), r3 + let spr_swapped = ((284u32 & 0x1F) << 5) | ((284u32 >> 5) & 0x1F); + let raw = (31u32 << 26) | (3 << 21) | (spr_swapped << 11) | (467 << 1); + write_instr(&mut mem, 0, raw); + ctx.pc = 0; + step(&mut ctx, &mut mem); + // step() post-increments timebase by 1, so the observable low half is value+1. + assert_eq!(ctx.timebase & 0xFFFF_FFFF, 0x1111_2222u64 + 1); + assert_eq!(ctx.timebase >> 32, 0xAAAA_BBBB); + } + + // ---------- Block-cache parity tests ---------- + // + // These confirm that running a program through the basic-block + // cache (crate::block_cache::BlockCache + step_block) produces a + // bit-identical PpcContext to running it through step_cached + // (per-instruction). If this ever fails the block cache is not + // safe to engage in production. + + fn enc_addi_t(rd: u32, ra: u32, simm: i16) -> u32 { + (14 << 26) | (rd << 21) | (ra << 16) | (simm as u16 as u32) + } + fn enc_lwz_t(rd: u32, ra: u32, d: i16) -> u32 { + (32 << 26) | (rd << 21) | (ra << 16) | (d as u16 as u32) + } + fn enc_stw_t(rs: u32, ra: u32, d: i16) -> u32 { + (36 << 26) | (rs << 21) | (ra << 16) | (d as u16 as u32) + } + fn enc_b_t(li_words: i32) -> u32 { + // Branch: opcode 18, AA=0, LK=0, LI = li_words << 2 (signed). + let li = (li_words as u32) & 0x00FF_FFFF; + (18u32 << 26) | (li << 2) + } + + /// Snapshot of the parts of `PpcContext` that block_matches_per_instr + /// is asked to keep identical between dispatch paths. Comparing the + /// whole struct is impractical (vector regs, fp regs, large arrays); + /// the GPR file + pc + lr + cr + cycle counters cover everything the + /// interpreter touches in the test programs below. + #[derive(Debug, PartialEq, Eq)] + struct CtxSnap { + gpr: [u64; 32], + pc: u32, + lr: u64, + ctr: u64, + cycle_count: u64, + cr_packed: u32, + } + impl CtxSnap { + fn from(ctx: &PpcContext) -> Self { + Self { + gpr: ctx.gpr, + pc: ctx.pc, + lr: ctx.lr, + ctr: ctx.ctr, + cycle_count: ctx.cycle_count, + cr_packed: ctx.cr(), + } + } + } + + fn run_per_instruction(prog: &[u32], iters: u32, init_gpr: &[(usize, u64)]) -> CtxSnap { + let mut ctx = PpcContext::new(); + for &(i, v) in init_gpr { + ctx.gpr[i] = v; + } + let mut mem = TestMem::new(); + for (i, &raw) in prog.iter().enumerate() { + write_instr(&mut mem, (i as u32) * 4, raw); + } + let mut cache = crate::decoder::DecodeCache::new(); + ctx.pc = 0; + for _ in 0..iters { + // Run one instruction at a time. Memory has constant + // page_version (default trait impl returns 1) so the cache + // entries stay valid forever. + let r = step_cached(&mut ctx, &mut mem, &mut cache, 1); + assert!(matches!(r, StepResult::Continue)); + } + CtxSnap::from(&ctx) + } + + fn run_block(prog: &[u32], iters: u32, init_gpr: &[(usize, u64)]) -> CtxSnap { + let mut ctx = PpcContext::new(); + for &(i, v) in init_gpr { + ctx.gpr[i] = v; + } + let mut mem = TestMem::new(); + for (i, &raw) in prog.iter().enumerate() { + write_instr(&mut mem, (i as u32) * 4, raw); + } + let mut bc = crate::block_cache::BlockCache::new(); + ctx.pc = 0; + let mut total_steps = 0u32; + // Iterate by *blocks* until we've covered at least `iters` + // instructions. The block path runs N instructions per call + // where N is the block length; we still want to compare on a + // per-instruction footing, so accumulate cycle_count. + while total_steps < iters { + // Borrow bc only long enough to copy the slice we need — + // step_block needs &mut MemoryAccess so we can't hold a + // shared borrow on bc across the call. + let block_ptr: *const crate::block_cache::DecodedBlock = { + let b: &crate::block_cache::DecodedBlock = bc.lookup_or_build(ctx.pc, &mem); + b + }; + // Safety: the BlockCache::lookup_or_build contract is that + // the returned reference stays valid until the next + // lookup_or_build on the same cache. We don't call + // lookup_or_build inside step_block and we drop the raw + // pointer at the end of the iteration, so no aliasing. + let block: &crate::block_cache::DecodedBlock = unsafe { &*block_ptr }; + let n_before = ctx.cycle_count; + let r = step_block(&mut ctx, &mut mem, block); + assert!(matches!(r, StepResult::Continue)); + let stepped = (ctx.cycle_count - n_before) as u32; + total_steps += stepped; + } + CtxSnap::from(&ctx) + } + + #[test] + fn block_dispatch_matches_per_instruction_alu_loop() { + // 4-instruction loop: r3 += 1, r3 += 2, r3 += 3, b -12 (back to start). + let prog = [ + enc_addi_t(3, 3, 1), + enc_addi_t(3, 3, 2), + enc_addi_t(3, 3, 3), + enc_b_t(-3), // -3 words → back to instr 0 + ]; + let init = [(3usize, 0u64)]; + let snap_a = run_per_instruction(&prog, 100, &init); + let snap_b = run_block(&prog, 100, &init); + assert_eq!(snap_a, snap_b); + } + + #[test] + fn block_dispatch_matches_per_instruction_loadstore_loop() { + // r4 = 0x800 (data pointer), r3 = 1 + // loop: + // stw r3, 0(r4) + // lwz r5, 0(r4) + // addi r3, r5, 1 + // b -12 + let prog = [ + enc_stw_t(3, 4, 0), + enc_lwz_t(5, 4, 0), + enc_addi_t(3, 5, 1), + enc_b_t(-3), + ]; + let init = [(3usize, 1u64), (4usize, 0x800u64)]; + let snap_a = run_per_instruction(&prog, 200, &init); + let snap_b = run_block(&prog, 200, &init); + assert_eq!(snap_a, snap_b); + } + + #[test] + fn mcrfs_moves_fpscr_nibble_and_clears_exception_bits() { + let mut ctx = PpcContext::new(); + let mut mem = TestMem::new(); + // Set FPSCR bit 0 (FX) = 1 and bit 3 (OX) = 1. In our layout: + // FX at (31-0) = 31 + // OX at (31-3) = 28 + ctx.fpscr = (1u32 << 31) | (1u32 << 28); + // mcrfs crfD=2, crfS=0: (63 << 26) | (crfD<<23) | (crfS<<18) | (64<<1) + let raw = (63 << 26) | (2 << 23) | (0 << 18) | (64 << 1); + write_instr(&mut mem, 0, raw); + ctx.pc = 0; + step(&mut ctx, &mut mem); + // FPSCR bits 0..3 of crfS=0 → ctx.cr[2] should have FX(lt)=1 and OX(so)=0 + // and the FEX/VX nibble positions are bits 1,2 + // Nibble contents: FX=1, FEX=0, VX=0, OX=1 → 0b1001 = 9 + assert_eq!(ctx.cr[2].as_u8(), 0b1001); + // FX and OX are clearable → FPSCR now has those nibble bits cleared + assert_eq!(ctx.fpscr & (1 << 31), 0, "FX cleared"); + assert_eq!(ctx.fpscr & (1 << 28), 0, "OX cleared"); + } + + /// Regression: `subfze` is `RT ← !RA + CA` (no -1 term), so 64-bit + /// carry-out only happens when `RA == 0 && CA == 1`. The previous + /// predicate (`!ra != 0 || ca != 0`) was copy-pasted from `subfme` + /// and reported CA=1 in nearly every case. + #[test] + fn test_subfze_carry_only_when_ra_zero_and_ca_one() { + // subfze rD, rA: opcode 31, XO=200 (bits 22-30), OE=0, Rc=0. + // Encoding: (31<<26) | (rd<<21) | (ra<<16) | (200<<1) + let raw = (31u32 << 26) | (3 << 21) | (4 << 16) | (200 << 1); + + // Case 1: ra=0, ca=1 → CA=1 (the only carry case) + { + let mut ctx = PpcContext::new(); + let mem = TestMem::new(); + write_instr(&mem, 0, raw); + ctx.pc = 0; + ctx.gpr[4] = 0; + ctx.xer_ca = 1; + step(&mut ctx, &mem); + assert_eq!(ctx.xer_ca, 1, "ra=0, ca=1 should produce CA=1"); + assert_eq!(ctx.gpr[3], 0, "result = !0 + 1 = 0 (wraps)"); + } + // Case 2: ra=0, ca=0 → CA=0 (old buggy code reported CA=1) + { + let mut ctx = PpcContext::new(); + let mem = TestMem::new(); + write_instr(&mem, 0, raw); + ctx.pc = 0; + ctx.gpr[4] = 0; + ctx.xer_ca = 0; + step(&mut ctx, &mem); + assert_eq!(ctx.xer_ca, 0, "ra=0, ca=0 should produce CA=0"); + assert_eq!(ctx.gpr[3], u64::MAX, "result = !0 + 0 = u64::MAX"); + } + // Case 3: ra=1, ca=0 → CA=0 (old buggy code reported CA=1) + { + let mut ctx = PpcContext::new(); + let mem = TestMem::new(); + write_instr(&mem, 0, raw); + ctx.pc = 0; + ctx.gpr[4] = 1; + ctx.xer_ca = 0; + step(&mut ctx, &mem); + assert_eq!(ctx.xer_ca, 0, "ra=1, ca=0 should produce CA=0"); + assert_eq!(ctx.gpr[3], u64::MAX - 1, "result = !1 + 0 = u64::MAX - 1"); + } + // Case 4: ra=u64::MAX, ca=0 → CA=0 (old buggy code reported CA=1 + // because !ra == 0 only here, which the buggy `!ra != 0` predicate + // happened to handle right; flip ca=1 to exercise the other arm) + { + let mut ctx = PpcContext::new(); + let mem = TestMem::new(); + write_instr(&mem, 0, raw); + ctx.pc = 0; + ctx.gpr[4] = u64::MAX; + ctx.xer_ca = 1; + step(&mut ctx, &mem); + assert_eq!(ctx.xer_ca, 0, "ra=u64::MAX, ca=1 should produce CA=0"); + assert_eq!(ctx.gpr[3], 1, "result = !u64::MAX + 1 = 1"); + } + } + + /// Regression: `cmp` (L=1) must not derive LT/GT from the sign of a + /// (potentially overflowing) 64-bit subtract. The old code used + /// `update_cr_signed(bf, ra.wrapping_sub(rb))` which mis-signed the + /// result for boundary i64 values like `ra=i64::MIN, rb=1`. + #[test] + fn test_cmp_signed_at_i64_boundaries() { + // cmp BF=0, L=1, RA, RB: (31<<26) | (1<<21) | (ra<<16) | (rb<<11) + // (XO=0; Rc field is reserved on cmp, leave 0) + let raw = |ra: u32, rb: u32| (31u32 << 26) | (1 << 21) | (ra << 16) | (rb << 11); + + // i64::MIN < 1 → LT must be set + { + let mut ctx = PpcContext::new(); + let mem = TestMem::new(); + write_instr(&mem, 0, raw(3, 4)); + ctx.pc = 0; + ctx.gpr[3] = i64::MIN as u64; + ctx.gpr[4] = 1; + step(&mut ctx, &mem); + assert!(ctx.cr[0].lt, "i64::MIN < 1 must be LT"); + assert!(!ctx.cr[0].gt); + assert!(!ctx.cr[0].eq); + } + // i64::MAX > -1 → GT must be set (the symmetric overflow corner) + { + let mut ctx = PpcContext::new(); + let mem = TestMem::new(); + write_instr(&mem, 0, raw(3, 4)); + ctx.pc = 0; + ctx.gpr[3] = i64::MAX as u64; + ctx.gpr[4] = (-1i64) as u64; + step(&mut ctx, &mem); + assert!(!ctx.cr[0].lt); + assert!(ctx.cr[0].gt, "i64::MAX > -1 must be GT"); + assert!(!ctx.cr[0].eq); + } + // Equal at the extreme is still EQ + { + let mut ctx = PpcContext::new(); + let mem = TestMem::new(); + write_instr(&mem, 0, raw(3, 4)); + ctx.pc = 0; + ctx.gpr[3] = i64::MIN as u64; + ctx.gpr[4] = i64::MIN as u64; + step(&mut ctx, &mem); + assert!(!ctx.cr[0].lt); + assert!(!ctx.cr[0].gt); + assert!(ctx.cr[0].eq, "i64::MIN == i64::MIN must be EQ"); + } + } + + /// Regression: `lvebx` must preserve the prior contents of the + /// destination VR for lanes other than the loaded byte. Previously + /// the handler started from a zeroed buffer. + #[test] + fn test_lvebx_preserves_other_lanes() { + let mut ctx = PpcContext::new(); + let mem = TestMem::new(); + // Pre-seed vr[3] with a recognizable pattern. + let pattern: [u8; 16] = [ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, + ]; + ctx.vr[3] = xenia_types::Vec128::from_bytes(pattern); + // Place a byte at memory address 0x1004; EA & 0xF == 4 → slot 4. + mem.write_u8(0x1004, 0xAB); + // r4 = 0x1000, r5 = 4 → EA = 0x1004 + ctx.gpr[4] = 0x1000; + ctx.gpr[5] = 4; + // lvebx vD=3, rA=4, rB=5: opcode 31, XO=7 → (31<<26)|(3<<21)|(4<<16)|(5<<11)|(7<<1) + let raw = (31u32 << 26) | (3 << 21) | (4 << 16) | (5 << 11) | (7 << 1); + write_instr(&mem, 0, raw); + ctx.pc = 0; + step(&mut ctx, &mem); + // Expected: lane 4 holds 0xAB, every other lane unchanged. + let mut expected = pattern; + expected[4] = 0xAB; + assert_eq!(ctx.vr[3].as_bytes(), expected); + } } diff --git a/crates/xenia-cpu/src/lib.rs b/crates/xenia-cpu/src/lib.rs index b84cb73..576a79e 100644 --- a/crates/xenia-cpu/src/lib.rs +++ b/crates/xenia-cpu/src/lib.rs @@ -1,9 +1,25 @@ +pub mod block_cache; pub mod context; pub mod decoder; pub mod disasm; +pub mod fpscr; pub mod interpreter; pub mod opcode; +pub mod overflow; +pub mod phaser; +pub mod reservation; +pub mod scheduler; +pub mod trap; +pub mod vmx; pub use context::PpcContext; pub use decoder::decode; +pub use disasm::{DisasmItem, DisasmText, disassemble, format as disasm_format, iter_disasm}; pub use opcode::PpcOpcode; +pub use phaser::{Phaser, PhaserOutcome}; +pub use reservation::ReservationTable; +pub use scheduler::{ + BlockReason, GuestThread, HwSlot, HwState, MigrationFixup, OrderMode, PcrWriter, RoundOutcome, + Scheduler, SpawnError, SpawnParams, ThreadRef, HW_THREAD_COUNT, INITIAL_GUEST_TID, + QUANTUM_DEFAULT, +}; diff --git a/crates/xenia-cpu/src/opcode.rs b/crates/xenia-cpu/src/opcode.rs index 01fb77c..e3361ec 100644 --- a/crates/xenia-cpu/src/opcode.rs +++ b/crates/xenia-cpu/src/opcode.rs @@ -145,6 +145,33 @@ impl PpcOpcode { matches!(self, Self::sc) } + /// Returns true if this opcode unconditionally ends a basic block: + /// any branch, system call, trap, or `Invalid` (decoder couldn't + /// recognize the instruction — execution will hit the + /// `Unimplemented` arm and we don't want to swallow the boundary + /// inside a cached block). + /// + /// Notably *not* terminating: `mtmsr`/`mtmsrd`/`isync`/`mfmsr`. + /// On real hardware these have synchronization semantics (a context + /// synchronizing event for `isync`, MSR rewrite for the `mt*`s) but + /// our interpreter has no asynchronous-exception model and no + /// out-of-order execution — they execute as plain ALU/move ops and + /// don't change control flow synchronously. Block-cache replay is + /// still bit-for-bit identical to per-instruction dispatch for + /// those. + /// + /// Used by the basic-block cache (`block_cache.rs`) to know when to + /// stop accumulating instructions during a forward decode walk. + pub fn terminates_block(&self) -> bool { + matches!( + self, + Self::bx | Self::bcx | Self::bclrx | Self::bcctrx + | Self::sc + | Self::td | Self::tdi | Self::tw | Self::twi + | Self::Invalid + ) + } + /// Returns true if this is a load instruction. pub fn is_load(&self) -> bool { matches!(self, @@ -194,3 +221,60 @@ impl std::fmt::Display for PpcOpcode { std::fmt::Debug::fmt(self, f) } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn terminates_block_includes_all_branches() { + assert!(PpcOpcode::bx.terminates_block()); + assert!(PpcOpcode::bcx.terminates_block()); + assert!(PpcOpcode::bclrx.terminates_block()); + assert!(PpcOpcode::bcctrx.terminates_block()); + } + + #[test] + fn terminates_block_includes_sc_and_traps() { + assert!(PpcOpcode::sc.terminates_block()); + assert!(PpcOpcode::td.terminates_block()); + assert!(PpcOpcode::tdi.terminates_block()); + assert!(PpcOpcode::tw.terminates_block()); + assert!(PpcOpcode::twi.terminates_block()); + } + + #[test] + fn terminates_block_includes_invalid() { + // Decoder failure must end the block — otherwise an unknown + // opcode would be replayed inside a cached block without going + // through the per-instruction Unimplemented path. + assert!(PpcOpcode::Invalid.terminates_block()); + } + + #[test] + fn terminates_block_excludes_straight_line_ops() { + // Common ALU and load/store ops must NOT terminate a block. + assert!(!PpcOpcode::addi.terminates_block()); + assert!(!PpcOpcode::addis.terminates_block()); + assert!(!PpcOpcode::addx.terminates_block()); + assert!(!PpcOpcode::cmpi.terminates_block()); + assert!(!PpcOpcode::cmp.terminates_block()); + assert!(!PpcOpcode::lwz.terminates_block()); + assert!(!PpcOpcode::stw.terminates_block()); + assert!(!PpcOpcode::lbzx.terminates_block()); + assert!(!PpcOpcode::ori.terminates_block()); + assert!(!PpcOpcode::oris.terminates_block()); + assert!(!PpcOpcode::rlwinmx.terminates_block()); + } + + #[test] + fn terminates_block_excludes_msr_and_sync_ops() { + // Documented decision: synchronizing ops execute as ALU within + // a block since the interpreter has no async-exception model. + assert!(!PpcOpcode::mtmsr.terminates_block()); + assert!(!PpcOpcode::mtmsrd.terminates_block()); + assert!(!PpcOpcode::isync.terminates_block()); + assert!(!PpcOpcode::sync.terminates_block()); + assert!(!PpcOpcode::mfmsr.terminates_block()); + } +} diff --git a/crates/xenia-cpu/src/overflow.rs b/crates/xenia-cpu/src/overflow.rs new file mode 100644 index 0000000..a55f505 --- /dev/null +++ b/crates/xenia-cpu/src/overflow.rs @@ -0,0 +1,173 @@ +//! OE / XER[OV] / XER[SO] handling for integer arithmetic. +//! +//! PPC integer ops with the OE bit set update XER[OV] (overflow) and sticky-set +//! XER[SO]. When OE is clear the instruction leaves XER untouched. Signed +//! overflow is predicated on the operation width and operand signs per the +//! PowerISA pseudocode. For 32-bit-word operations (`addw`, `mullw`, `divw`, +//! `neg`, etc. — on PPC these all have `w` in the mnemonic in spec +//! descriptions even when the assembler spells them without) the predicate +//! uses the low 32 bits. For 64-bit operations (`add`, `mulld`, `divd`) the +//! predicate uses the full 64 bits. + +use crate::context::PpcContext; + +#[inline] +pub fn apply(ctx: &mut PpcContext, overflowed: bool) { + if overflowed { + ctx.xer_ov = 1; + ctx.xer_so = 1; + } else { + ctx.xer_ov = 0; + } +} + +/// Signed addition overflow at width-64 (plain `add`, `addc`, `subf`, `subfc`). +/// +/// Predicate: same-sign inputs with opposite-sign result. +/// For sub callers, rewrite as `a + b'` first (see `_sub`). +#[inline] +pub fn add_ov_64(a: u64, b: u64, result: u64) -> bool { + ((!(a ^ b)) & (a ^ result)) >> 63 != 0 +} + +/// Universal signed-overflow predicate for 64-bit arithmetic. +/// +/// Caller computes the mathematical (infinite-precision) signed sum as i128, +/// plus the stored 64-bit result. Overflow iff the two disagree — i.e. the +/// true value doesn't fit in i64. +/// +/// Use this for multi-term chains (`adde`, `addme`, `addze`, `subfe`, `subfme`, +/// `subfze`) where the carry-in makes the bit-predicate above awkward. +#[inline] +pub fn sum_overflow_64(true_sum: i128, result: u64) -> bool { + true_sum != (result as i64) as i128 +} + +/// Signed subtraction: RT = b - a. Overflow iff opposite-sign inputs with +/// result sign != b's sign. Equivalently, reduce to addition with `!a + 1`. +#[inline] +pub fn sub_ov_64(a: u64, b: u64, result: u64) -> bool { + ((a ^ b) & (b ^ result)) >> 63 != 0 +} + +/// Signed `addc`/`adde` chain overflow. Same rule as `add_ov_64` — the carry +/// in doesn't alter the sign predicate directly because it's already folded +/// into the stored result. +#[inline] +pub fn adde_ov_64(a: u64, b: u64, result: u64) -> bool { + add_ov_64(a, b, result) +} + +/// Signed 32-bit multiply overflow (`mullwo`): result fits in 32 bits signed +/// iff bit 32 equals bits 33..63 of the 64-bit product. +#[inline] +pub fn mullw_ov(product: i64) -> bool { + let lo = product as i32 as i64; + lo != product +} + +/// Signed 64-bit multiply overflow (`mulldo`). Detected via checked_mul. +#[inline] +pub fn mulld_ov(a: i64, b: i64) -> bool { + a.checked_mul(b).is_none() +} + +/// `divwo` / `divwuo` / `divdo` / `divduo` raise OV in two cases: +/// * divisor is zero, or +/// * signed division of `INT_MIN / -1` (quotient doesn't fit). +#[inline] +pub fn divw_ov_signed(ra: i32, rb: i32) -> bool { + rb == 0 || (ra == i32::MIN && rb == -1) +} + +#[inline] +pub fn divw_ov_unsigned(rb: u32) -> bool { + rb == 0 +} + +#[inline] +pub fn divd_ov_signed(ra: i64, rb: i64) -> bool { + rb == 0 || (ra == i64::MIN && rb == -1) +} + +#[inline] +pub fn divd_ov_unsigned(rb: u64) -> bool { + rb == 0 +} + +/// `negx`: RT = -(RA). Overflow only when RA = INT_MIN (the negation doesn't fit). +#[inline] +pub fn neg_ov_64(ra: u64) -> bool { + ra == 0x8000_0000_0000_0000 +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn add_no_overflow() { + assert!(!add_ov_64(1, 2, 3)); + assert!(!add_ov_64(u64::MAX, 0, u64::MAX)); + } + + #[test] + fn add_positive_overflow() { + // INT64_MAX + 1 = INT64_MIN — signed overflow + let a = i64::MAX as u64; + let b = 1u64; + let r = a.wrapping_add(b); + assert!(add_ov_64(a, b, r)); + } + + #[test] + fn add_negative_overflow() { + // INT64_MIN + -1 = INT64_MAX — signed overflow + let a = i64::MIN as u64; + let b = (-1i64) as u64; + let r = a.wrapping_add(b); + assert!(add_ov_64(a, b, r)); + } + + #[test] + fn sub_overflow_min_minus_pos() { + // INT64_MIN - 1 overflows + let b = i64::MIN as u64; + let a = 1u64; + let r = b.wrapping_sub(a); + assert!(sub_ov_64(a, b, r)); + } + + #[test] + fn sub_no_overflow() { + let b = 5u64; + let a = 2u64; + let r = b.wrapping_sub(a); + assert!(!sub_ov_64(a, b, r)); + } + + #[test] + fn mullw_fits_32_bits() { + assert!(!mullw_ov((i32::MAX as i64) * 1)); + assert!(!mullw_ov(-1i64)); + } + + #[test] + fn mullw_overflows_32_bits() { + let p = (i32::MAX as i64) * 2; + assert!(mullw_ov(p)); + } + + #[test] + fn mulld_overflows() { + assert!(mulld_ov(i64::MAX, 2)); + assert!(!mulld_ov(i64::MAX, 1)); + } + + #[test] + fn neg_ov_only_at_min() { + assert!(neg_ov_64(i64::MIN as u64)); + assert!(!neg_ov_64(0)); + assert!(!neg_ov_64(1)); + } +} diff --git a/crates/xenia-cpu/src/phaser.rs b/crates/xenia-cpu/src/phaser.rs new file mode 100644 index 0000000..e9d2fcf --- /dev/null +++ b/crates/xenia-cpu/src/phaser.rs @@ -0,0 +1,345 @@ +//! Quantum-boundary phaser for the M3 per-HW-thread parallel scheduler. +//! +//! Six [`super::HW_THREAD_COUNT`] host threads run their slots' interpreters +//! in parallel, then meet at a phaser to advance to the next quantum. This +//! is **not** [`std::sync::Barrier`]: a Barrier needs a fixed party count, +//! but our slots can become idle (no runnable thread) and shouldn't block +//! the phaser arrival. +//! +//! ## Semantics +//! +//! - Each slot at the end of its quantum either calls +//! [`Phaser::arrive_and_wait`] (it has a runnable thread to run next +//! quantum) or [`Phaser::skip`] (it's idle this round and will wake on +//! `slot_wake[i]`). +//! - The phase advances when **all 6 slots have either arrived or +//! skipped**. Arrived slots block until the advance; skipped slots +//! return immediately and re-poll their wake state. +//! - The phaser uses a generation counter so a slot that arrives "early" +//! in the next phase doesn't see the prior phase's "all arrived" +//! condition. +//! - Defensive timeout: [`Phaser::arrive_and_wait_timeout`] returns +//! [`PhaserOutcome::Timeout`] if a peer crashes / hangs. Callers +//! typically convert this into a graceful shutdown rather than +//! panicking, so the rest of the topology can tear down cleanly. +//! +//! ## Memory ordering +//! +//! - The participant counter (`arrived` + `skipped`) uses `AcqRel` on +//! the increment so the last-to-arrive thread sees a consistent +//! "everyone is here" snapshot. +//! - The generation `phase` is read with `Acquire` in arrivers' wait +//! loops; the advancing thread stores with `Release` after bumping. +//! - The condvar's broadcast publishes the phase; the wait loop +//! re-checks `phase` against its captured value to defend against +//! spurious wakeups. + +use std::sync::atomic::{AtomicU32, Ordering}; +use std::sync::{Condvar, Mutex}; +use std::time::{Duration, Instant}; + +/// Outcome of a phaser arrival. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum PhaserOutcome { + /// All participants arrived/skipped — phase advanced. Caller proceeds + /// into the next quantum. + Advanced, + /// Defensive timeout fired before all peers arrived. Caller should + /// log + initiate shutdown rather than retry. + Timeout, + /// Phaser was shut down via [`Phaser::shutdown`]; all waiters are + /// woken and return this. Caller exits cleanly. + Shutdown, +} + +/// Custom barrier-with-skip primitive. Construct once with the number of +/// participating slots; share via `Arc` across host threads. +pub struct Phaser { + /// Total participant count (constant after construction). For our + /// scheduler this is `HW_THREAD_COUNT = 6`. + party_count: u32, + /// Monotonic phase counter, incremented every time the phase + /// advances. Used as a generation marker so a slot that wakes "into" + /// the next phase doesn't observe the old "everyone arrived" state. + phase: AtomicU32, + /// Inner state guarded by the condvar's mutex. + inner: Mutex, + /// Notified when a phase advances or shutdown fires. + cv: Condvar, +} + +#[derive(Debug)] +struct Inner { + arrived_or_skipped: u32, + shutdown: bool, +} + +impl Phaser { + /// Create a phaser with `party_count` participants. Panics if + /// `party_count == 0`. + pub fn new(party_count: u32) -> Self { + assert!(party_count > 0, "phaser party_count must be > 0"); + Self { + party_count, + phase: AtomicU32::new(0), + inner: Mutex::new(Inner { + arrived_or_skipped: 0, + shutdown: false, + }), + cv: Condvar::new(), + } + } + + /// Get the current phase number. Useful for tests and observability. + pub fn current_phase(&self) -> u32 { + self.phase.load(Ordering::Acquire) + } + + /// Mark this slot as not participating in the current phase. Counts + /// toward the advance threshold but does not block. Used when a slot + /// has no runnable thread and is parked waiting on + /// `slot_wake[i].unpark()`. + /// + /// `_slot_id` is informational (not stored); the parameter exists so + /// call sites stay greppable. + pub fn skip(&self, _slot_id: u8) { + self.contribute_advance(); + } + + /// Block until the phase advances or the defensive 5-second timeout + /// fires. Returns [`PhaserOutcome::Advanced`] on a clean phase + /// transition; [`Timeout`] if a peer hung; [`Shutdown`] on tear-down. + /// + /// `_slot_id` is informational (see [`Self::skip`]). + pub fn arrive_and_wait(&self, _slot_id: u8) -> PhaserOutcome { + self.arrive_and_wait_timeout(Duration::from_secs(5)) + } + + /// Same as [`Self::arrive_and_wait`] with a caller-supplied timeout. + pub fn arrive_and_wait_timeout(&self, timeout: Duration) -> PhaserOutcome { + let pre_phase = self.phase.load(Ordering::Acquire); + self.contribute_advance(); + let deadline = Instant::now() + timeout; + let mut guard = self.inner.lock().unwrap(); + loop { + if guard.shutdown { + return PhaserOutcome::Shutdown; + } + if self.phase.load(Ordering::Acquire) != pre_phase { + return PhaserOutcome::Advanced; + } + let now = Instant::now(); + if now >= deadline { + return PhaserOutcome::Timeout; + } + let remaining = deadline - now; + let result = self.cv.wait_timeout(guard, remaining).unwrap(); + guard = result.0; + if result.1.timed_out() { + // Loop once more to disambiguate "real timeout" vs + // "spurious wakeup just before the deadline". + if self.phase.load(Ordering::Acquire) != pre_phase { + return PhaserOutcome::Advanced; + } + if guard.shutdown { + return PhaserOutcome::Shutdown; + } + return PhaserOutcome::Timeout; + } + } + } + + /// Wake every parked arriver and signal shutdown. After this, all + /// future and outstanding `arrive_and_wait_*` calls return + /// [`PhaserOutcome::Shutdown`]. + pub fn shutdown(&self) { + let mut guard = self.inner.lock().unwrap(); + guard.shutdown = true; + self.cv.notify_all(); + } + + /// Common path for both arrive-and-wait and skip: bump the + /// participant counter, and if we were the last one in, advance the + /// phase + broadcast. + fn contribute_advance(&self) { + let mut guard = self.inner.lock().unwrap(); + guard.arrived_or_skipped += 1; + if guard.arrived_or_skipped >= self.party_count { + // Last one in. Reset the counter, bump the phase, broadcast. + guard.arrived_or_skipped = 0; + // `Release` on the phase store pairs with `Acquire` reads in + // arriving slots' wait-loop predicates. + self.phase.fetch_add(1, Ordering::Release); + self.cv.notify_all(); + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::sync::Arc; + use std::sync::atomic::AtomicU32; + use std::thread; + + /// All N participants arrive — phase advances, every arriver returns + /// `Advanced`. + #[test] + fn n_arrivers_all_advance() { + const N: u32 = 6; + let p = Arc::new(Phaser::new(N)); + let mut handles = Vec::new(); + for i in 0..N { + let p = p.clone(); + handles.push( + thread::Builder::new() + .name(format!("phaser-test-{i}")) + .spawn(move || p.arrive_and_wait(i as u8)) + .unwrap(), + ); + } + for h in handles { + assert_eq!(h.join().unwrap(), PhaserOutcome::Advanced); + } + assert_eq!(p.current_phase(), 1); + } + + /// 5 arrive + 1 skip → phase advances; arrivers see `Advanced`. + #[test] + fn skip_counts_toward_advance() { + const N: u32 = 6; + let p = Arc::new(Phaser::new(N)); + let mut handles = Vec::new(); + for i in 0..(N - 1) { + let p = p.clone(); + handles.push( + thread::Builder::new() + .name(format!("phaser-arrive-{i}")) + .spawn(move || p.arrive_and_wait(i as u8)) + .unwrap(), + ); + } + // Brief pause to let arrivers park first (exercising the + // skip-unblocks-arrivers path). + thread::sleep(Duration::from_millis(20)); + p.skip((N - 1) as u8); + for h in handles { + assert_eq!(h.join().unwrap(), PhaserOutcome::Advanced); + } + assert_eq!(p.current_phase(), 1); + } + + /// Shutdown wakes parked arrivers; they return `Shutdown`. + #[test] + fn shutdown_wakes_arrivers() { + const N: u32 = 6; + let p = Arc::new(Phaser::new(N)); + let mut handles = Vec::new(); + // Only N-1 arrive — phase will not advance. + for i in 0..(N - 1) { + let p = p.clone(); + handles.push( + thread::Builder::new() + .name(format!("phaser-arrive-shutdown-{i}")) + .spawn(move || p.arrive_and_wait(i as u8)) + .unwrap(), + ); + } + thread::sleep(Duration::from_millis(20)); + p.shutdown(); + for h in handles { + assert_eq!(h.join().unwrap(), PhaserOutcome::Shutdown); + } + } + + /// Defensive timeout: if some peers never arrive, others surface + /// `Timeout` rather than blocking forever. + #[test] + fn timeout_fires_when_peer_hangs() { + const N: u32 = 4; + let p = Arc::new(Phaser::new(N)); + // Only 2 of 4 arrive — others "hang". + let p1 = p.clone(); + let h1 = thread::spawn(move || { + p1.arrive_and_wait_timeout(Duration::from_millis(50)) + }); + let p2 = p.clone(); + let h2 = thread::spawn(move || { + p2.arrive_and_wait_timeout(Duration::from_millis(50)) + }); + assert_eq!(h1.join().unwrap(), PhaserOutcome::Timeout); + assert_eq!(h2.join().unwrap(), PhaserOutcome::Timeout); + } + + /// Multi-phase stress: all participants run a tight loop of + /// arrive_and_wait calls; after K phases they all observe the same + /// `current_phase()` value. Catches generation/counter resync bugs. + #[test] + fn multi_phase_progress() { + const N: u32 = 6; + const K: u32 = 1000; + let p = Arc::new(Phaser::new(N)); + let counter = Arc::new(AtomicU32::new(0)); + let mut handles = Vec::new(); + for i in 0..N { + let p = p.clone(); + let c = counter.clone(); + handles.push( + thread::Builder::new() + .name(format!("phaser-multi-{i}")) + .spawn(move || { + for _ in 0..K { + assert_eq!( + p.arrive_and_wait(i as u8), + PhaserOutcome::Advanced + ); + } + c.fetch_add(1, Ordering::Relaxed); + }) + .unwrap(), + ); + } + for h in handles { + h.join().unwrap(); + } + assert_eq!(counter.load(Ordering::Relaxed), N); + assert_eq!(p.current_phase(), K); + } + + /// Mixed skip/arrive across phases — emulates the realistic scheduler + /// pattern where slots become idle for some quanta. + #[test] + fn mixed_skip_and_arrive_random() { + const N: u32 = 6; + const K: u32 = 200; + let p = Arc::new(Phaser::new(N)); + let mut handles = Vec::new(); + for i in 0..N { + let p = p.clone(); + handles.push( + thread::Builder::new() + .name(format!("phaser-mixed-{i}")) + .spawn(move || { + // Pseudo-random skip pattern based on slot+phase + let mut state: u32 = 0x9E37_79B9u32.wrapping_add(i); + for phase in 0..K { + state = state.wrapping_mul(0x6C8E_9CF7).wrapping_add(phase); + if state & 0xF == 0 { + p.skip(i as u8); + } else { + let _ = p.arrive_and_wait(i as u8); + } + } + }) + .unwrap(), + ); + } + for h in handles { + h.join().unwrap(); + } + // After K rounds with all-N participation each phase, the phase + // counter equals K. Each iteration contributes exactly N to the + // counter (split between arrive and skip). + assert_eq!(p.current_phase(), K); + } +} diff --git a/crates/xenia-cpu/src/reservation.rs b/crates/xenia-cpu/src/reservation.rs new file mode 100644 index 0000000..247b8a9 --- /dev/null +++ b/crates/xenia-cpu/src/reservation.rs @@ -0,0 +1,424 @@ +//! Inter-thread reservation table for `lwarx`/`stwcx.` and +//! `ldarx`/`stdcx.`. +//! +//! On real Xenon, each core's `lwarx` places a reservation on a 128-byte +//! cache line; any other CPU's store to the line invalidates the +//! reservation. `stwcx.`'s success depends on the reservation still being +//! valid. Under M3's per-HW-thread parallelism, we need an inter-thread +//! mechanism for the same guarantee. +//! +//! M2 introduces the table behind a runtime `reservations_enabled` flag +//! (default `false`). When the flag is `false`, the interpreter's +//! existing per-`PpcContext` `reserved_line`/`has_reservation` fields are +//! used as-is — no inter-thread tracking. M3 flips the flag on once the +//! per-HW-thread host threads are spawning. +//! +//! ## Design +//! +//! - **Banked AtomicU64 array** of [`NUM_LINES`] entries (4096 × 8 B = +//! 32 KiB total). Each entry packs `(line_address, generation, +//! hw_id)`. A zero value means "no reservation on this bank". +//! - **Hash function**: `(line >> 7) & (NUM_LINES - 1)`. Different lines +//! that map to the same bank conservatively invalidate each other's +//! reservations — sound (real Xenon's L2 has finite associativity and +//! has the same property), at the cost of slightly more `stwcx.` +//! failures than a perfect-mapping table would produce. +//! - **`active_reservers: AtomicU16`** — a fast-path counter +//! incremented by every `lwarx` and decremented when its reservation is +//! either committed or invalidated. `write_u32` checks this with a +//! single `Relaxed` load; when zero (the common case in code that +//! doesn't use atomics), the invalidation hook is a one-instruction +//! skip. +//! - **Generation counter**: monotonic across all reservations, +//! incremented atomically. 24 bits of generation packed in the slot +//! means 16 M reuses per slot before wraparound; at multi-million +//! reservations/sec sustained that's still many seconds, and a +//! stale-gen `stwcx.` simply fails (sound, not livelocking). +//! +//! ## Invariants +//! +//! 1. A `stwcx.(addr)` succeeds only if the line slot still holds the +//! same `(line, gen, hw_id)` triple the reserver stamped at `lwarx`. +//! 2. Any plain store to a reserved line invalidates it (slot CASed to +//! zero). Hash-collision side-effect: a store to a different line +//! that maps to the same bank also invalidates — guests that observe +//! a `stwcx.` failure simply retry, so this is correctness-preserving. +//! 3. `stwcx.` from a different `hw_id` than the reserver fails even if +//! the line and gen would otherwise match — only the originating HW +//! thread can commit its own reservation. +//! +//! Memory ordering: all CAS / store operations on the line slot use +//! `AcqRel`; readers use `Acquire`. The store inside `stwcx.`'s payload +//! itself (the actual data write) is the caller's responsibility — see +//! [`crate::interpreter`]'s `stwcx.` arm. + +use std::sync::atomic::{AtomicU16, AtomicU64, Ordering}; + +/// Real Xenon L2 cache-line size — the granule a reservation covers. +pub const LINE_BYTES: u32 = 0x80; +/// Mask to align an address to a cache-line boundary. +pub const LINE_MASK: u32 = !(LINE_BYTES - 1); +/// Number of bank entries in the reservation table. Power of two so the +/// hash is a single AND. 32 KiB total at 8 B per entry. +pub const NUM_LINES: usize = 4096; +const HASH_MASK: u32 = (NUM_LINES as u32) - 1; + +/// Pack `(line_addr, generation, hw_id)` into a single u64. The packed +/// layout is: +/// bits 63..32: line address (we only need the high bits since the +/// low 7 are always zero — reserved range is line-aligned) +/// bits 31..8: 24-bit generation +/// bits 7..0: 8-bit `hw_id` +/// +/// A packed value of `0` means "no reservation". Since we never reserve +/// on guest virtual address `0` (the page is unmapped) and the +/// generation increments from `1`, zero is a safe sentinel. +#[inline] +pub fn pack(line_addr: u32, generation: u32, hw_id: u8) -> u64 { + debug_assert!(line_addr & !LINE_MASK == 0, "line_addr must be line-aligned"); + debug_assert!(generation < (1 << 24), "generation must fit in 24 bits"); + ((line_addr as u64) << 32) + | ((generation as u64 & 0xFF_FFFF) << 8) + | (hw_id as u64) +} + +/// Inverse of [`pack`]. Returns `None` if the value is the zero sentinel +/// (no reservation). +#[inline] +pub fn unpack(raw: u64) -> Option<(u32, u32, u8)> { + if raw == 0 { + return None; + } + let line = (raw >> 32) as u32; + let generation = ((raw >> 8) & 0xFF_FFFF) as u32; + let hw_id = (raw & 0xFF) as u8; + Some((line, generation, hw_id)) +} + +#[inline] +fn hash(line_addr: u32) -> usize { + ((line_addr >> 7) & HASH_MASK) as usize +} + +#[inline] +fn align_to_line(addr: u32) -> u32 { + addr & LINE_MASK +} + +/// Banked reservation table shared across all emulated HW threads. Built +/// once per emulation instance; lives behind an `Arc` so worker host +/// threads (M3) can hold their own clones without lifetime gymnastics. +pub struct ReservationTable { + lines: Vec, + active_reservers: AtomicU16, + next_gen: AtomicU64, + /// Runtime activation flag. Default `false`. M2.8's + /// `--reservations-table` flag (or M3 spawn) flips this to `true`, + /// at which point the interpreter's `lwarx`/`stwcx.` arms route + /// through the table; otherwise they use the legacy per-`PpcContext` + /// reservation fields. + enabled: std::sync::atomic::AtomicBool, +} + +impl Default for ReservationTable { + fn default() -> Self { + Self::new() + } +} + +impl ReservationTable { + /// Construct a fresh table with all banks empty. + pub fn new() -> Self { + let mut lines = Vec::with_capacity(NUM_LINES); + for _ in 0..NUM_LINES { + lines.push(AtomicU64::new(0)); + } + Self { + lines, + active_reservers: AtomicU16::new(0), + // Start at 1 so the very first reservation gets a non-zero + // gen and the packed slot value is non-zero (zero is the + // "no reservation" sentinel). + next_gen: AtomicU64::new(1), + enabled: std::sync::atomic::AtomicBool::new(false), + } + } + + /// Activate the table. The interpreter's `lwarx`/`stwcx.` arms will + /// route through this table on subsequent dispatches. Idempotent. + pub fn enable(&self) { + self.enabled + .store(true, std::sync::atomic::Ordering::Release); + } + + /// Deactivate the table. The interpreter falls back to per-`PpcContext` + /// reservation fields. Idempotent. + pub fn disable(&self) { + self.enabled + .store(false, std::sync::atomic::Ordering::Release); + } + + /// Whether the table is currently active. The interpreter consults + /// this on every `lwarx`/`stwcx.` to decide which path runs. + pub fn is_enabled(&self) -> bool { + self.enabled.load(std::sync::atomic::Ordering::Acquire) + } + + /// True when at least one reservation is currently outstanding. + /// Plain `write_u32` consults this to skip the invalidation hook + /// when no thread holds a reservation — the common case for + /// non-atomic code. + #[inline] + pub fn has_active_reservers(&self) -> bool { + self.active_reservers.load(Ordering::Relaxed) > 0 + } + + /// `lwarx(addr)` — claim a reservation on the line containing `addr`. + /// Returns the generation stamped into the slot; the interpreter + /// stores this alongside the per-`PpcContext` `has_reservation` bit + /// so a subsequent `stwcx.` can verify the same gen still holds. + /// + /// If a different reservation already occupied the bank, it's + /// silently overwritten — that thread's `stwcx.` will fail because + /// the slot no longer matches its stamped gen. Matches Xenon + /// behavior (a different core's lwarx on the same line displaces + /// any prior reservation). + pub fn reserve(&self, addr: u32, hw_id: u8) -> u32 { + let line = align_to_line(addr); + let generation = (self + .next_gen + .fetch_add(1, Ordering::Relaxed) + & 0xFF_FFFF) as u32; + let new_raw = pack(line, generation, hw_id); + // Release: prior reads of the reservation target should + // happen-before any thread that observes the new slot value. + let prev = self.lines[hash(line)].swap(new_raw, Ordering::AcqRel); + // If the previous slot was non-zero, the displaced reserver is + // implicitly invalidated — decrement the active counter for it. + // Else, increment for our new reservation. Net effect: the + // counter equals the number of *bank slots* with a non-zero + // value, which is an upper bound on actual reservers. + if prev == 0 { + self.active_reservers.fetch_add(1, Ordering::Relaxed); + } + generation + } + + /// `stwcx.(addr)` — try to commit a reservation. Returns `true` if + /// the slot still holds `(line, my_gen, my_hw_id)` (in which case + /// it's CAS'd back to zero, releasing the bank), `false` otherwise. + /// The data store itself is the caller's responsibility — see + /// [`crate::interpreter`]'s `stwcx.` arm. + pub fn try_commit(&self, addr: u32, my_gen: u32, my_hw_id: u8) -> bool { + let line = align_to_line(addr); + let expected = pack(line, my_gen, my_hw_id); + match self.lines[hash(line)].compare_exchange( + expected, + 0, + Ordering::AcqRel, + Ordering::Relaxed, + ) { + Ok(_) => { + // Successfully released the slot; decrement the active + // count. + self.active_reservers.fetch_sub(1, Ordering::Relaxed); + true + } + Err(_) => false, + } + } + + /// Hook for plain (non-reserving) stores: invalidate any + /// reservation on the containing line. Cheap when the bank is + /// already empty (single Acquire load + branch). + pub fn invalidate_for_write(&self, addr: u32) { + let line = align_to_line(addr); + let bank = &self.lines[hash(line)]; + let prev = bank.load(Ordering::Acquire); + if prev == 0 { + return; + } + // Verify the slot still holds a reservation on *this* line + // before clearing — hash collisions mean the bank may hold a + // reservation on an unrelated line that maps to the same slot. + // Real Xenon has the same property (limited L2 associativity); + // we mirror it here. A spurious bank match invalidates a + // different line's reservation; the affected `stwcx.` retries — + // sound, slightly less efficient. + if let Some((bank_line, _generation, _hw)) = unpack(prev) { + if bank_line != line { + // Different line in the same bank — leave it alone (we + // chose not to invalidate cross-line collisions to + // reduce false-fail noise; real-HW behavior is similar + // since L2 associativity sets cross-line constraints). + return; + } + } + // CAS-clear the bank if it still holds the value we observed. + // If a concurrent `stwcx.` or `reserve` raced with us, the CAS + // fails — that's fine; the line slot is now in a different + // state and the displaced reservation will be picked up there. + if bank + .compare_exchange(prev, 0, Ordering::AcqRel, Ordering::Relaxed) + .is_ok() + { + self.active_reservers.fetch_sub(1, Ordering::Relaxed); + } + } + + /// Drop a per-`PpcContext` reservation without committing. Called + /// when the interpreter clears `has_reservation` due to a + /// non-`stwcx.` event (context switch, exception, etc.). Safe to + /// call when the table doesn't hold our reservation anymore (the + /// CAS simply fails). + pub fn release(&self, addr: u32, my_gen: u32, my_hw_id: u8) { + let _ = self.try_commit(addr, my_gen, my_hw_id); + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::sync::Arc; + use std::thread; + + #[test] + fn pack_unpack_roundtrip() { + let raw = pack(0x1000_0000, 42, 5); + let (line, generation, hw) = unpack(raw).unwrap(); + assert_eq!(line, 0x1000_0000); + assert_eq!(generation, 42); + assert_eq!(hw, 5); + } + + #[test] + fn unpack_zero_is_none() { + assert!(unpack(0).is_none()); + } + + #[test] + fn reserve_then_commit_succeeds() { + let t = ReservationTable::new(); + let gn = t.reserve(0x1234, 0); + assert!(t.try_commit(0x1234, gn, 0)); + // Already released — second commit fails. + assert!(!t.try_commit(0x1234, gn, 0)); + } + + #[test] + fn other_hw_id_cannot_commit() { + let t = ReservationTable::new(); + let gn = t.reserve(0x1234, 0); + assert!( + !t.try_commit(0x1234, gn, 1), + "stwcx. from a different hw_id must fail" + ); + // Original owner can still commit. + assert!(t.try_commit(0x1234, gn, 0)); + } + + #[test] + fn lwarx_displaces_prior_reservation() { + let t = ReservationTable::new(); + let g0 = t.reserve(0x1234, 0); + // Different HW thread's lwarx on the same line. + let g1 = t.reserve(0x1234, 1); + // Original reserver's stwcx. fails because the gen changed. + assert!(!t.try_commit(0x1234, g0, 0)); + // New reserver's stwcx. succeeds. + assert!(t.try_commit(0x1234, g1, 1)); + } + + #[test] + fn invalidate_clears_matching_reservation() { + let t = ReservationTable::new(); + let gn = t.reserve(0x1234, 0); + t.invalidate_for_write(0x1238); // same line as 0x1234 + assert!(!t.try_commit(0x1234, gn, 0)); + assert_eq!(t.active_reservers.load(Ordering::Relaxed), 0); + } + + #[test] + fn invalidate_different_line_in_same_bank_is_noop() { + let t = ReservationTable::new(); + // Force a hash collision: addr A and addr B with same hash but + // different line addresses. + let line_a = 0x0000_1000; + let line_b = line_a + ((NUM_LINES as u32) << 7); // +0x80000 → same hash + assert_eq!(hash(line_a), hash(line_b)); + let gn = t.reserve(line_a, 0); + // Invalidating line_b must NOT clear line_a's reservation. + t.invalidate_for_write(line_b); + assert!(t.try_commit(line_a, gn, 0)); + } + + #[test] + fn has_active_reservers_tracks_count() { + let t = ReservationTable::new(); + assert!(!t.has_active_reservers()); + let g0 = t.reserve(0x1000, 0); + assert!(t.has_active_reservers()); + let g1 = t.reserve(0x2000, 1); + assert!(t.has_active_reservers()); + t.try_commit(0x1000, g0, 0); + assert!(t.has_active_reservers()); + t.try_commit(0x2000, g1, 1); + assert!(!t.has_active_reservers()); + } + + /// Stress test: 8 host threads each loop reserve+stwcx on the same + /// line. Exactly one stwcx per round can win; the others fail and + /// retry. The total number of *successful* commits across N + /// outer iterations equals N (one winner per round). + /// + /// This proves the table's mutual-exclusion property: at most one + /// thread's stwcx. on a given line can succeed between two events + /// that would invalidate the line. + #[test] + fn concurrent_lwarx_stwcx_serializes() { + let t = Arc::new(ReservationTable::new()); + const ROUNDS: u32 = 1000; + const THREADS: u8 = 8; + let total_successes = Arc::new(AtomicU64::new(0)); + + let mut handles = Vec::new(); + for hw_id in 0..THREADS { + let t_clone = t.clone(); + let s_clone = total_successes.clone(); + handles.push( + thread::Builder::new() + .name(format!("res-stress-{hw_id}")) + .spawn(move || { + let mut wins = 0u64; + for _ in 0..ROUNDS { + let gn = t_clone.reserve(0x1234_5678, hw_id); + if t_clone.try_commit(0x1234_5678, gn, hw_id) { + wins += 1; + } + } + s_clone.fetch_add(wins, Ordering::Relaxed); + }) + .expect("spawn"), + ); + } + for h in handles { + h.join().expect("join"); + } + let total = total_successes.load(Ordering::Relaxed); + // Lower bound: every round had at least one winner — but races + // can cause some rounds to have zero (all threads' reservations + // got displaced before any could commit). Assert progress: at + // least 10% of attempts succeed, and active_reservers is back + // to zero. + let attempts = ROUNDS as u64 * THREADS as u64; + assert!( + total > attempts / 10, + "expected at least 10% successful commits, got {total}/{attempts}" + ); + assert_eq!( + t.active_reservers.load(Ordering::Relaxed), + 0, + "all reservations should have been resolved" + ); + } +} diff --git a/crates/xenia-cpu/src/scheduler.rs b/crates/xenia-cpu/src/scheduler.rs new file mode 100644 index 0000000..1f1d68a --- /dev/null +++ b/crates/xenia-cpu/src/scheduler.rs @@ -0,0 +1,1919 @@ +//! Round-robin scheduler over 6 HW threads with per-slot runqueues. +//! +//! Execution is serialized on a single host thread (the interpreter thread; +//! `GuestMemory` is pinned and deliberately not thread-safe). The scheduler +//! is a pure data container — kernel code parks, wakes, and mutates state +//! through its public methods; it knows nothing about kernel objects. +//! +//! ## Model (post-Axis-1) +//! +//! - `HW_THREAD_COUNT = 6`, matching real Xenon hardware (3 cores × 2 SMT). +//! - Each `HwSlot` carries a runqueue `Vec` — any state, +//! `pick_runnable` filters Ready/ServicingIrq when choosing the live thread. +//! - A `GuestThread` owns its own `PpcContext` inline. The live register +//! file is always whichever thread the slot has pinned as running — no +//! memcpy on context switch. +//! - `ThreadRef { hw_id, idx }` is the stable identity used in waiter lists +//! and anywhere a specific thread needs to be addressed across slot +//! boundaries. Positional refs are cheap but **must** be fixed up after +//! `swap_remove` (Axis 4 affinity migration does this explicitly). +//! +//! Every scheduler round: for each slot with a runnable thread, pick the +//! highest-priority Ready thread and advance it one guest instruction (or +//! one import-thunk dispatch). Blocked/Exited threads stay resident in the +//! runqueue so their `ThreadRef` doesn't shift under kernel waiter lists. + +use crate::context::PpcContext; + +/// Number of emulated HW threads. Real Xbox 360 Xenon = 3 cores × 2 SMT = 6. +pub const HW_THREAD_COUNT: usize = 6; + +/// Guest thread id assigned to the initial (module-entry) guest thread. +pub const INITIAL_GUEST_TID: u32 = 1; + +/// Default per-thread instruction quantum. Consumed by Axis 3 (`decrement_quantum`); +/// Axis 1 carries the field on every thread but doesn't decrement yet. +pub const QUANTUM_DEFAULT: u32 = 50_000; + +/// Above this depth, `spawn` prunes `Exited` entries from a slot's runqueue +/// before pushing the new thread. Keeps peer `ThreadRef`s stable on the +/// common (low-depth) path — a game that spawns a handful of long-lived +/// workers never triggers a compaction; a game that rapidly churns threads +/// gets one when the slot fills up. +const PRUNE_DEPTH_THRESHOLD: usize = 4; + +/// Stable identity for a guest thread across all scheduler tables. +/// +/// The positional `idx` is only valid while the source slot's runqueue +/// has not been mutated by a `swap_remove`. All sites that do so (Axis 4 +/// affinity migration, `prune_exited`) must fix up every `ThreadRef` they +/// invalidate. +/// +/// **M2.3 generation packing.** Under M3's per-HW-thread parallelism, an +/// `idx` reused after `swap_remove` could match a stale `ThreadRef` held +/// in another thread's waiter list (the classic ABA hazard). The +/// `generation` byte distinguishes such reuses. M2 introduces the field +/// (set to `0` on fresh spawns) without yet bumping it — no concurrent +/// remove paths exist before M3. The migration-fixup site at +/// [`MigrationFixup::apply`] will bump generations once M3 lands. +/// +/// Layout: 1 + 1 + 2 = 4 bytes (no padding). 256 reuses per slot before +/// wraparound; with `PRUNE_DEPTH_THRESHOLD = 4` keeping slots shallow, +/// that is well above any realistic churn rate. +#[derive(Copy, Clone, Debug, Eq, PartialEq, Hash, Default)] +pub struct ThreadRef { + pub hw_id: u8, + pub generation: u8, + pub idx: u16, +} + +impl ThreadRef { + /// Construct a `ThreadRef` with `generation = 0`. Used by every + /// fresh-spawn / re-bind site that doesn't track generations + /// directly. Sites that DO track generations (the migration-fixup + /// path under M3) construct via struct literals so they're greppable. + pub const fn new(hw_id: u8, idx: u16) -> Self { + Self { + hw_id, + idx, + generation: 0, + } + } + + /// Construct a `ThreadRef` with an explicit generation. Used by the + /// migration-fixup path at M3. + pub const fn with_generation(hw_id: u8, idx: u16, generation: u8) -> Self { + Self { + hw_id, + idx, + generation, + } + } +} + +/// A guest thread and everything needed to schedule, park, and wake it. +pub struct GuestThread { + pub ctx: PpcContext, + pub state: HwState, + pub tid: u32, + pub thread_handle: Option, + pub stack_base: u32, + pub stack_size: u32, + pub pcr_base: u32, + pub tls_base: u32, + /// Per-thread TLS slot values; `KeTlsGetValue/SetValue` route through + /// `Scheduler::tls_{get,set}` which index this on the currently-running thread. + pub tls_values: Vec, + /// Suspend counter — `NtSuspendThread` increments, `NtResumeThread` + /// decrements, only unblocks at zero. + pub suspend_count: u32, + /// NT-style priority, signed. Higher wins within a slot. Default 0. + pub priority: i32, + /// Set bit i = thread may run on slot i. 0 normalizes to 0xFF (any). + pub affinity_mask: u8, + /// Hint from `KeSetIdealProcessor`. Axis 5 honors on spawn; Axis 1 + /// carries the field. + pub ideal_processor: Option, + /// Axis 3 instruction budget. Decremented per retired step on this + /// thread; on zero, slot rotates within same-priority tier. + pub quantum_remaining: u32, +} + +impl GuestThread { + fn default_fields() -> Self { + Self { + ctx: PpcContext::new(), + state: HwState::Idle, + tid: 0, + thread_handle: None, + stack_base: 0, + stack_size: 0, + pcr_base: 0, + tls_base: 0, + tls_values: Vec::new(), + suspend_count: 0, + priority: 0, + affinity_mask: 0xFF, + ideal_processor: None, + quantum_remaining: QUANTUM_DEFAULT, + } + } +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum HwState { + /// Slot slot has no running thread (used only for `HwSlot::idle_ctx`'s + /// conceptual state — live threads never sit in `Idle`). + Idle, + Ready, + Blocked(BlockReason), + Exited(u32), + /// Graphics-interrupt servicing state. The thread was + /// `Blocked(reason)` when an IRQ was injected; we flipped it to + /// `ServicingIrq(reason)` so the scheduler will run the callback, + /// carrying the prior block reason for the IRQ-return path to consult. + /// On return to `LR_HALT_SENTINEL` the main loop restores to + /// `Blocked(reason)` — **unless** something during the callback + /// (e.g. `KeSetEvent → wake`) flipped this to `Ready`, in which case + /// the wait was resolved and we leave it runnable. + ServicingIrq(BlockReason), +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum BlockReason { + Suspended, + WaitAny { + handles: Vec, + deadline: Option, + }, + WaitAll { + handles: Vec, + deadline: Option, + }, + DelayUntil(u64), + CriticalSection(u32), +} + +/// Sink for PCR+0x2C writes — the scheduler writes the guest-visible +/// current-processor-id here at spawn and Axis 4 rewrites on affinity +/// migration. Implemented by `xenia-kernel` for `GuestMemory`; keeping it +/// an abstract trait avoids pulling `xenia_memory` into `xenia_cpu`. +pub trait PcrWriter { + fn write_pcr_id(&mut self, pcr_base: u32, hw_id: u8); +} + +/// Per-slot runqueue + the index of the thread currently pinned-running. +pub struct HwSlot { + pub runqueue: Vec, + pub running_idx: Option, + /// Sentinel context returned by compat accessors when the slot has no + /// running thread. Keeps the `ctx(hw_id)` API safe from diagnostic + /// paths that run between scheduling passes. + idle_ctx: PpcContext, + /// Same-shape sentinel state. + idle_state: HwState, +} + +impl Default for HwSlot { + fn default() -> Self { + Self { + runqueue: Vec::new(), + running_idx: None, + idle_ctx: PpcContext::new(), + idle_state: HwState::Idle, + } + } +} + +impl HwSlot { + /// Index of the highest-priority Ready/ServicingIrq thread in this + /// slot's runqueue. Tiebreak: prefer lower index (deterministic). + pub fn pick_runnable(&self) -> Option { + self.runqueue + .iter() + .enumerate() + .filter(|(_, t)| matches!(t.state, HwState::Ready | HwState::ServicingIrq(_))) + .max_by_key(|(i, t)| (t.priority, -(*i as i64))) + .map(|(i, _)| i) + } + + /// How many non-Exited threads currently live on this slot (used by + /// placement policies). + pub fn live_depth(&self) -> usize { + self.runqueue + .iter() + .filter(|t| !matches!(t.state, HwState::Exited(_))) + .count() + } + +} + +#[derive(Debug, Clone, Copy)] +pub enum OrderMode { + Fixed, + Seeded { seed: u64 }, +} + +impl OrderMode { + pub fn from_env() -> Self { + match std::env::var("XENIA_SCHED_ORDER").ok().as_deref() { + Some("random") | Some("Random") | Some("RANDOM") => { + let seed = std::env::var("XENIA_SCHED_SEED") + .ok() + .and_then(|s| s.parse::().ok()) + .unwrap_or(0xC0FFEE_C0FFEE); + OrderMode::Seeded { seed } + } + _ => OrderMode::Fixed, + } + } +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum RoundOutcome { + Progressed, + Slept, + Deadlock, + MainExited, +} + +/// Parameters for `Scheduler::spawn`. The caller allocates the stack/PCR/ +/// TLS blocks in guest memory first, then hands these addresses here. +#[derive(Debug)] +pub struct SpawnParams { + pub entry: u32, + pub start_context: u32, + pub stack_base: u32, + pub stack_size: u32, + pub pcr_base: u32, + pub tls_base: u32, + pub thread_handle: u32, + pub guest_tid: u32, + pub create_suspended: bool, + pub is_initial: bool, + pub tls_slot_count: u32, + /// Set bit i = thread may land on slot i. 0 normalizes to 0xFF. + pub affinity_mask: u8, + /// NT-style signed priority; default 0. + pub priority: i32, + /// Preferred slot; Axis 5 spawn honors if allowed by affinity mask. + pub ideal_processor: Option, +} + +impl Default for SpawnParams { + fn default() -> Self { + Self { + entry: 0, + start_context: 0, + stack_base: 0, + stack_size: 0, + pcr_base: 0, + tls_base: 0, + thread_handle: 0, + guest_tid: 0, + create_suspended: false, + is_initial: false, + tls_slot_count: 0, + affinity_mask: 0xFF, + priority: 0, + ideal_processor: None, + } + } +} + +#[derive(Debug)] +pub enum SpawnError { + NoFreeHwThread, +} + +/// Side information returned by `set_affinity_ref` so the kernel layer +/// can walk its waiter lists and retarget any `ThreadRef`s invalidated +/// by the `swap_remove` on the source slot. +#[derive(Debug, Copy, Clone)] +pub struct MigrationFixup { + pub source_hw: u8, + pub promoted_old_idx: u16, + pub promoted_new_idx: u16, + pub migrated_old_ref: ThreadRef, + pub migrated_new_ref: ThreadRef, +} + +impl MigrationFixup { + /// Apply the fixup to a single `ThreadRef` reference. Idempotent. + pub fn apply(&self, r: &mut ThreadRef) { + if *r == self.migrated_old_ref { + *r = self.migrated_new_ref; + } else if r.hw_id == self.source_hw && r.idx == self.promoted_old_idx { + r.idx = self.promoted_new_idx; + } + } +} + +pub struct Scheduler { + pub slots: [HwSlot; HW_THREAD_COUNT], + pub round_count: u64, + /// Currently-stepping thread. Set by `begin_slot_visit`, cleared by + /// `end_slot_visit`. Kernel exports reach through this to learn which + /// thread they're running on. + pub current: Option, + order: OrderMode, + rng_state: u64, + /// Sorted by deadline ascending. Scheduler wakes the first entry via + /// `advance_to_next_wake` when a round finds nothing runnable. + timed_waits: Vec<(u64, ThreadRef)>, + /// Global count of TLS slots allocated — `spawn` pre-sizes new threads' + /// `tls_values` to this. + tls_slot_count: usize, + /// Axis 2: bit i set ⇒ slot i has at least one Ready/ServicingIrq + /// thread. `round_schedule` uses this to skip empty slots cheaply; + /// maintained by state-mutating methods via `recompute_slot_runnable`. + non_empty_runnable: u8, + /// Axis 2: rolling round-robin cursor. Each `round_schedule` call + /// emits slot ids starting at `(rotation_cursor + i) % 6`, then + /// advances the cursor by one so the next round begins from the + /// following slot. Guarantees every non-empty slot gets an equal + /// share of round leads over time. + rotation_cursor: u8, + /// M3.7 — optional reservation table installed by the kernel after + /// scheduler construction. When present, [`Self::spawn`] and + /// [`Self::install_initial_thread`] populate each `PpcContext`'s + /// `reservation_table` field so the interpreter's `lwarx`/`stwcx.` + /// arms can route through the table. + reservation_table: Option>, +} + +impl Scheduler { + /// Build a scheduler with all slots empty. Callers (usually + /// `KernelState::install_initial_thread`) push the initial guest + /// thread onto slot 0 before stepping. + pub fn new() -> Self { + let order = OrderMode::from_env(); + let rng_state = match order { + OrderMode::Fixed => 0, + OrderMode::Seeded { seed } => seed.max(1), + }; + Scheduler { + slots: std::array::from_fn(|_| HwSlot::default()), + round_count: 0, + current: None, + order, + rng_state, + timed_waits: Vec::new(), + tls_slot_count: 0, + non_empty_runnable: 0, + rotation_cursor: 0, + reservation_table: None, + } + } + + /// M3.7 — install a shared reservation table. Subsequent + /// `spawn`/`install_initial_thread` calls will populate each + /// `PpcContext::reservation_table` with a clone. Idempotent; + /// passing `None` clears the binding (existing threads keep their + /// previously-cloned Arcs). + pub fn set_reservation_table( + &mut self, + table: Option>, + ) { + self.reservation_table = table; + } + + /// Recompute the runnable bit for one slot. Cheap — scans the slot's + /// runqueue once. Call at the tail of any method that may change + /// whether the slot has a Ready/ServicingIrq member. + fn recompute_slot_runnable(&mut self, hw_id: u8) { + let any = self.slots[hw_id as usize] + .runqueue + .iter() + .any(|t| matches!(t.state, HwState::Ready | HwState::ServicingIrq(_))); + if any { + self.non_empty_runnable |= 1 << hw_id; + } else { + self.non_empty_runnable &= !(1 << hw_id); + } + } + + // ----- Compat accessors (preserve the pre-Axis-1 hw_threads[i].ctx pattern) ----- + + /// Read-only context of the currently-running thread on `hw_id`. + pub fn ctx(&self, hw_id: u8) -> &PpcContext { + let slot = &self.slots[hw_id as usize]; + match slot.running_idx { + Some(i) if i < slot.runqueue.len() => &slot.runqueue[i].ctx, + _ => &slot.idle_ctx, + } + } + + /// Mutable context of the currently-running thread on `hw_id`. + pub fn ctx_mut(&mut self, hw_id: u8) -> &mut PpcContext { + let slot = &mut self.slots[hw_id as usize]; + match slot.running_idx { + Some(i) if i < slot.runqueue.len() => &mut slot.runqueue[i].ctx, + _ => &mut slot.idle_ctx, + } + } + + /// Mutable context addressed by `ThreadRef` — bypasses `running_idx` + /// so callers (deadlock-recovery, `call_export` return, Axis 4 + /// migration) can touch a specific thread even when it isn't the one + /// the slot has pinned. + pub fn ctx_mut_ref(&mut self, r: ThreadRef) -> &mut PpcContext { + &mut self.slots[r.hw_id as usize].runqueue[r.idx as usize].ctx + } + + pub fn state(&self, hw_id: u8) -> &HwState { + let slot = &self.slots[hw_id as usize]; + match slot.running_idx { + Some(i) if i < slot.runqueue.len() => &slot.runqueue[i].state, + _ => &slot.idle_state, + } + } + + pub fn state_mut(&mut self, hw_id: u8) -> &mut HwState { + let slot = &mut self.slots[hw_id as usize]; + match slot.running_idx { + Some(i) if i < slot.runqueue.len() => &mut slot.runqueue[i].state, + _ => &mut slot.idle_state, + } + } + + pub fn tid(&self, hw_id: u8) -> Option { + let slot = &self.slots[hw_id as usize]; + slot.running_idx.and_then(|i| slot.runqueue.get(i).map(|t| t.tid)) + } + + pub fn thread_handle(&self, hw_id: u8) -> Option { + let slot = &self.slots[hw_id as usize]; + slot.running_idx + .and_then(|i| slot.runqueue.get(i).and_then(|t| t.thread_handle)) + } + + pub fn tls_values(&self, hw_id: u8) -> Option<&Vec> { + let slot = &self.slots[hw_id as usize]; + slot.running_idx.and_then(|i| slot.runqueue.get(i).map(|t| &t.tls_values)) + } + + pub fn suspend_count_mut(&mut self, hw_id: u8) -> Option<&mut u32> { + let slot = &mut self.slots[hw_id as usize]; + match slot.running_idx { + Some(i) if i < slot.runqueue.len() => Some(&mut slot.runqueue[i].suspend_count), + _ => None, + } + } + + /// Compat: most pre-Axis-1 code reaches for `current_hw_id` as an + /// `Option`. We keep it as a method that derives from `current`. + #[inline] + pub fn current_hw_id(&self) -> Option { + self.current.map(|r| r.hw_id) + } + + /// Panics if called outside a step. + #[inline] + pub fn current(&self) -> u8 { + self.current.expect("no current thread").hw_id + } + + /// Panics if called outside a step. + #[inline] + pub fn current_ref(&self) -> ThreadRef { + self.current.expect("no current thread") + } + + // ----- Guest-thread lookup ----- + + /// Find the `ThreadRef` of the (non-Exited) thread with `tid`. + pub fn find_by_tid(&self, tid: u32) -> Option { + for (hw_id, slot) in self.slots.iter().enumerate() { + for (idx, t) in slot.runqueue.iter().enumerate() { + if t.tid == tid && !matches!(t.state, HwState::Exited(_)) { + return Some(ThreadRef::new(hw_id as u8, idx as u16)); + } + } + } + None + } + + /// Find the `ThreadRef` of the (non-Exited) thread with `thread_handle`. + pub fn find_by_handle(&self, handle: u32) -> Option { + for (hw_id, slot) in self.slots.iter().enumerate() { + for (idx, t) in slot.runqueue.iter().enumerate() { + if t.thread_handle == Some(handle) + && !matches!(t.state, HwState::Exited(_)) + { + return Some(ThreadRef::new(hw_id as u8, idx as u16)); + } + } + } + None + } + + /// Thread pointer addressed by ThreadRef. Panics if the ref is out of + /// bounds — only call with refs sourced from a live scheduler lookup + /// (`find_by_*`, `current`). + pub fn thread(&self, r: ThreadRef) -> &GuestThread { + &self.slots[r.hw_id as usize].runqueue[r.idx as usize] + } + + pub fn thread_mut(&mut self, r: ThreadRef) -> &mut GuestThread { + &mut self.slots[r.hw_id as usize].runqueue[r.idx as usize] + } + + /// Bounds-checked variant for code paths that accept potentially-stale + /// refs from external storage (waiter lists that may survive a slot + /// compaction, test fixtures). Returns None on out-of-bounds. + pub fn try_thread_mut(&mut self, r: ThreadRef) -> Option<&mut GuestThread> { + self.slots + .get_mut(r.hw_id as usize) + .and_then(|slot| slot.runqueue.get_mut(r.idx as usize)) + } + + // ----- Spawn ----- + + /// Install a new guest thread on an affinity-permitted slot with the + /// lowest live depth. Writes `PCR+0x2C = hw_id` via `mem`. Returns + /// the assigned `hw_id`. + /// + /// Initial threads land on slot 0 (hardware convention). + pub fn spawn( + &mut self, + params: SpawnParams, + mem: &mut W, + ) -> Result { + let mask = if params.affinity_mask == 0 { + 0xFF + } else { + params.affinity_mask + }; + + // Axis 5: placement order — initial always slot 0; explicit + // ideal (if the mask allows it) wins; otherwise least-depth + // among mask-allowed slots. + let slot_id: u8 = if params.is_initial { + 0 + } else if let Some(ideal) = params.ideal_processor + && (mask & (1u8 << ideal)) != 0 + { + ideal + } else { + self.pick_least_depth_slot(mask) + .ok_or(SpawnError::NoFreeHwThread)? + }; + + // Compact Exited entries if this slot is approaching saturation. + // Only safe to do when no ThreadRef outside of the scheduler is + // currently held to exited entries on this slot — kernel waiter + // lists drop refs when wake fires, and Exited threads are never + // picked for stepping, so compaction is a no-op for live peers. + self.prune_exited_if_needed(slot_id); + + let mut t = GuestThread::default_fields(); + t.ctx.pc = params.entry; + let sp_top = (params.stack_base as u64).saturating_add(params.stack_size as u64); + t.ctx.gpr[1] = sp_top.saturating_sub(0x100) & !0xFu64; + t.ctx.gpr[2] = 0x2000_0000; + t.ctx.gpr[3] = params.start_context as u64; + t.ctx.gpr[13] = params.pcr_base as u64; + t.ctx.msr = 0x9030; + t.ctx.thread_id = params.guest_tid; + t.tid = params.guest_tid; + t.thread_handle = Some(params.thread_handle); + t.state = if params.create_suspended { + HwState::Blocked(BlockReason::Suspended) + } else { + HwState::Ready + }; + t.stack_base = params.stack_base; + t.stack_size = params.stack_size; + t.pcr_base = params.pcr_base; + t.tls_base = params.tls_base; + let tls_count = params.tls_slot_count as usize; + let tls_count = tls_count.max(self.tls_slot_count); + t.tls_values = vec![0; tls_count]; + t.suspend_count = if params.create_suspended { 1 } else { 0 }; + t.priority = params.priority; + t.affinity_mask = mask; + t.ideal_processor = params.ideal_processor; + // M3.7 — populate the inter-thread reservation handle + slot id + // so the interpreter can route lwarx/stwcx through the table. + t.ctx.hw_id = slot_id; + t.ctx.reservation_table = self.reservation_table.clone(); + + self.slots[slot_id as usize].runqueue.push(t); + mem.write_pcr_id(params.pcr_base, slot_id); + self.recompute_slot_runnable(slot_id); + + tracing::info!( + "spawn: tid={} on hw={} entry={:#010x} start_ctx={:#010x} suspended={} pri={} mask={:#04x}", + params.guest_tid, + slot_id, + params.entry, + params.start_context, + params.create_suspended, + params.priority, + mask, + ); + Ok(slot_id) + } + + /// Install the initial (module-entry) guest thread on slot 0 with an + /// externally-prepared register file. Unlike `spawn`, this does not + /// reset ctx — the app has already set up MSR, r1/r13/etc. for the + /// XEX bootstrap. + pub fn install_initial_thread( + &mut self, + ctx: PpcContext, + stack_base: u32, + stack_size: u32, + pcr_base: u32, + tls_base: u32, + thread_handle: u32, + mem: &mut W, + ) { + let mut t = GuestThread::default_fields(); + t.ctx = ctx; + // M3.7 — initial thread on slot 0; same wiring as `spawn`. + t.ctx.hw_id = 0; + t.ctx.reservation_table = self.reservation_table.clone(); + t.state = HwState::Ready; + t.tid = INITIAL_GUEST_TID; + t.thread_handle = Some(thread_handle); + t.stack_base = stack_base; + t.stack_size = stack_size; + t.pcr_base = pcr_base; + t.tls_base = tls_base; + t.tls_values = vec![0; self.tls_slot_count]; + self.slots[0].runqueue.push(t); + mem.write_pcr_id(pcr_base, 0); + self.recompute_slot_runnable(0); + } + + /// Pick the slot with the smallest `live_depth` whose bit is set in + /// `mask`. Returns `None` only when `mask == 0` (malformed). + pub fn pick_least_depth_slot(&self, mask: u8) -> Option { + if mask == 0 { + return None; + } + (0..HW_THREAD_COUNT as u8) + .filter(|i| mask & (1 << i) != 0) + .min_by_key(|i| self.slots[*i as usize].live_depth()) + } + + /// Remove `Exited` entries from `slot_id`'s runqueue, but only when the + /// runqueue is deep enough that compaction is worthwhile. Because + /// `swap_remove` shifts indices, this is the only legal way to drop + /// entries — and it can invalidate outstanding `ThreadRef`s to the + /// affected slot. Callers are responsible for ensuring no live waiter + /// lists hold refs into exited entries (they don't, because waiter + /// wakeup always removes the ref and sets state to Ready before the + /// thread can exit again). + fn prune_exited_if_needed(&mut self, slot_id: u8) { + let slot = &mut self.slots[slot_id as usize]; + if slot.runqueue.len() < PRUNE_DEPTH_THRESHOLD { + return; + } + slot.runqueue + .retain(|t| !matches!(t.state, HwState::Exited(_))); + // running_idx may now be stale. Since we only prune at spawn time + // (not mid-round), and round boundaries re-pick running_idx via + // begin_slot_visit, clearing is safe. + slot.running_idx = None; + self.recompute_slot_runnable(slot_id); + } + + // ----- Round scheduling ----- + + /// Axis 2: emit slot ids with at least one runnable thread, starting + /// from `rotation_cursor` and cycling forward. `non_empty_runnable` is + /// the fast path — zero bits mean no slot has work and the caller + /// falls through to `advance_to_next_wake`. + pub fn round_schedule(&mut self) -> Vec { + if self.non_empty_runnable == 0 { + return Vec::new(); + } + let start = self.rotation_cursor as usize; + let mut out: Vec = Vec::with_capacity(HW_THREAD_COUNT); + for off in 0..HW_THREAD_COUNT { + let i = (start + off) % HW_THREAD_COUNT; + if self.non_empty_runnable & (1 << i) != 0 { + out.push(i as u8); + } + } + // Seeded mode layers a deterministic shuffle on top of the + // already-filtered list. Same spawn/wake sequence + same seed ⇒ + // same schedule (invariant preserved from pre-Axis-1). + if let OrderMode::Seeded { .. } = self.order { + for i in (1..out.len()).rev() { + self.rng_state ^= self.rng_state << 13; + self.rng_state ^= self.rng_state >> 7; + self.rng_state ^= self.rng_state << 17; + let j = (self.rng_state as usize) % (i + 1); + out.swap(i, j); + } + } + self.rotation_cursor = ((start + 1) % HW_THREAD_COUNT) as u8; + out + } + + pub fn begin_round(&mut self) { + self.round_count += 1; + } + + /// Called by the step loop at the top of each per-slot visit. Picks the + /// highest-priority Ready thread on the slot, sets `running_idx`, and + /// stashes `self.current` so exports can reach it. + pub fn begin_slot_visit(&mut self, hw_id: u8) { + let slot = &mut self.slots[hw_id as usize]; + slot.running_idx = slot.pick_runnable(); + self.current = slot + .running_idx + .map(|idx| ThreadRef::new(hw_id, idx as u16)); + } + + /// Clear `current` at the end of each per-slot visit. + pub fn end_slot_visit(&mut self) { + self.current = None; + } + + /// Axis 3: decrement the currently-running thread's instruction + /// quantum. On reach-zero, reload to `QUANTUM_DEFAULT` and rotate + /// `running_idx` to the next Ready thread on this slot that sits in + /// the same priority tier (hand-off preserves priority ordering). + /// The flip is observed by the *next* round's `begin_slot_visit` — + /// the step that just completed has already returned, so there's no + /// mid-instruction preemption hazard. + /// + /// Returns `true` if a rotation occurred (purely informational; + /// callers don't need to act on it). + pub fn decrement_quantum(&mut self) -> bool { + let Some(r) = self.current else { return false; }; + let slot = &mut self.slots[r.hw_id as usize]; + let Some(t) = slot.runqueue.get_mut(r.idx as usize) else { + return false; + }; + if t.quantum_remaining > 0 { + t.quantum_remaining -= 1; + } + if t.quantum_remaining != 0 { + return false; + } + let my_pri = t.priority; + t.quantum_remaining = QUANTUM_DEFAULT; + // Scan the rest of the runqueue for a same-priority Ready peer. + // Priority-higher peers are already going to win the next + // `pick_runnable` on this slot, so we only need to find an *equal* + // priority peer to enforce fair rotation within the tier. + let len = slot.runqueue.len(); + if len < 2 { + return false; + } + let start = (r.idx as usize + 1) % len; + for off in 0..len { + let i = (start + off) % len; + if i == r.idx as usize { + continue; + } + let cand = &slot.runqueue[i]; + if cand.priority == my_pri && matches!(cand.state, HwState::Ready) { + slot.running_idx = Some(i); + self.current = Some(ThreadRef::new(r.hw_id, i as u16)); + return true; + } + } + false + } + + // ----- Park / wake / exit ----- + + pub fn park_current(&mut self, reason: BlockReason) { + let r = self + .current + .expect("park_current called outside a step"); + let deadline = match &reason { + BlockReason::WaitAny { deadline, .. } | BlockReason::WaitAll { deadline, .. } => { + *deadline + } + BlockReason::DelayUntil(d) => Some(*d), + _ => None, + }; + if let Some(d) = deadline { + self.timed_waits.push((d, r)); + self.timed_waits.sort_by_key(|&(d, _)| d); + } + self.thread_mut(r).state = HwState::Blocked(reason); + self.recompute_slot_runnable(r.hw_id); + } + + /// Wake a specific thread (must be Blocked or ServicingIrq). Silently + /// no-ops on out-of-bounds refs — waiter lists are positional and may + /// outlive their target after a slot compaction; in debug builds we + /// warn so regressions of this class surface during development. + pub fn wake_ref(&mut self, r: ThreadRef) { + let Some(slot) = self.slots.get_mut(r.hw_id as usize) else { + debug_assert!(false, "wake_ref: hw_id out of bounds: {:?}", r); + return; + }; + let Some(t) = slot.runqueue.get_mut(r.idx as usize) else { + // Stale waiter ref — expected under normal operation when a + // waiter was enqueued from a test fixture or survived a slot + // compaction. Warn in debug builds. + #[cfg(debug_assertions)] + tracing::debug!("wake_ref: idx out of bounds: {:?}", r); + return; + }; + match &t.state { + HwState::Blocked(_) | HwState::ServicingIrq(_) => {} + _ => return, + } + t.state = HwState::Ready; + t.quantum_remaining = QUANTUM_DEFAULT; + self.timed_waits.retain(|&(_, tr)| tr != r); + self.recompute_slot_runnable(r.hw_id); + } + + /// Axis-4-friendly variant: look up the thread holding `handle` and wake it. + pub fn wake_by_handle(&mut self, handle: u32) -> Option { + let r = self.find_by_handle(handle)?; + self.wake_ref(r); + Some(r) + } + + /// Decrement suspend count on target; if it reaches 0, unblock. + /// Returns previous count. + pub fn resume_ref(&mut self, r: ThreadRef) -> u32 { + let t = &mut self.slots[r.hw_id as usize].runqueue[r.idx as usize]; + let prev = t.suspend_count; + if t.suspend_count > 0 { + t.suspend_count -= 1; + } + if t.suspend_count == 0 && matches!(t.state, HwState::Blocked(BlockReason::Suspended)) { + t.state = HwState::Ready; + t.quantum_remaining = QUANTUM_DEFAULT; + } + self.recompute_slot_runnable(r.hw_id); + prev + } + + pub fn suspend_ref(&mut self, r: ThreadRef) -> u32 { + let t = &mut self.slots[r.hw_id as usize].runqueue[r.idx as usize]; + let prev = t.suspend_count; + t.suspend_count += 1; + if matches!(t.state, HwState::Ready) { + t.state = HwState::Blocked(BlockReason::Suspended); + } + self.recompute_slot_runnable(r.hw_id); + prev + } + + /// Set base priority; returns prior value. + pub fn set_priority_ref(&mut self, r: ThreadRef, priority: i32) -> i32 { + let t = self.thread_mut(r); + let prev = t.priority; + t.priority = priority; + prev + } + + pub fn priority_ref(&self, r: ThreadRef) -> i32 { + self.thread(r).priority + } + + /// Axis 5: `KeSetIdealProcessor` — store the hint (does NOT migrate + /// a live thread; purely advisory for subsequent wake decisions, + /// which our cooperative scheduler doesn't currently consult — and + /// as spawn placement for any newly-created sibling threads). + /// Returns previous ideal (or 0xFF if unset). + pub fn set_ideal_ref(&mut self, r: ThreadRef, ideal: u8) -> u8 { + let t = self.thread_mut(r); + let prev = t.ideal_processor.unwrap_or(0xFF); + t.ideal_processor = Some(ideal); + prev + } + + pub fn ideal_ref(&self, r: ThreadRef) -> Option { + self.thread(r).ideal_processor + } + + /// Axis 4: Set the affinity mask on `r` and migrate between slot + /// runqueues if the current slot is no longer allowed by the mask. + /// Returns `(old_mask, new_ref, migration_info)`. `migration_info` is + /// `None` when no migration happened, `Some((src_promoted_old_idx, + /// src_promoted_new_idx))` when `swap_remove` moved a peer into the + /// migrated thread's slot — the caller must walk external waiter + /// containers and retarget any ref matching the promoted-old slot. + /// + /// `mask == 0` normalizes to `0xFF` (Canary parity: early Xbox code + /// sometimes passes 0 meaning "any"). + pub fn set_affinity_ref( + &mut self, + r: ThreadRef, + new_mask: u8, + mem: &mut W, + ) -> (u8, ThreadRef, Option) { + let old_mask = self.thread(r).affinity_mask; + let effective = if new_mask == 0 { 0xFF } else { new_mask }; + self.thread_mut(r).affinity_mask = new_mask; + // Current slot still allowed → no migration. + if effective & (1 << r.hw_id) != 0 { + return (old_mask, r, None); + } + // Pick target = least-depth allowed slot. + let target = self + .pick_least_depth_slot(effective) + .expect("set_affinity_ref: effective mask must allow some slot"); + // Physically move the GuestThread struct. + let src_len_before = self.slots[r.hw_id as usize].runqueue.len(); + let promoted_old_idx = (src_len_before - 1) as u16; + let mut thread = self.slots[r.hw_id as usize] + .runqueue + .swap_remove(r.idx as usize); + mem.write_pcr_id(thread.pcr_base, target); + // M3.7 — keep ctx.hw_id in sync with the thread's new slot so + // table-routed lwarx/stwcx use the correct discriminator. + thread.ctx.hw_id = target; + self.slots[target as usize].runqueue.push(thread); + let new_idx = (self.slots[target as usize].runqueue.len() - 1) as u16; + let new_ref = ThreadRef::new(target, new_idx); + // Timed waits: rewrite r → new_ref if present. + for entry in self.timed_waits.iter_mut() { + if entry.1 == r { + entry.1 = new_ref; + } else if entry.1 == ThreadRef::new(r.hw_id, promoted_old_idx) { + entry.1 = ThreadRef::new(r.hw_id, r.idx); + } + } + // Running index defense: if src slot's running_idx pointed at the + // migrated entry or the promoted peer, clear / retarget. + let src_slot = &mut self.slots[r.hw_id as usize]; + if src_slot.running_idx == Some(r.idx as usize) { + src_slot.running_idx = None; + } else if src_slot.running_idx == Some(promoted_old_idx as usize) { + src_slot.running_idx = Some(r.idx as usize); + } + self.recompute_slot_runnable(r.hw_id); + self.recompute_slot_runnable(target); + // If the migrating thread was the currently-running one (self- + // migrating export call), update `self.current` so `call_export`'s + // stashed ThreadRef still resolves on its swap-back path. + if self.current == Some(r) { + self.current = Some(new_ref); + } else if self.current == Some(ThreadRef::new(r.hw_id, promoted_old_idx)) + { + self.current = Some(ThreadRef::new(r.hw_id, r.idx)); + } + // Emit promotion info only if the last-index of the source + // wasn't the migrating thread itself (otherwise swap_remove was + // a plain pop and no peer got promoted). + let fixup = if promoted_old_idx != r.idx { + Some(MigrationFixup { + source_hw: r.hw_id, + promoted_old_idx, + promoted_new_idx: r.idx, + migrated_old_ref: r, + migrated_new_ref: new_ref, + }) + } else { + Some(MigrationFixup { + source_hw: r.hw_id, + promoted_old_idx: r.idx, // no-op promotion + promoted_new_idx: r.idx, + migrated_old_ref: r, + migrated_new_ref: new_ref, + }) + }; + (old_mask, new_ref, fixup) + } + + /// Mark the current thread exited. Returns (hw_id, tid, handle) of + /// the exiting thread so the caller can wake joiners. + pub fn exit_current(&mut self, exit_code: u32) -> (u8, Option, Option) { + let r = self.current.expect("exit_current outside step"); + let t = &mut self.slots[r.hw_id as usize].runqueue[r.idx as usize]; + let tid = Some(t.tid); + let handle = t.thread_handle; + t.state = HwState::Exited(exit_code); + self.timed_waits.retain(|&(_, tr)| tr != r); + self.recompute_slot_runnable(r.hw_id); + (r.hw_id, tid, handle) + } + + // ----- TLS ----- + + /// Allocate a new global TLS slot index. All live threads' `tls_values` + /// vecs grow to match. + pub fn tls_alloc(&mut self) -> u32 { + let idx = self.tls_slot_count as u32; + self.tls_slot_count += 1; + for slot in self.slots.iter_mut() { + for t in slot.runqueue.iter_mut() { + if t.tls_values.len() < self.tls_slot_count { + t.tls_values.resize(self.tls_slot_count, 0); + } + } + } + idx + } + + /// Compat: caller asks for a specific capacity (e.g. spawn's + /// `tls_slot_count`). Grows every thread's tls_values up to `count`. + pub fn tls_grow_to(&mut self, count: usize) { + if count > self.tls_slot_count { + self.tls_slot_count = count; + } + for slot in self.slots.iter_mut() { + for t in slot.runqueue.iter_mut() { + if t.tls_values.len() < count { + t.tls_values.resize(count, 0); + } + } + } + } + + pub fn tls_get(&self, slot_idx: u32) -> u64 { + let r = match self.current { + Some(r) => r, + None => return 0, + }; + self.slots[r.hw_id as usize].runqueue[r.idx as usize] + .tls_values + .get(slot_idx as usize) + .copied() + .unwrap_or(0) + } + + pub fn tls_set(&mut self, slot_idx: u32, value: u64) { + let Some(r) = self.current else { return; }; + let t = &mut self.slots[r.hw_id as usize].runqueue[r.idx as usize]; + let i = slot_idx as usize; + if t.tls_values.len() <= i { + t.tls_values.resize(i + 1, 0); + } + t.tls_values[i] = value; + } + + // ----- Time advance / deadlock ----- + + /// Peek the earliest pending timed-wait deadline without popping. The + /// kernel uses this together with `KernelState::earliest_timer_deadline` + /// to compute the next global time step in the scheduler round. + pub fn earliest_wait_deadline(&self) -> Option { + self.timed_waits.first().map(|&(d, _)| d) + } + + /// Move every thread's timebase up to `deadline` (if already past, + /// leave it alone). Extracted from the old `advance_to_next_wake` + /// body so the kernel can drive time-advance for timer fires in + /// addition to thread wakes. + pub fn advance_all_timebases_to(&mut self, deadline: u64) { + for slot in self.slots.iter_mut() { + for t in slot.runqueue.iter_mut() { + if t.ctx.timebase < deadline { + t.ctx.timebase = deadline; + } + } + } + } + + /// Fast-forward the timebase to the earliest pending timed wait and + /// wake that sleeper. Used when a round had no Ready threads and no + /// timer fires closer than the earliest wait. Returns the woken + /// thread's `ThreadRef` + the `BlockReason` it was parked with, so + /// the caller can stamp `STATUS_TIMEOUT` and scrub stale waiter-list + /// entries via `KernelState::handle_timeout_wake`. `None` means the + /// timed-waits queue was empty. + pub fn advance_to_next_wake(&mut self) -> Option<(ThreadRef, BlockReason)> { + let (deadline, r) = *self.timed_waits.first()?; + self.advance_all_timebases_to(deadline); + self.timed_waits.remove(0); + let t = &mut self.slots[r.hw_id as usize].runqueue[r.idx as usize]; + let reason = match std::mem::replace(&mut t.state, HwState::Ready) { + HwState::Blocked(reason) | HwState::ServicingIrq(reason) => reason, + other => { + // Defensive: the timed_waits entry should only ever track a + // Blocked or ServicingIrq thread, but if some path already + // woke this ref we keep going with a stand-in reason so the + // caller can't miss a timeout-wake follow-up. + tracing::debug!( + hw_id = r.hw_id, + idx = r.idx, + state = ?other, + "advance_to_next_wake: unexpected prior state (ignored)" + ); + BlockReason::Suspended + } + }; + t.quantum_remaining = QUANTUM_DEFAULT; + self.recompute_slot_runnable(r.hw_id); + tracing::info!( + "scheduler: advanced to deadline {} waking hw={} idx={}", + deadline, + r.hw_id, + r.idx + ); + Some((r, reason)) + } + + /// Pop the earliest timed wait only if its deadline is `<= target`. + /// Used by the kernel-driven scheduler loop to consume a just-ripe + /// thread wake after timers fired to that same `target`. If the + /// earliest entry has a later deadline (some other event drove + /// advance), returns `None` and leaves the entry in place. + pub fn advance_to_next_wake_if_due( + &mut self, + target: u64, + ) -> Option<(ThreadRef, BlockReason)> { + let (d, _) = *self.timed_waits.first()?; + if d > target { + return None; + } + self.advance_to_next_wake() + } + + /// Does any thread across any slot exist in a state other than + /// Exited/Idle? + pub fn has_live_thread(&self) -> bool { + self.slots.iter().any(|slot| { + slot.runqueue.iter().any(|t| { + matches!( + t.state, + HwState::Ready | HwState::Blocked(_) | HwState::ServicingIrq(_) + ) + }) + }) + } + + /// Snapshot thread states for diagnostic logging. One entry per live + /// guest thread (Exited are included so post-mortem can see exit codes). + pub fn diagnostic_snapshot(&self) -> Vec<(ThreadRef, Option, HwState)> { + let mut out = Vec::new(); + for (hw_id, slot) in self.slots.iter().enumerate() { + for (idx, t) in slot.runqueue.iter().enumerate() { + out.push(( + ThreadRef::new(hw_id as u8, idx as u16), + Some(t.tid), + t.state.clone(), + )); + } + } + out + } + + /// Force-wake every Blocked waiter (WaitAny/WaitAll/CriticalSection) + /// with STATUS_TIMEOUT. Caller writes the status code into + /// `ctx_mut_ref(r).gpr[3]`. Returns the refs that were woken. + pub fn unblock_on_deadlock(&mut self) -> Vec { + let mut woken = Vec::new(); + for (hw_id, slot) in self.slots.iter_mut().enumerate() { + for (idx, t) in slot.runqueue.iter_mut().enumerate() { + if matches!( + t.state, + HwState::Blocked(BlockReason::WaitAny { .. }) + | HwState::Blocked(BlockReason::WaitAll { .. }) + | HwState::Blocked(BlockReason::CriticalSection(_)) + ) { + t.state = HwState::Ready; + t.quantum_remaining = QUANTUM_DEFAULT; + woken.push(ThreadRef::new(hw_id as u8, idx as u16)); + } + } + } + self.timed_waits.clear(); + for i in 0..HW_THREAD_COUNT as u8 { + self.recompute_slot_runnable(i); + } + woken + } +} + +impl Default for Scheduler { + fn default() -> Self { + Self::new() + } +} + +// ====== Tests ====== + +#[cfg(test)] +mod tests { + use super::*; + + /// No-op PcrWriter for unit tests that don't exercise the guest memory write. + #[derive(Default)] + struct NullPcr; + impl PcrWriter for NullPcr { + fn write_pcr_id(&mut self, _pcr_base: u32, _hw_id: u8) {} + } + + /// PcrWriter that records every write for assertion. + #[derive(Default)] + struct RecordingPcr { + writes: Vec<(u32, u8)>, + } + impl PcrWriter for RecordingPcr { + fn write_pcr_id(&mut self, pcr_base: u32, hw_id: u8) { + self.writes.push((pcr_base, hw_id)); + } + } + + fn mk_scheduler_with_initial() -> Scheduler { + let mut s = Scheduler::new(); + let mut ctx = PpcContext::new(); + ctx.pc = 0x8200_0000; + ctx.gpr[1] = 0x7000_0000; + s.install_initial_thread( + ctx, + 0x7000_0000, + 0x10_0000, + 0x7FFF_0000, + 0x7FFE_0000, + 0x1000, + &mut NullPcr, + ); + s + } + + fn worker_spawn_params(tid: u32, handle: u32) -> SpawnParams { + SpawnParams { + entry: 0x8200_1000, + start_context: 0xDEAD_BEEF, + stack_base: 0x7100_0000 + tid * 0x10_0000, + stack_size: 0x10_0000, + pcr_base: 0x7FEF_0000 + tid * 0x2000, + tls_base: 0x7FEE_0000 + tid * 0x2000, + thread_handle: handle, + guest_tid: tid, + create_suspended: false, + is_initial: false, + tls_slot_count: 0, + affinity_mask: 0xFF, + priority: 0, + ideal_processor: None, + } + } + + // ---- preserved from pre-Axis-1 (updated names and params) ---- + + #[test] + fn spawn_lands_on_least_depth_slot() { + // With only slot 0 occupied, the next spawn must go to slot 1 + // (least depth among 1..5, all zero; 0 < 1). + let mut s = mk_scheduler_with_initial(); + let slot = s + .spawn(worker_spawn_params(2, 0x2000), &mut NullPcr) + .unwrap(); + assert_eq!(slot, 1); + let thread = &s.slots[1].runqueue[0]; + assert_eq!(thread.state, HwState::Ready); + assert_eq!(thread.ctx.pc, 0x8200_1000); + assert_eq!(thread.ctx.gpr[3], 0xDEAD_BEEF); + } + + #[test] + fn suspended_spawn_stays_blocked_until_resume() { + let mut s = mk_scheduler_with_initial(); + let mut params = worker_spawn_params(2, 0x2000); + params.create_suspended = true; + let slot = s.spawn(params, &mut NullPcr).unwrap(); + let r = ThreadRef::new(slot, 0); + assert_eq!( + s.thread(r).state, + HwState::Blocked(BlockReason::Suspended) + ); + assert_eq!(s.thread(r).suspend_count, 1); + let prev = s.resume_ref(r); + assert_eq!(prev, 1); + assert_eq!(s.thread(r).state, HwState::Ready); + } + + #[test] + fn round_schedule_skips_blocked() { + let mut s = mk_scheduler_with_initial(); + let mut params = worker_spawn_params(2, 0x2000); + params.create_suspended = true; + s.spawn(params, &mut NullPcr).unwrap(); + // Initial thread (slot 0) is Ready. Spawned thread (slot 1) is + // Suspended. round_schedule should only list slot 0. + let order = s.round_schedule(); + assert_eq!(order, vec![0]); + } + + #[test] + fn seeded_order_is_deterministic() { + let order = OrderMode::Seeded { seed: 42 }; + let mut s1 = mk_scheduler_with_initial(); + let mut s2 = mk_scheduler_with_initial(); + s1.order = order; + s1.rng_state = 42; + s2.order = order; + s2.rng_state = 42; + for i in 0..5 { + let tid = 2 + i as u32; + let _ = s1.spawn(worker_spawn_params(tid, 0x2000 + i * 4), &mut NullPcr); + let _ = s2.spawn(worker_spawn_params(tid, 0x2000 + i * 4), &mut NullPcr); + } + let a = s1.round_schedule(); + let b = s2.round_schedule(); + assert_eq!(a, b); + } + + #[test] + fn tls_is_per_thread() { + let mut s = mk_scheduler_with_initial(); + s.spawn(worker_spawn_params(2, 0x2000), &mut NullPcr).unwrap(); + s.tls_grow_to(4); + // Simulate running on slot 0 (initial thread) + s.begin_slot_visit(0); + s.tls_set(0, 0xAAAA); + s.end_slot_visit(); + // Simulate running on slot 1 (worker) + s.begin_slot_visit(1); + s.tls_set(0, 0xBBBB); + s.end_slot_visit(); + s.begin_slot_visit(0); + assert_eq!(s.tls_get(0), 0xAAAA); + s.end_slot_visit(); + s.begin_slot_visit(1); + assert_eq!(s.tls_get(0), 0xBBBB); + } + + // ---- new Axis-1 tests ---- + + #[test] + fn test_two_threads_same_slot_higher_priority_runs_first() { + let mut s = mk_scheduler_with_initial(); + // Force both workers onto slot 0 via affinity. + let mut a = worker_spawn_params(2, 0x2000); + a.affinity_mask = 0b0000_0001; + a.priority = 0; + let mut b = worker_spawn_params(3, 0x3000); + b.affinity_mask = 0b0000_0001; + b.priority = 5; + s.spawn(a, &mut NullPcr).unwrap(); + s.spawn(b, &mut NullPcr).unwrap(); + // Slot 0 now holds: [main(pri 0), worker2(pri 0), worker3(pri 5)] + s.begin_slot_visit(0); + let r = s.current.expect("current set"); + let t = s.thread(r); + assert_eq!(t.tid, 3, "worker3 (pri 5) wins the pick"); + assert_eq!(t.priority, 5); + s.end_slot_visit(); + } + + #[test] + fn test_slot_depth_accounting_least_depth_placement() { + // Initial thread sits on slot 0 (depth 1, others 0). Spawning 6 + // more threads with affinity 0xFF should fill slots 1..5 each to + // depth 1, then the 7th lands on whichever slot has depth 1 + // (ties broken by lower index — min_by_key preserves the first + // minimum). + let mut s = mk_scheduler_with_initial(); + let mut placements = Vec::new(); + for i in 0..6 { + let tid = 2 + i as u32; + let slot = s + .spawn(worker_spawn_params(tid, 0x2000 + i * 4), &mut NullPcr) + .unwrap(); + placements.push(slot); + } + // After 7 total threads (1 initial + 6 workers), one of slots 1..5 + // carries 2. Since min_by_key picks the *first* minimum at each + // step and slot 0 starts at depth 1 (initial), placements should + // go: [1, 2, 3, 4, 5, 1] (slot 0 starts at depth 1, others at 0, + // so slot 1 wins first with depth 0; once slot 1 has one, slots + // 2..5 still have 0, so slot 2 next; etc. On the 6th worker all + // slots 1..5 have depth 1, same as slot 0 — min_by_key returns + // slot 0? No: we skip the "current depth" comparison... actually + // our filter includes slot 0 too since mask=0xFF. Slot 0 has + // depth 1, slots 1..5 each have depth 1 after the first 5 + // workers. The 6th worker sees slots 0..5 all with depth 1 ⇒ + // min_by_key returns slot 0 (lowest index). So placements = + // [1, 2, 3, 4, 5, 0]. + assert_eq!(placements, vec![1, 2, 3, 4, 5, 0]); + } + + #[test] + fn test_exited_threads_dont_block_spawn() { + let mut s = mk_scheduler_with_initial(); + // Fill slot 1 to the prune threshold with exited threads. + for i in 0..PRUNE_DEPTH_THRESHOLD { + let tid = 10 + i as u32; + let mut p = worker_spawn_params(tid, 0x4000 + i as u32 * 4); + p.affinity_mask = 0b0000_0010; // only slot 1 + s.spawn(p, &mut NullPcr).unwrap(); + } + assert_eq!(s.slots[1].runqueue.len(), PRUNE_DEPTH_THRESHOLD); + // Mark them all Exited. + for t in s.slots[1].runqueue.iter_mut() { + t.state = HwState::Exited(0); + } + // Now spawn a fresh thread with affinity = slot 1 only. Should + // land successfully (prune kicks in at PRUNE_DEPTH_THRESHOLD). + let mut p = worker_spawn_params(99, 0x9000); + p.affinity_mask = 0b0000_0010; + let slot = s.spawn(p, &mut NullPcr).unwrap(); + assert_eq!(slot, 1); + // Post-prune + push: all-Exited entries gone, fresh thread at idx 0. + assert_eq!(s.slots[1].runqueue.len(), 1); + assert_eq!(s.slots[1].runqueue[0].tid, 99); + } + + #[test] + fn test_threadref_survives_spawn() { + // Peer spawned into the same slot must not shift an existing + // ThreadRef (vec push appends, doesn't reorder). + let mut s = mk_scheduler_with_initial(); + let mut a = worker_spawn_params(2, 0x2000); + a.affinity_mask = 0b0000_0010; // slot 1 + s.spawn(a, &mut NullPcr).unwrap(); + let r_original = ThreadRef { hw_id: 1, idx: 0, generation: 0 }; + assert_eq!(s.thread(r_original).tid, 2); + + let mut b = worker_spawn_params(3, 0x3000); + b.affinity_mask = 0b0000_0010; + s.spawn(b, &mut NullPcr).unwrap(); + // Original ref still resolves to tid 2. + assert_eq!(s.thread(r_original).tid, 2); + assert_eq!(s.slots[1].runqueue[1].tid, 3); + } + + #[test] + fn test_priority_default_zero() { + let mut s = mk_scheduler_with_initial(); + let slot = s + .spawn(worker_spawn_params(2, 0x2000), &mut NullPcr) + .unwrap(); + let r = ThreadRef::new(slot, 0); + assert_eq!(s.priority_ref(r), 0); + let prev = s.set_priority_ref(r, 5); + assert_eq!(prev, 0); + assert_eq!(s.priority_ref(r), 5); + } + + #[test] + fn test_spawn_records_pcr_write() { + let mut s = mk_scheduler_with_initial(); + let mut rec = RecordingPcr::default(); + // install_initial wrote (pcr_base=0x7FFF_0000, hw=0) + // spawn will write (pcr_base=0x7FEF_0000 + delta, hw=1) + let p = worker_spawn_params(2, 0x2000); + let pcr_base = p.pcr_base; + let slot = s.spawn(p, &mut rec).unwrap(); + assert_eq!(rec.writes, vec![(pcr_base, slot)]); + } + + #[test] + fn test_find_by_tid_returns_threadref() { + let mut s = mk_scheduler_with_initial(); + s.spawn(worker_spawn_params(2, 0x2000), &mut NullPcr).unwrap(); + let r = s.find_by_tid(2).expect("spawned tid 2"); + assert_eq!(r, ThreadRef { hw_id: 1, idx: 0, generation: 0 }); + assert!(s.find_by_tid(99).is_none()); + } + + #[test] + fn test_find_by_handle_returns_threadref() { + let mut s = mk_scheduler_with_initial(); + s.spawn(worker_spawn_params(2, 0x2000), &mut NullPcr).unwrap(); + let r = s.find_by_handle(0x2000).expect("handle 0x2000"); + assert_eq!(r, ThreadRef { hw_id: 1, idx: 0, generation: 0 }); + } + + #[test] + fn test_exit_current_marks_state_without_removal() { + // Exit must NOT Vec::remove — that would invalidate peer + // ThreadRefs. State flip + stable positions is the invariant. + let mut s = mk_scheduler_with_initial(); + s.spawn(worker_spawn_params(2, 0x2000), &mut NullPcr).unwrap(); + s.begin_slot_visit(0); + let r = s.current.expect("current set"); + let (hw_id, tid, _handle) = s.exit_current(0xABCD); + s.end_slot_visit(); + assert_eq!(hw_id, 0); + assert_eq!(tid, Some(INITIAL_GUEST_TID)); + // Thread still at slot 0 idx 0, now Exited. + assert_eq!(s.slots[0].runqueue.len(), 1); + assert_eq!(s.slots[0].runqueue[0].state, HwState::Exited(0xABCD)); + // worker on slot 1 idx 0 is unaffected. + assert_eq!(s.slots[1].runqueue[0].tid, 2); + let _ = r; + } + + // ---- Axis 2: rotation + bitset tests ---- + + fn mk_empty_scheduler() -> Scheduler { + // For rotation tests we want NO initial thread on slot 0 — + // every runnable bit comes from explicit spawns below. + Scheduler::new() + } + + #[test] + fn test_rotation_cursor_advances_per_round() { + let mut s = mk_empty_scheduler(); + // Populate all 6 slots with one Ready thread each. + let mut next_tid = 1u32; + for hw in 0..6u8 { + let mut p = SpawnParams::default(); + p.guest_tid = next_tid; + p.thread_handle = 0x1000 + (next_tid * 4); + p.affinity_mask = 1 << hw; + p.pcr_base = 0x40000000 + (hw as u32) * 0x1000; + s.spawn(p, &mut NullPcr).unwrap(); + next_tid += 1; + } + assert_eq!(s.non_empty_runnable, 0b11_1111); + let r1 = s.round_schedule(); + assert_eq!(r1, vec![0, 1, 2, 3, 4, 5]); + let r2 = s.round_schedule(); + assert_eq!(r2, vec![1, 2, 3, 4, 5, 0]); + let r3 = s.round_schedule(); + assert_eq!(r3, vec![2, 3, 4, 5, 0, 1]); + } + + #[test] + fn test_rotation_skips_empty_slots() { + let mut s = mk_empty_scheduler(); + // Slots [Ready, Ready, empty, Ready, Ready, empty] ⇒ bitset 0b011011. + for hw in [0u8, 1, 3, 4] { + let mut p = SpawnParams::default(); + p.guest_tid = (hw + 1) as u32; + p.thread_handle = 0x1000 + (hw as u32) * 4; + p.affinity_mask = 1 << hw; + p.pcr_base = 0x40000000 + (hw as u32) * 0x1000; + s.spawn(p, &mut NullPcr).unwrap(); + } + assert_eq!(s.non_empty_runnable, 0b01_1011); + let r = s.round_schedule(); + assert_eq!(r, vec![0, 1, 3, 4], "emits only slots with bits set"); + let r = s.round_schedule(); + assert_eq!(r, vec![1, 3, 4, 0], "rotation cursor advances past empties"); + } + + #[test] + fn test_park_toggles_bit_and_wake_restores() { + let mut s = mk_empty_scheduler(); + let mut p = SpawnParams::default(); + p.guest_tid = 2; + p.thread_handle = 0x2000; + p.affinity_mask = 0b0010; + p.pcr_base = 0x4000_1000; + s.spawn(p, &mut NullPcr).unwrap(); + assert_eq!(s.non_empty_runnable, 0b0010); + // Park the thread: bit 1 should clear. + s.begin_slot_visit(1); + s.park_current(BlockReason::DelayUntil(1_000_000)); + s.end_slot_visit(); + assert_eq!(s.non_empty_runnable, 0, "park clears slot 1's runnable bit"); + // Wake it: bit 1 restores. + let r = ThreadRef { hw_id: 1, idx: 0, generation: 0 }; + s.wake_ref(r); + assert_eq!(s.non_empty_runnable, 0b0010); + } + + #[test] + fn test_round_schedule_empty_fastpath() { + let mut s = mk_empty_scheduler(); + // No spawns ⇒ bitset is 0 ⇒ fast return without allocating. + assert_eq!(s.non_empty_runnable, 0); + let r = s.round_schedule(); + assert!(r.is_empty()); + // Cursor must not advance on empty rounds (nothing happened). + assert_eq!(s.rotation_cursor, 0); + } + + #[test] + fn test_rotation_fairness_three_slots_two_threads_each() { + let mut s = mk_empty_scheduler(); + // Slots 0, 2, 4 each hold two Ready threads; 1/3/5 empty. + let mut next_tid = 1u32; + for hw in [0u8, 2, 4] { + for _slot_peer in 0..2 { + let mut p = SpawnParams::default(); + p.guest_tid = next_tid; + p.thread_handle = 0x1000 + (next_tid * 4); + p.affinity_mask = 1 << hw; + p.pcr_base = 0x40000000 + (next_tid * 0x1000); + s.spawn(p, &mut NullPcr).unwrap(); + next_tid += 1; + } + } + assert_eq!(s.non_empty_runnable, 0b01_0101); + let r = s.round_schedule(); + // Three entries per round (one per non-empty slot). + assert_eq!(r.len(), 3); + assert!(r.contains(&0) && r.contains(&2) && r.contains(&4)); + } + + // ---- Axis 5: ideal processor + initial placement tests ---- + + #[test] + fn test_spawn_with_ideal_processor_lands_on_ideal_slot() { + let mut s = mk_empty_scheduler(); + let mut p = SpawnParams::default(); + p.guest_tid = 1; + p.thread_handle = 0x1000; + p.affinity_mask = 0xFF; + p.ideal_processor = Some(3); + p.pcr_base = 0x4000_0000; + let slot = s.spawn(p, &mut NullPcr).unwrap(); + assert_eq!(slot, 3, "ideal=3 + mask=0xFF lands on slot 3"); + } + + #[test] + fn test_spawn_with_ideal_outside_mask_falls_back_to_least_depth() { + let mut s = mk_empty_scheduler(); + let mut p = SpawnParams::default(); + p.guest_tid = 1; + p.thread_handle = 0x1000; + p.affinity_mask = 0b0000_0011; // only slots 0, 1 + p.ideal_processor = Some(5); // outside mask + p.pcr_base = 0x4000_0000; + let slot = s.spawn(p, &mut NullPcr).unwrap(); + assert!(slot == 0 || slot == 1, "falls back to mask-allowed least-depth"); + } + + #[test] + fn test_spawn_without_ideal_uses_least_depth() { + let mut s = mk_empty_scheduler(); + // Pre-fill slots 0..3 with one thread each via explicit affinity. + let mut next_tid = 1u32; + for hw in 0..4u8 { + let mut p = SpawnParams::default(); + p.guest_tid = next_tid; + p.thread_handle = 0x1000 + next_tid * 4; + p.affinity_mask = 1 << hw; + p.pcr_base = 0x4000_0000 + next_tid * 0x1000; + s.spawn(p, &mut NullPcr).unwrap(); + next_tid += 1; + } + // Slots 0..3 have depth 1; 4, 5 have depth 0. + let mut p = SpawnParams::default(); + p.guest_tid = next_tid; + p.thread_handle = 0x1000 + next_tid * 4; + p.affinity_mask = 0xFF; + p.ideal_processor = None; + p.pcr_base = 0x4000_0000 + next_tid * 0x1000; + let slot = s.spawn(p, &mut NullPcr).unwrap(); + assert!(slot == 4 || slot == 5, "least-depth wins; slot={}", slot); + } + + #[test] + fn test_set_ideal_ref_roundtrip() { + let mut s = mk_empty_scheduler(); + let mut p = SpawnParams::default(); + p.guest_tid = 1; + p.thread_handle = 0x1000; + p.affinity_mask = 0b0000_0001; + p.pcr_base = 0x4000_0000; + s.spawn(p, &mut NullPcr).unwrap(); + let r = ThreadRef { hw_id: 0, idx: 0, generation: 0 }; + assert_eq!(s.ideal_ref(r), None, "no ideal at spawn"); + let prev = s.set_ideal_ref(r, 4); + assert_eq!(prev, 0xFF, "unset previous returns 0xFF sentinel"); + assert_eq!(s.ideal_ref(r), Some(4)); + let prev = s.set_ideal_ref(r, 2); + assert_eq!(prev, 4); + assert_eq!(s.ideal_ref(r), Some(2)); + } + + // ---- Axis 4: affinity migration tests ---- + + #[test] + fn test_affinity_change_migrates_to_new_slot() { + let mut s = mk_empty_scheduler(); + let mut p = SpawnParams::default(); + p.guest_tid = 1; + p.thread_handle = 0x1000; + p.affinity_mask = 0xFF; + p.pcr_base = 0x4000_0000; + s.spawn(p, &mut NullPcr).unwrap(); + // Landed on slot 0 (least-depth + lowest-index tiebreak). + assert_eq!(s.slots[0].runqueue.len(), 1); + let r = ThreadRef { hw_id: 0, idx: 0, generation: 0 }; + // Restrict to slot 2 only. + let (old, new_ref, _fx) = s.set_affinity_ref(r, 0b0000_0100, &mut NullPcr); + assert_eq!(old, 0xFF); + assert_eq!(new_ref, ThreadRef { hw_id: 2, idx: 0, generation: 0 }); + assert!(s.slots[0].runqueue.is_empty()); + assert_eq!(s.slots[2].runqueue.len(), 1); + } + + #[test] + fn test_affinity_change_stays_put_when_current_allowed() { + let mut s = mk_empty_scheduler(); + let mut p = SpawnParams::default(); + p.guest_tid = 1; + p.thread_handle = 0x1000; + p.affinity_mask = 0b0000_1000; + p.pcr_base = 0x4000_0000; + s.spawn(p, &mut NullPcr).unwrap(); + // Landed on slot 3 (only bit set). + let r = ThreadRef { hw_id: 3, idx: 0, generation: 0 }; + assert_eq!(s.thread(r).tid, 1); + // Expand mask to 0..3 — slot 3 still allowed, no migration. + let (_old, new_ref, _fx) = s.set_affinity_ref(r, 0b0000_1111, &mut NullPcr); + assert_eq!(new_ref, r); + } + + #[test] + fn test_affinity_migration_rewrites_pcr() { + let mut s = mk_empty_scheduler(); + let mut p = SpawnParams::default(); + p.guest_tid = 1; + p.thread_handle = 0x1000; + p.affinity_mask = 0b0000_0010; + p.pcr_base = 0x4100_0000; + s.spawn(p, &mut NullPcr).unwrap(); + let r = ThreadRef { hw_id: 1, idx: 0, generation: 0 }; + let mut rec = RecordingPcr::default(); + let (_old, _new, _fx) = s.set_affinity_ref(r, 0b0001_0000, &mut rec); + // Migration target = slot 4 (the only bit set). + assert_eq!(rec.writes, vec![(0x4100_0000, 4)]); + } + + #[test] + fn test_affinity_mask_zero_treated_as_any() { + let mut s = mk_empty_scheduler(); + let mut p = SpawnParams::default(); + p.guest_tid = 1; + p.thread_handle = 0x1000; + p.affinity_mask = 0b0000_0100; + p.pcr_base = 0x4000_0000; + s.spawn(p, &mut NullPcr).unwrap(); + let r = ThreadRef { hw_id: 2, idx: 0, generation: 0 }; + // mask=0 normalizes to 0xFF; slot 2 is still allowed → no migration. + let (old, new_ref, _fx) = s.set_affinity_ref(r, 0, &mut NullPcr); + assert_eq!(old, 0b0000_0100); + assert_eq!(new_ref, r); + // Verify the stored mask is 0 (we save the raw value) even though + // the effective is 0xFF. + assert_eq!(s.thread(r).affinity_mask, 0); + } + + #[test] + fn test_affinity_migration_fixup_retargets_promoted_peer() { + // Two threads on slot 0: A (idx 0), B (idx 1). Migrate A to + // slot 3 — swap_remove moves B from idx 1 to idx 0. A ref that + // previously pointed at B (idx 1) must be retargeted to idx 0. + let mut s = mk_empty_scheduler(); + for tid in [1u32, 2] { + let mut p = SpawnParams::default(); + p.guest_tid = tid; + p.thread_handle = 0x1000 + tid * 4; + p.affinity_mask = 0b0000_0001; + p.pcr_base = 0x4000_0000 + tid * 0x1000; + s.spawn(p, &mut NullPcr).unwrap(); + } + let a_ref = ThreadRef { hw_id: 0, idx: 0, generation: 0 }; + let b_ref_before = ThreadRef { hw_id: 0, idx: 1, generation: 0 }; + assert_eq!(s.thread(b_ref_before).tid, 2); + let (_old, a_new, fx) = s.set_affinity_ref(a_ref, 0b0000_1000, &mut NullPcr); + let fx = fx.expect("migration emits fixup"); + // A now lives on slot 3 idx 0. + assert_eq!(a_new, ThreadRef { hw_id: 3, idx: 0, generation: 0 }); + // Apply fixup to B's old ref → promoted into slot 0 idx 0. + let mut stale = b_ref_before; + fx.apply(&mut stale); + assert_eq!(stale, ThreadRef { hw_id: 0, idx: 0, generation: 0 }); + assert_eq!(s.thread(stale).tid, 2); + } + // ---- Axis 3: quantum tests ---- + + #[test] + fn test_quantum_rotation_within_slot() { + let mut s = mk_empty_scheduler(); + // A and B both on slot 0 at priority 0. + for tid in [1u32, 2] { + let mut p = SpawnParams::default(); + p.guest_tid = tid; + p.thread_handle = 0x1000 + tid * 4; + p.affinity_mask = 0b0001; + p.pcr_base = 0x4000_0000 + tid * 0x1000; + s.spawn(p, &mut NullPcr).unwrap(); + } + s.begin_slot_visit(0); + let first_tid = s.thread(s.current.unwrap()).tid; + // Drain the quantum. Each call bar the last returns false. + for _ in 0..(QUANTUM_DEFAULT - 1) { + assert!(!s.decrement_quantum()); + } + // The final decrement flips running_idx to the peer. + assert!(s.decrement_quantum()); + let second_tid = s.thread(s.current.unwrap()).tid; + assert_ne!(first_tid, second_tid, "quantum expiry rotates to peer"); + } + + #[test] + fn test_quantum_does_not_rotate_without_same_priority_peer() { + let mut s = mk_empty_scheduler(); + // A priority 0, B priority 5 — B wins pick_runnable outright, so + // quantum expiry on A shouldn't flip to B (priority ordering + // handles that next round instead). + let mut pa = SpawnParams::default(); + pa.guest_tid = 1; + pa.thread_handle = 0x1000; + pa.affinity_mask = 0b0001; + pa.pcr_base = 0x4000_0000; + pa.priority = 0; + s.spawn(pa, &mut NullPcr).unwrap(); + let mut pb = SpawnParams::default(); + pb.guest_tid = 2; + pb.thread_handle = 0x1004; + pb.affinity_mask = 0b0001; + pb.pcr_base = 0x4000_1000; + pb.priority = 5; + s.spawn(pb, &mut NullPcr).unwrap(); + // Force A to be running (pick_runnable would actually pick B; + // drive the test by manually setting current). + s.begin_slot_visit(0); + // pick_runnable selects the priority-5 thread (tid=2) because max_by_key + // returns highest priority. Set running to A (tid=1, idx=0) manually. + s.slots[0].running_idx = Some(0); + s.current = Some(ThreadRef { hw_id: 0, idx: 0, generation: 0 }); + // Drain A's quantum; should reload to DEFAULT but not rotate + // (B is higher priority, not equal). + for _ in 0..QUANTUM_DEFAULT { + let _ = s.decrement_quantum(); + } + let t = s.thread(s.current.unwrap()); + assert_eq!(t.tid, 1, "stays on A; B has higher priority, not equal"); + assert_eq!(t.quantum_remaining, QUANTUM_DEFAULT, "quantum reloaded"); + } + + #[test] + fn test_cooperative_yield_does_not_need_quantum() { + let mut s = mk_empty_scheduler(); + for tid in [1u32, 2] { + let mut p = SpawnParams::default(); + p.guest_tid = tid; + p.thread_handle = 0x1000 + tid * 4; + p.affinity_mask = 0b0001; + p.pcr_base = 0x4000_0000 + tid * 0x1000; + s.spawn(p, &mut NullPcr).unwrap(); + } + s.begin_slot_visit(0); + let first_tid = s.thread(s.current.unwrap()).tid; + // Park via cooperative yield. + s.park_current(BlockReason::DelayUntil(1_000_000)); + s.end_slot_visit(); + // Next round: pick_runnable skips the Blocked one, so the other + // thread is selected. + s.begin_slot_visit(0); + let next_tid = s.thread(s.current.unwrap()).tid; + assert_ne!(first_tid, next_tid, "cooperative park switches thread"); + } + + #[test] + fn test_wake_ref_resets_quantum() { + let mut s = mk_empty_scheduler(); + let mut p = SpawnParams::default(); + p.guest_tid = 2; + p.thread_handle = 0x2000; + p.affinity_mask = 0b0010; + p.pcr_base = 0x4000_1000; + s.spawn(p, &mut NullPcr).unwrap(); + let r = ThreadRef { hw_id: 1, idx: 0, generation: 0 }; + // Park, poke quantum to 1, wake ⇒ quantum back to DEFAULT. + s.thread_mut(r).state = HwState::Blocked(BlockReason::WaitAny { + handles: vec![0xDEAD], + deadline: None, + }); + s.thread_mut(r).quantum_remaining = 1; + s.wake_ref(r); + assert_eq!(s.thread(r).quantum_remaining, QUANTUM_DEFAULT); + } + + #[test] + fn test_wake_ref_restores_ready_and_quantum() { + let mut s = mk_scheduler_with_initial(); + s.spawn(worker_spawn_params(2, 0x2000), &mut NullPcr).unwrap(); + let r = ThreadRef { hw_id: 1, idx: 0, generation: 0 }; + // Park then wake. + s.thread_mut(r).state = HwState::Blocked(BlockReason::WaitAny { + handles: vec![0x1234], + deadline: None, + }); + s.thread_mut(r).quantum_remaining = 1; + s.wake_ref(r); + assert_eq!(s.thread(r).state, HwState::Ready); + assert_eq!(s.thread(r).quantum_remaining, QUANTUM_DEFAULT); + } +} diff --git a/crates/xenia-cpu/src/trap.rs b/crates/xenia-cpu/src/trap.rs new file mode 100644 index 0000000..5889b71 --- /dev/null +++ b/crates/xenia-cpu/src/trap.rs @@ -0,0 +1,95 @@ +//! TO-field evaluation for `tw`, `twi`, `td`, `tdi`. +//! +//! The TO field (5 bits) encodes which comparison outcomes trigger a trap: +//! +//! | bit | condition | +//! |-----|-----------| +//! | 0 | a < b (signed) | +//! | 1 | a > b (signed) | +//! | 2 | a == b | +//! | 3 | a < b (unsigned) | +//! | 4 | a > b (unsigned) | +//! +//! The bit numbering matches PowerISA ("MSB is bit 0"): TO[0] corresponds to +//! the high bit of the 5-bit field, i.e. (to >> 4) & 1. +//! +//! `tw` / `twi` compare the low 32 bits of the operands (sign-extended back to +//! 64 for the signed comparison); `td` / `tdi` compare the full 64 bits. + +#[derive(Clone, Copy, Debug)] +pub enum TrapWidth { + Word, // tw, twi: 32-bit + Doubleword, // td, tdi: 64-bit +} + +const TO_SLT: u32 = 1 << 4; // a < b signed +const TO_SGT: u32 = 1 << 3; // a > b signed +const TO_EQ: u32 = 1 << 2; // a == b +const TO_ULT: u32 = 1 << 1; // a < b unsigned +const TO_UGT: u32 = 1 << 0; // a > b unsigned + +/// Returns true when the trap should fire. +pub fn evaluate(to: u32, a: u64, b: u64, width: TrapWidth) -> bool { + let (sa, sb, ua, ub): (i64, i64, u64, u64) = match width { + TrapWidth::Word => ( + a as i32 as i64, + b as i32 as i64, + a as u32 as u64, + b as u32 as u64, + ), + TrapWidth::Doubleword => (a as i64, b as i64, a, b), + }; + + if (to & TO_SLT) != 0 && sa < sb { return true; } + if (to & TO_SGT) != 0 && sa > sb { return true; } + if (to & TO_EQ) != 0 && ua == ub { return true; } + if (to & TO_ULT) != 0 && ua < ub { return true; } + if (to & TO_UGT) != 0 && ua > ub { return true; } + false +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn to_zero_never_traps() { + assert!(!evaluate(0, 0, 0, TrapWidth::Doubleword)); + assert!(!evaluate(0, 5, 3, TrapWidth::Doubleword)); + assert!(!evaluate(0, !0, 0, TrapWidth::Doubleword)); + } + + #[test] + fn to_31_always_traps_when_any_condition_holds() { + // 31 = 0b11111 = all conditions enabled + assert!(evaluate(31, 1, 2, TrapWidth::Doubleword)); // slt+ult + assert!(evaluate(31, 2, 1, TrapWidth::Doubleword)); // sgt+ugt + assert!(evaluate(31, 7, 7, TrapWidth::Doubleword)); // eq + } + + #[test] + fn to_eq_only() { + // TO[2] = 0b00100 = 4 + assert!(evaluate(4, 5, 5, TrapWidth::Doubleword)); + assert!(!evaluate(4, 5, 6, TrapWidth::Doubleword)); + } + + #[test] + fn to_signed_vs_unsigned_on_negative() { + // a=-1 (as u64 = all-ones). TO[0]=slt enabled = 0b10000 = 16 + // Signed: -1 < 0 → true + let neg1 = (-1i64) as u64; + assert!(evaluate(16, neg1, 0, TrapWidth::Doubleword)); + // TO[3]=ult enabled = 0b00010 = 2 → unsigned: all-ones < 0 is false + assert!(!evaluate(2, neg1, 0, TrapWidth::Doubleword)); + } + + #[test] + fn word_width_ignores_high_32_bits() { + // a's low 32 = 1, high 32 = different; b = 1. With TO=eq, should trap. + let a = 0xDEAD_BEEF_0000_0001u64; + assert!(evaluate(4, a, 1, TrapWidth::Word)); + // In doubleword, different. + assert!(!evaluate(4, a, 1, TrapWidth::Doubleword)); + } +} diff --git a/crates/xenia-cpu/src/vmx.rs b/crates/xenia-cpu/src/vmx.rs new file mode 100644 index 0000000..5650a34 --- /dev/null +++ b/crates/xenia-cpu/src/vmx.rs @@ -0,0 +1,918 @@ +//! VMX / AltiVec helper routines shared by the interpreter's 150+ vector +//! opcode handlers. +//! +//! Big-endian lane indexing throughout: `Vec128::bytes[0]` is the most +//! significant byte, which corresponds to PowerPC lane 0. Operations that +//! care about "even" vs "odd" lanes follow the PPC convention (lane 0 = most +//! significant = "even" for multiply-even/odd purposes). + +use xenia_memory::MemoryAccess; +use xenia_types::Vec128; + +// ─── Lane accessors ──────────────────────────────────────────────────────── + +#[inline] pub fn as_i8x16(v: Vec128) -> [i8; 16] { + let b = v.as_bytes(); + let mut r = [0i8; 16]; + for i in 0..16 { r[i] = b[i] as i8; } + r +} + +#[inline] pub fn as_i16x8(v: Vec128) -> [i16; 8] { + let u = v.as_u16x8(); + [u[0] as i16, u[1] as i16, u[2] as i16, u[3] as i16, + u[4] as i16, u[5] as i16, u[6] as i16, u[7] as i16] +} + +#[inline] pub fn as_i32x4(v: Vec128) -> [i32; 4] { + let u = v.as_u32x4(); + [u[0] as i32, u[1] as i32, u[2] as i32, u[3] as i32] +} + +#[inline] pub fn from_i8x16(r: [i8; 16]) -> Vec128 { + let mut b = [0u8; 16]; + for i in 0..16 { b[i] = r[i] as u8; } + Vec128::from_bytes(b) +} + +#[inline] pub fn from_i16x8(r: [i16; 8]) -> Vec128 { + Vec128::from_u16x8_array([ + r[0] as u16, r[1] as u16, r[2] as u16, r[3] as u16, + r[4] as u16, r[5] as u16, r[6] as u16, r[7] as u16, + ]) +} + +#[inline] pub fn from_i32x4(r: [i32; 4]) -> Vec128 { + Vec128::from_u32x4_array([r[0] as u32, r[1] as u32, r[2] as u32, r[3] as u32]) +} + +// ─── Saturation helpers ──────────────────────────────────────────────────── +// Each returns (clamped_value, saturated_flag). Handlers OR the flags together +// and call `ctx.set_vscr_sat(true)` once per instruction. + +#[inline] pub fn sat_add_u8(a: u8, b: u8) -> (u8, bool) { + let s = a as u16 + b as u16; + if s > u8::MAX as u16 { (u8::MAX, true) } else { (s as u8, false) } +} +#[inline] pub fn sat_sub_u8(a: u8, b: u8) -> (u8, bool) { + if a >= b { (a - b, false) } else { (0, true) } +} +#[inline] pub fn sat_add_i8(a: i8, b: i8) -> (i8, bool) { + let s = a as i16 + b as i16; + if s > i8::MAX as i16 { (i8::MAX, true) } + else if s < i8::MIN as i16 { (i8::MIN, true) } + else { (s as i8, false) } +} +#[inline] pub fn sat_sub_i8(a: i8, b: i8) -> (i8, bool) { + let s = a as i16 - b as i16; + if s > i8::MAX as i16 { (i8::MAX, true) } + else if s < i8::MIN as i16 { (i8::MIN, true) } + else { (s as i8, false) } +} + +#[inline] pub fn sat_add_u16(a: u16, b: u16) -> (u16, bool) { + let s = a as u32 + b as u32; + if s > u16::MAX as u32 { (u16::MAX, true) } else { (s as u16, false) } +} +#[inline] pub fn sat_sub_u16(a: u16, b: u16) -> (u16, bool) { + if a >= b { (a - b, false) } else { (0, true) } +} +#[inline] pub fn sat_add_i16(a: i16, b: i16) -> (i16, bool) { + let s = a as i32 + b as i32; + if s > i16::MAX as i32 { (i16::MAX, true) } + else if s < i16::MIN as i32 { (i16::MIN, true) } + else { (s as i16, false) } +} +#[inline] pub fn sat_sub_i16(a: i16, b: i16) -> (i16, bool) { + let s = a as i32 - b as i32; + if s > i16::MAX as i32 { (i16::MAX, true) } + else if s < i16::MIN as i32 { (i16::MIN, true) } + else { (s as i16, false) } +} + +#[inline] pub fn sat_add_u32(a: u32, b: u32) -> (u32, bool) { + let s = a as u64 + b as u64; + if s > u32::MAX as u64 { (u32::MAX, true) } else { (s as u32, false) } +} +#[inline] pub fn sat_sub_u32(a: u32, b: u32) -> (u32, bool) { + if a >= b { (a - b, false) } else { (0, true) } +} +#[inline] pub fn sat_add_i32(a: i32, b: i32) -> (i32, bool) { + let s = a as i64 + b as i64; + if s > i32::MAX as i64 { (i32::MAX, true) } + else if s < i32::MIN as i64 { (i32::MIN, true) } + else { (s as i32, false) } +} +#[inline] pub fn sat_sub_i32(a: i32, b: i32) -> (i32, bool) { + let s = a as i64 - b as i64; + if s > i32::MAX as i64 { (i32::MAX, true) } + else if s < i32::MIN as i64 { (i32::MIN, true) } + else { (s as i32, false) } +} + +// Pack-with-saturation helpers — clamp a wider integer to the narrower type. +#[inline] pub fn sat_i16_to_i8(v: i16) -> (i8, bool) { + if v > i8::MAX as i16 { (i8::MAX, true) } + else if v < i8::MIN as i16 { (i8::MIN, true) } + else { (v as i8, false) } +} +#[inline] pub fn sat_i16_to_u8(v: i16) -> (u8, bool) { + if v < 0 { (0, true) } + else if v > u8::MAX as i16 { (u8::MAX, true) } + else { (v as u8, false) } +} +#[inline] pub fn sat_u16_to_u8(v: u16) -> (u8, bool) { + if v > u8::MAX as u16 { (u8::MAX, true) } else { (v as u8, false) } +} +#[inline] pub fn sat_i32_to_i16(v: i32) -> (i16, bool) { + if v > i16::MAX as i32 { (i16::MAX, true) } + else if v < i16::MIN as i32 { (i16::MIN, true) } + else { (v as i16, false) } +} +#[inline] pub fn sat_i32_to_u16(v: i32) -> (u16, bool) { + if v < 0 { (0, true) } + else if v > u16::MAX as i32 { (u16::MAX, true) } + else { (v as u16, false) } +} +#[inline] pub fn sat_u32_to_u16(v: u32) -> (u16, bool) { + if v > u16::MAX as u32 { (u16::MAX, true) } else { (v as u16, false) } +} +#[inline] pub fn sat_i64_to_i32(v: i64) -> (i32, bool) { + if v > i32::MAX as i64 { (i32::MAX, true) } + else if v < i32::MIN as i64 { (i32::MIN, true) } + else { (v as i32, false) } +} +#[inline] pub fn sat_i64_to_u32(v: i64) -> (u32, bool) { + if v < 0 { (0, true) } + else if v > u32::MAX as i64 { (u32::MAX, true) } + else { (v as u32, false) } +} + +// ─── Averages ────────────────────────────────────────────────────────────── +// PPC avg is rounded up: (a + b + 1) / 2. +#[inline] pub fn avg_u8(a: u8, b: u8) -> u8 { + ((a as u16 + b as u16 + 1) >> 1) as u8 +} +#[inline] pub fn avg_u16(a: u16, b: u16) -> u16 { + ((a as u32 + b as u32 + 1) >> 1) as u16 +} +#[inline] pub fn avg_u32(a: u32, b: u32) -> u32 { + ((a as u64 + b as u64 + 1) >> 1) as u32 +} +#[inline] pub fn avg_i8(a: i8, b: i8) -> i8 { + ((a as i32 + b as i32 + 1) >> 1) as i8 +} +#[inline] pub fn avg_i16(a: i16, b: i16) -> i16 { + ((a as i32 + b as i32 + 1) >> 1) as i16 +} +#[inline] pub fn avg_i32(a: i32, b: i32) -> i32 { + ((a as i64 + b as i64 + 1) >> 1) as i32 +} + +// ─── NaN-aware f32 min/max for vmaxfp / vminfp ──────────────────────────── +// +// Altivec PEM: "If either element of vA or vB is a NaN, the corresponding +// element of vD is set to the quiet NaN form of that NaN". Rust's `>` / `<` +// comparison with NaN always returns false, so `if a > b { a } else { b }` +// would silently pick `b` whenever `a` is NaN — losing NaN propagation. + +#[inline] +pub fn max_nan(a: f32, b: f32) -> f32 { + if a.is_nan() { quiet_nan(a) } + else if b.is_nan() { quiet_nan(b) } + else if a > b { a } else { b } +} + +#[inline] +pub fn min_nan(a: f32, b: f32) -> f32 { + if a.is_nan() { quiet_nan(a) } + else if b.is_nan() { quiet_nan(b) } + else if a < b { a } else { b } +} + +/// Convert an SNaN to QNaN by setting the high mantissa bit. A QNaN is +/// returned unchanged. +#[inline] +pub fn quiet_nan(x: f32) -> f32 { + if !x.is_nan() { return x; } + f32::from_bits(x.to_bits() | 0x0040_0000) +} + +/// Flush a subnormal f32 to ±0 (preserving the sign). Used by vmaddfp family, +/// vctsxs / vctuxs, and any instruction whose AltiVec definition specifies +/// input-side denormal flushing regardless of VSCR[NJ]. +#[inline] +pub fn flush_denorm(x: f32) -> f32 { + if x.is_subnormal() { + if x.is_sign_negative() { -0.0 } else { 0.0 } + } else { + x + } +} + +// ─── Float ⇄ fixed-point conversions (scaled by 2^scale_bits) ───────────── +// +// vctsxs / vctuxs flush denormal inputs to 0 before scaling, per Altivec. +#[inline] pub fn cvt_f32_to_i32_sat(x: f32, scale_bits: u32) -> (i32, bool) { + if x.is_nan() { return (0, true); } + let x = flush_denorm(x); + let scaled = (x as f64) * ((1u64 << scale_bits) as f64); + if scaled >= i32::MAX as f64 { return (i32::MAX, true); } + if scaled <= i32::MIN as f64 { return (i32::MIN, true); } + (scaled.trunc() as i32, false) +} +#[inline] pub fn cvt_f32_to_u32_sat(x: f32, scale_bits: u32) -> (u32, bool) { + if x.is_nan() { return (0, true); } + let x = flush_denorm(x); + let scaled = (x as f64) * ((1u64 << scale_bits) as f64); + if scaled < 0.0 { return (0, true); } + if scaled > u32::MAX as f64 { return (u32::MAX, true); } + (scaled.trunc() as u32, false) +} +#[inline] pub fn cvt_i32_to_f32(v: i32, scale_bits: u32) -> f32 { + (v as f64 / (1u64 << scale_bits) as f64) as f32 +} +#[inline] pub fn cvt_u32_to_f32(v: u32, scale_bits: u32) -> f32 { + (v as f64 / (1u64 << scale_bits) as f64) as f32 +} + +// ─── Unaligned vector load/store ────────────────────────────────────────── +// +// lvlx/lvrx and stvlx/stvrx combine to perform any unaligned 16-byte access: +// lvlx(EA) | lvrx(EA + 16) loads 16 bytes starting at unaligned EA. +// stvlx(EA); stvrx(EA + 16) stores 16 bytes starting at unaligned EA. +// +// Semantics per the AltiVec manual (and xenia-canary ppc_emit_memory.cc): +// lvlx: shift = EA & 0xF, n = 16 - shift. Loads mem[EA..EA+n] into +// lanes VR[0..n], zeros VR[n..16]. +// lvrx: shift = EA & 0xF. If shift == 0, VR = 0. Otherwise loads +// mem[EA-shift..EA] into lanes VR[16-shift..16], zeros VR[0..16-shift]. +// stvlx / stvrx are the symmetric stores. +// +// `Vec128::bytes[0]` is the most significant byte (PPC lane 0 in BE view). + +pub fn load_vector_left(mem: &dyn MemoryAccess, ea: u32) -> Vec128 { + let shift = (ea & 0xF) as usize; + let n = 16 - shift; + let mut bytes = [0u8; 16]; + for i in 0..n { + bytes[i] = mem.read_u8(ea.wrapping_add(i as u32)); + } + Vec128::from_bytes(bytes) +} + +pub fn load_vector_right(mem: &dyn MemoryAccess, ea: u32) -> Vec128 { + let shift = (ea & 0xF) as usize; + if shift == 0 { return Vec128::ZERO; } + let base = ea & !0xFu32; + let mut bytes = [0u8; 16]; + for i in 0..shift { + bytes[16 - shift + i] = mem.read_u8(base.wrapping_add(i as u32)); + } + Vec128::from_bytes(bytes) +} + +pub fn store_vector_left(mem: &dyn MemoryAccess, ea: u32, v: Vec128) { + let shift = (ea & 0xF) as usize; + let n = 16 - shift; + let b = v.as_bytes(); + for i in 0..n { + mem.write_u8(ea.wrapping_add(i as u32), b[i]); + } +} + +pub fn store_vector_right(mem: &dyn MemoryAccess, ea: u32, v: Vec128) { + let shift = (ea & 0xF) as usize; + if shift == 0 { return; } + let base = ea & !0xFu32; + let b = v.as_bytes(); + for i in 0..shift { + mem.write_u8(base.wrapping_add(i as u32), b[16 - shift + i]); + } +} + +// ─── 5-6-5 pixel pack (vpkpx / vupkhpx / vupklpx) ───────────────────────── +// PPC vpkpx takes a 32-bit RGB lane and packs it into a 16-bit 1-5-5-5 pixel. +// vupkhpx / vupklpx reverse the operation. +// +// Format: input 32-bit word holds +// bits 0-6: unused (0) +// bit 7: alpha-select (→ bit 15 of output) +// bits 8-15: R (top 5 bits kept) +// bits 16-23: G (top 5 bits kept) +// bits 24-31: B (top 5 bits kept) +// Output 16-bit word: +// bit 15: A (from input bit 7) +// bits 10-14: R +// bits 5-9: G +// bits 0-4: B + +#[inline] pub fn pack_pixel_555(input: u32) -> u16 { + let a = (input >> 7) & 0x1; + let r = (input >> 8) & 0xFF; + let g = (input >> 16) & 0xFF; + let b = (input >> 24) & 0xFF; + ((a << 15) | ((r & 0xF8) << 7) | ((g & 0xF8) << 2) | ((b & 0xF8) >> 3)) as u16 +} + +#[inline] pub fn unpack_pixel_555(input: u16) -> u32 { + let input = input as u32; + let a = (input >> 15) & 0x1; + let r = (input >> 10) & 0x1F; + let g = (input >> 5) & 0x1F; + let b = input & 0x1F; + // Sign-extend A and replicate 5-bit RGB into the top of each byte. + let a8 = if a != 0 { 0xFFu32 } else { 0 }; + let r8 = (r << 3) | (r >> 2); + let g8 = (g << 3) | (g >> 2); + let b8 = (b << 3) | (b >> 2); + (a8 << 24) | (r8 << 16) | (g8 << 8) | b8 +} + +// ─── VMX128 D3D pack/unpack dispatch ────────────────────────────────────── +// `vpkd3d128` / `vupkd3d128` encode a small enum in the instruction word +// (VX128_4 immediate field). The exact enum lives in canary's +// ppc_emit_altivec.cc under PACK_TYPE_*; titles usually touch D3DCOLOR +// (type 0) and a handful of texture-coordinate variants. +// +// Rather than risk getting a rarely-used sub-case wrong, we implement the +// common types and fall back to a warning + pass-through for unknown types. +// Returning the VB register value unchanged is always preferable to emitting +// StepResult::Unimplemented because it keeps the interpreter running. + +/// Pack-type encoding of `vpkd3d128` / `vupkd3d128`. +/// +/// The immediate field lives at PPC bits 16-22 (VX128_3/4 IMM, 7 bits). +/// Canary decodes `type = IMM >> 2` (top 5 bits) and `pack = IMM & 0x3` +/// (low 2 bits, used only by `vpkd3d128` to select output-slot layout). +/// Valid `type` values are 0..=6 per `ppc_emit_altivec.cc:2095-2118`: +/// +/// | id | canary name | format | +/// |----|-------------------|---------------------------------------| +/// | 0 | VPACK_D3DCOLOR | 4 f32 [0,1] ↔ ARGB8 | +/// | 1 | VPACK_NORMSHORT2 | 2 f32 [-1,1] ↔ 2× signed-normalized i16 | +/// | 2 | VPACK_NORMPACKED32| 4 f32 [-1,1] ↔ UINT_2101010 (w:2,z:10,y:10,x:10) | +/// | 3 | VPACK_FLOAT16_2 | 2 f32 ↔ 2× fp16 | +/// | 4 | VPACK_NORMSHORT4 | 4 f32 [-1,1] ↔ 4× signed-normalized i16 | +/// | 5 | VPACK_FLOAT16_4 | 4 f32 ↔ 4× fp16 | +/// | 6 | VPACK_NORMPACKED64| 4 f32 [-1,1] ↔ ULONG_4202020 (w:4,z:20,y:20,x:20) | +/// +/// Prior (M3-pre) this enum listed made-up "Normal16"/"Normal8"/"UByteN4" +/// variants that didn't match canary; the immediate extraction was also +/// wrong (LSB-numbered `>>6 & 0x7` instead of MSB-numbered `>>11 & 0x1F` +/// against a 7-bit IMM field). M3 fixes both. +#[derive(Debug, Clone, Copy)] +pub enum D3dPackType { + D3dColor, + NormShort2, + NormPacked32, + Float16_2, + NormShort4, + Float16_4, + NormPacked64, + Other(u32), +} + +impl D3dPackType { + /// Decode the `type` bits extracted from the VX128_3/4 IMM field via + /// canary's `IMM >> 2` convention (i.e. the caller has already divided + /// out the 2-bit `pack` subfield). + pub fn from_immediate(type_bits: u32) -> Self { + match type_bits { + 0 => Self::D3dColor, + 1 => Self::NormShort2, + 2 => Self::NormPacked32, + 3 => Self::Float16_2, + 4 => Self::NormShort4, + 5 => Self::Float16_4, + 6 => Self::NormPacked64, + other => Self::Other(other), + } + } +} + +/// Pack an f32x4 vector of [R, G, B, A] in [0.0, 1.0] into a single D3DCOLOR +/// value in lane 3 of the output. +pub fn pack_d3dcolor(v: Vec128) -> Vec128 { + let f = v.as_f32x4(); + let to_byte = |x: f32| -> u32 { + let c = x.clamp(0.0, 1.0) * 255.0; + (c + 0.5) as u32 & 0xFF + }; + // D3DCOLOR is A,R,G,B in that byte order inside a u32. + let word = (to_byte(f[3]) << 24) | (to_byte(f[0]) << 16) | (to_byte(f[1]) << 8) | to_byte(f[2]); + Vec128::from_u32x4(0, 0, 0, word) +} + +/// Unpack a D3DCOLOR value (in lane 3 of the input) into an f32x4 [R, G, B, A]. +pub fn unpack_d3dcolor(v: Vec128) -> Vec128 { + let word = v.u32x4(3); + let a = ((word >> 24) & 0xFF) as f32 / 255.0; + let r = ((word >> 16) & 0xFF) as f32 / 255.0; + let g = ((word >> 8) & 0xFF) as f32 / 255.0; + let b = (word & 0xFF) as f32 / 255.0; + Vec128::from_f32x4(r, g, b, a) +} + +// ─────────────────────────────────────────────────────────────────────── +// First-Pixels M3 — pack/unpack for the remaining canary pack types. +// +// Conventions shared across all helpers: +// * Input-to-`unpack_*` (packed data) lives in the *source* lane position +// canary's HIR assumes: canonically the 32-bit word is in lane 3 and +// the 64-bit value straddles lanes 2-3. We match that so the existing +// D3DCOLOR helpers' 3-lane convention is preserved across the whole +// pack-type family. +// * Output-from-`pack_*` sits in the same lane(s). The caller usually +// follows with a permute to move it elsewhere (the VX128_4 `pack` +// subfield controls that in `vpkd3d128`). +// * Range semantics match canary: normalized types use `max` = (1< i16 { + let c = x.clamp(-1.0, 1.0) * 32767.0; + // Round half away from zero, matching canary's `vcfsx` semantics. + let r = if c >= 0.0 { (c + 0.5) as i32 } else { (c - 0.5) as i32 }; + r.clamp(-32768, 32767) as i16 +} + +#[inline] +fn i16_to_norm(s: i16) -> f32 { + (s as f32) / 32767.0 +} + +/// **NORMSHORT2** — 2 f32s in [-1, 1] → two 16-bit signed-normalized +/// shorts packed as `(x << 16) | y` in lane 3 (high 32 bits of the word +/// hold X; low 16 hold Y). Output lanes 0..=2 are zero-filled. +pub fn pack_normshort2(v: Vec128) -> Vec128 { + let f = v.as_f32x4(); + let x = norm_to_i16(f[0]) as u16 as u32; + let y = norm_to_i16(f[1]) as u16 as u32; + Vec128::from_u32x4(0, 0, 0, (x << 16) | y) +} + +pub fn unpack_normshort2(v: Vec128) -> Vec128 { + let word = v.u32x4(3); + let x = i16_to_norm((word >> 16) as i16); + let y = i16_to_norm(word as i16); + Vec128::from_f32x4(x, y, 0.0, 1.0) +} + +/// **NORMSHORT4** — 4 f32s in [-1, 1] → four 16-bit signed-normalized +/// shorts packed across lanes 2-3 (big-endian dword order: X in the +/// high word of lane 2, Y low of lane 2, Z high of lane 3, W low of lane +/// 3). +pub fn pack_normshort4(v: Vec128) -> Vec128 { + let f = v.as_f32x4(); + let x = norm_to_i16(f[0]) as u16 as u32; + let y = norm_to_i16(f[1]) as u16 as u32; + let z = norm_to_i16(f[2]) as u16 as u32; + let w = norm_to_i16(f[3]) as u16 as u32; + Vec128::from_u32x4(0, 0, (x << 16) | y, (z << 16) | w) +} + +pub fn unpack_normshort4(v: Vec128) -> Vec128 { + let hi = v.u32x4(2); + let lo = v.u32x4(3); + let x = i16_to_norm((hi >> 16) as i16); + let y = i16_to_norm(hi as i16); + let z = i16_to_norm((lo >> 16) as i16); + let w = i16_to_norm(lo as i16); + Vec128::from_f32x4(x, y, z, w) +} + +/// **NORMPACKED32** — UINT_2101010 layout, 4 f32s in [-1, 1] packed into +/// 32 bits in lane 3. Per canary's comment `2_10_10_10 w_z_y_x`: the +/// high 2 bits hold W (signed 2-bit, -2..=1), then Z/Y/X each use 10 +/// signed-normalized bits. +pub fn pack_normpacked32(v: Vec128) -> Vec128 { + let f = v.as_f32x4(); + #[inline] + fn n10(x: f32) -> u32 { + let c = x.clamp(-1.0, 1.0) * 511.0; + let r = if c >= 0.0 { (c + 0.5) as i32 } else { (c - 0.5) as i32 }; + (r.clamp(-512, 511) as i32 as u32) & 0x3FF + } + #[inline] + fn n2(x: f32) -> u32 { + let c = x.clamp(-1.0, 1.0) * 1.0; + let r = if c >= 0.0 { (c + 0.5) as i32 } else { (c - 0.5) as i32 }; + (r.clamp(-2, 1) as i32 as u32) & 0x3 + } + let x = n10(f[0]); + let y = n10(f[1]); + let z = n10(f[2]); + let w = n2(f[3]); + let word = (w << 30) | (z << 20) | (y << 10) | x; + Vec128::from_u32x4(0, 0, 0, word) +} + +pub fn unpack_normpacked32(v: Vec128) -> Vec128 { + let word = v.u32x4(3); + #[inline] + fn u10_to_norm(bits: u32) -> f32 { + // Sign-extend the 10-bit field then normalize. + let s = ((bits & 0x3FF) as i32) << 22 >> 22; + (s as f32) / 511.0 + } + #[inline] + fn u2_to_norm(bits: u32) -> f32 { + let s = ((bits & 0x3) as i32) << 30 >> 30; + (s as f32).clamp(-1.0, 1.0) + } + let x = u10_to_norm(word); + let y = u10_to_norm(word >> 10); + let z = u10_to_norm(word >> 20); + let w = u2_to_norm(word >> 30); + Vec128::from_f32x4(x, y, z, w) +} + +/// **NORMPACKED64** — ULONG_4202020, 4 f32s in [-1, 1] packed into 64 +/// bits across lanes 2-3. Per canary's comment `4_20_20_20 w_z_y_x`: +/// the high 4 bits of the dword hold W (signed 4-bit); the remaining 60 +/// bits hold 3× 20-bit signed-normalized Z/Y/X. Rare outside very few +/// titles (canary notes 54540829). +pub fn pack_normpacked64(v: Vec128) -> Vec128 { + let f = v.as_f32x4(); + #[inline] + fn n20(x: f32) -> u64 { + let c = x.clamp(-1.0, 1.0) * 524287.0; // 2^19 - 1 + let r = if c >= 0.0 { (c + 0.5) as i64 } else { (c - 0.5) as i64 }; + (r.clamp(-524288, 524287) as i64 as u64) & 0xF_FFFF + } + #[inline] + fn n4(x: f32) -> u64 { + let c = x.clamp(-1.0, 1.0) * 7.0; + let r = if c >= 0.0 { (c + 0.5) as i64 } else { (c - 0.5) as i64 }; + (r.clamp(-8, 7) as i64 as u64) & 0xF + } + let x = n20(f[0]); + let y = n20(f[1]); + let z = n20(f[2]); + let w = n4(f[3]); + let dw: u64 = (w << 60) | (z << 40) | (y << 20) | x; + Vec128::from_u32x4(0, 0, (dw >> 32) as u32, dw as u32) +} + +pub fn unpack_normpacked64(v: Vec128) -> Vec128 { + let hi = v.u32x4(2) as u64; + let lo = v.u32x4(3) as u64; + let dw = (hi << 32) | lo; + #[inline] + fn u20_to_norm(bits: u64) -> f32 { + let s = ((bits & 0xF_FFFF) as i64) << 44 >> 44; + (s as f32) / 524287.0 + } + #[inline] + fn u4_to_norm(bits: u64) -> f32 { + let s = ((bits & 0xF) as i64) << 60 >> 60; + (s as f32) / 7.0 + } + let x = u20_to_norm(dw); + let y = u20_to_norm(dw >> 20); + let z = u20_to_norm(dw >> 40); + let w = u4_to_norm(dw >> 60); + Vec128::from_f32x4(x, y, z, w) +} + +/// IEEE 754 half-precision float pack/unpack — used by both FLOAT16_2 +/// and FLOAT16_4. No FMA quirks involved; we go via `f32::to_bits` and +/// manual bit-twiddling (the stable-Rust `f16` type isn't available +/// yet). +#[inline] +fn f32_to_f16_bits(f: f32) -> u16 { + let bits = f.to_bits(); + let sign = ((bits >> 31) & 0x1) as u16; + let exp = ((bits >> 23) & 0xFF) as i32; + let mant = bits & 0x7FFFFF; + // Handle the easy cases first. + if exp == 0xFF { + // NaN or infinity. + let half_exp = 0x1F; + let half_mant = if mant != 0 { 0x200 } else { 0 }; // quiet NaN / zero mantissa for Inf + return (sign << 15) | (half_exp << 10) | half_mant; + } + let unbiased_exp = exp - 127; + if unbiased_exp >= 16 { + // Overflow → infinity. + return (sign << 15) | (0x1F << 10); + } + if unbiased_exp <= -15 { + // Denormal or zero. Compute the shift and subnormal mantissa; + // anything too small flushes to signed zero. + if unbiased_exp < -24 { + return sign << 15; + } + let shift = -14 - unbiased_exp as i32; // amount to shift the implicit-1'd mantissa + let full_mant = 0x800000 | mant; // 24 bits with implicit leading 1 + let half_mant = (full_mant >> (shift + 13)) as u16; + return (sign << 15) | half_mant; + } + let half_exp = ((unbiased_exp + 15) as u16) & 0x1F; + let half_mant = (mant >> 13) as u16; + (sign << 15) | (half_exp << 10) | half_mant +} + +#[inline] +fn f16_bits_to_f32(h: u16) -> f32 { + let sign = ((h >> 15) & 0x1) as u32; + let exp = ((h >> 10) & 0x1F) as i32; + let mant = (h & 0x3FF) as u32; + let bits = if exp == 0x1F { + // NaN or infinity. + let f32_exp = 0xFFu32; + let f32_mant = if mant != 0 { 0x400000 } else { 0 }; + (sign << 31) | (f32_exp << 23) | f32_mant + } else if exp == 0 && mant == 0 { + // Signed zero. + sign << 31 + } else if exp == 0 { + // Subnormal — renormalize. + let mut e = -14i32; + let mut m = mant; + while (m & 0x400) == 0 { + m <<= 1; + e -= 1; + } + let f32_exp = ((e + 127) as u32) & 0xFF; + let f32_mant = (m & 0x3FF) << 13; + (sign << 31) | (f32_exp << 23) | f32_mant + } else { + let f32_exp = ((exp - 15 + 127) as u32) & 0xFF; + let f32_mant = mant << 13; + (sign << 31) | (f32_exp << 23) | f32_mant + }; + f32::from_bits(bits) +} + +/// **FLOAT16_2** — two 32-bit floats → two half-floats packed into one +/// 32-bit word (X in high 16 bits of lane 3, Y in low 16). +pub fn pack_float16_2(v: Vec128) -> Vec128 { + let f = v.as_f32x4(); + let x = f32_to_f16_bits(f[0]) as u32; + let y = f32_to_f16_bits(f[1]) as u32; + Vec128::from_u32x4(0, 0, 0, (x << 16) | y) +} + +pub fn unpack_float16_2(v: Vec128) -> Vec128 { + let word = v.u32x4(3); + let x = f16_bits_to_f32((word >> 16) as u16); + let y = f16_bits_to_f32(word as u16); + Vec128::from_f32x4(x, y, 0.0, 1.0) +} + +/// **FLOAT16_4** — four 32-bit floats → four half-floats packed across +/// 64 bits (lanes 2-3). +pub fn pack_float16_4(v: Vec128) -> Vec128 { + let f = v.as_f32x4(); + let x = f32_to_f16_bits(f[0]) as u32; + let y = f32_to_f16_bits(f[1]) as u32; + let z = f32_to_f16_bits(f[2]) as u32; + let w = f32_to_f16_bits(f[3]) as u32; + Vec128::from_u32x4(0, 0, (x << 16) | y, (z << 16) | w) +} + +pub fn unpack_float16_4(v: Vec128) -> Vec128 { + let hi = v.u32x4(2); + let lo = v.u32x4(3); + let x = f16_bits_to_f32((hi >> 16) as u16); + let y = f16_bits_to_f32(hi as u16); + let z = f16_bits_to_f32((lo >> 16) as u16); + let w = f16_bits_to_f32(lo as u16); + Vec128::from_f32x4(x, y, z, w) +} + +// ─── CR6 helpers used by integer compares ───────────────────────────────── +// vcmp*. (record-form) updates CR6 in a compressed form: +// CR6 = {all-true, 0, all-false, 0} +// where each bit reflects the per-lane mask across the whole register. + +#[inline] pub fn cr6_flags_from_mask(mask: Vec128) -> (bool, bool) { + let b = mask.as_bytes(); + let mut any_set = false; + let mut any_clear = false; + for &byte in b.iter() { + if byte != 0 { any_set = true; } + if byte != 0xFF { any_clear = true; } + } + let all_true = !any_clear; + let all_false = !any_set; + (all_true, all_false) +} + +#[cfg(test)] +mod tests { + use super::*; + + use std::cell::Cell; + + struct TestMem { data: Box<[Cell]> } + impl TestMem { + fn new(size: usize) -> Self { + Self { data: (0..size).map(|_| Cell::new(0)).collect() } + } + } + impl MemoryAccess for TestMem { + fn read_u8(&self, a: u32) -> u8 { self.data[a as usize].get() } + fn read_u16(&self, a: u32) -> u16 { + u16::from_be_bytes([self.data[a as usize].get(), self.data[a as usize + 1].get()]) + } + fn read_u32(&self, a: u32) -> u32 { + let a = a as usize; + u32::from_be_bytes([ + self.data[a].get(), self.data[a+1].get(), + self.data[a+2].get(), self.data[a+3].get(), + ]) + } + fn read_u64(&self, a: u32) -> u64 { + let a = a as usize; + u64::from_be_bytes([ + self.data[a].get(), self.data[a+1].get(), + self.data[a+2].get(), self.data[a+3].get(), + self.data[a+4].get(), self.data[a+5].get(), + self.data[a+6].get(), self.data[a+7].get(), + ]) + } + fn write_u8(&self, a: u32, v: u8) { self.data[a as usize].set(v); } + fn write_u16(&self, a: u32, v: u16) { + let b = v.to_be_bytes(); + self.data[a as usize].set(b[0]); + self.data[a as usize + 1].set(b[1]); + } + fn write_u32(&self, a: u32, v: u32) { + let b = v.to_be_bytes(); let a = a as usize; + for (i, byte) in b.iter().enumerate() { self.data[a+i].set(*byte); } + } + fn write_u64(&self, a: u32, v: u64) { + let b = v.to_be_bytes(); let a = a as usize; + for (i, byte) in b.iter().enumerate() { self.data[a+i].set(*byte); } + } + fn translate(&self, _a: u32) -> Option<*const u8> { None } + fn translate_mut(&self, _a: u32) -> Option<*mut u8> { None } + } + + #[test] + fn lvlx_lvrx_round_trip() { + let m = TestMem::new(0x40); + for i in 0..0x30 { m.data[i].set((i as u8).wrapping_add(0x10)); } + // Unaligned load from 0x13 should combine lvlx(0x13) | lvrx(0x23). + let lo = load_vector_left(&m, 0x13); + let hi = load_vector_right(&m, 0x23); + let mut combined = [0u8; 16]; + let lob = lo.as_bytes(); + let hib = hi.as_bytes(); + for i in 0..16 { combined[i] = lob[i] | hib[i]; } + for i in 0..16 { + assert_eq!(combined[i], m.data[0x13 + i].get(), "lane {}", i); + } + } + + #[test] + fn lvlx_aligned_is_full_load() { + let m = TestMem::new(0x20); + for i in 0..0x20 { m.data[i].set(i as u8); } + let v = load_vector_left(&m, 0x10); + let b = v.as_bytes(); + for i in 0..16 { assert_eq!(b[i], 0x10 + i as u8); } + } + + #[test] + fn lvrx_aligned_is_zero() { + let m = TestMem::new(0x20); + let v = load_vector_right(&m, 0x10); + assert_eq!(v.as_bytes(), [0u8; 16]); + } + + #[test] + fn sat_add_signed_overflow() { + assert_eq!(sat_add_i8(120, 10), (127, true)); + assert_eq!(sat_add_i8(-120, -10), (-128, true)); + assert_eq!(sat_add_i8(1, 2), (3, false)); + } + + #[test] + fn sat_sub_unsigned_underflow() { + assert_eq!(sat_sub_u8(5, 10), (0, true)); + assert_eq!(sat_sub_u8(10, 5), (5, false)); + } + + #[test] + fn pack_unpack_pixel_555() { + let encoded = pack_pixel_555(0x80_F8_F8_F8); + assert_eq!(encoded & 0x8000, 0x8000); + let w = unpack_pixel_555(0x8000 | (0x1F << 10) | (0x1F << 5) | 0x1F); + assert_eq!(w & 0xFF000000, 0xFF000000); + } + + // ─── First-Pixels M3 pack/unpack roundtrip tests ─── + + /// Quantization error tolerance for N-bit signed normalized values. + /// `1.0 / ((1 << (bits - 1)) - 1)` is the step size. + fn tol_normalized(bits: u32) -> f32 { + 1.0 / ((1u32 << (bits - 1)) - 1) as f32 + } + + #[test] + fn normshort2_roundtrip() { + let v = Vec128::from_f32x4(0.5, -0.75, 0.0, 0.0); + let packed = pack_normshort2(v); + let back = unpack_normshort2(packed).as_f32x4(); + let tol = tol_normalized(16); + assert!((back[0] - 0.5).abs() < tol, "x got {}", back[0]); + assert!((back[1] - -0.75).abs() < tol, "y got {}", back[1]); + assert_eq!(back[2], 0.0); + assert_eq!(back[3], 1.0); + } + + #[test] + fn normshort4_roundtrip_extremes() { + let v = Vec128::from_f32x4(1.0, -1.0, 0.0, 0.25); + let packed = pack_normshort4(v); + let back = unpack_normshort4(packed).as_f32x4(); + let tol = tol_normalized(16); + assert!((back[0] - 1.0).abs() < tol); + assert!((back[1] - -1.0).abs() < tol); + assert!((back[2] - 0.0).abs() < tol); + assert!((back[3] - 0.25).abs() < tol); + } + + #[test] + fn normpacked32_roundtrip() { + let v = Vec128::from_f32x4(0.5, -0.5, 0.9, -1.0); + let packed = pack_normpacked32(v); + let back = unpack_normpacked32(packed).as_f32x4(); + let tol10 = tol_normalized(10); + let tol2 = tol_normalized(2); + assert!((back[0] - 0.5).abs() < tol10, "x got {}", back[0]); + assert!((back[1] - -0.5).abs() < tol10, "y got {}", back[1]); + assert!((back[2] - 0.9).abs() < tol10, "z got {}", back[2]); + // 2-bit signed quantizes to {-1, -0.5-ish, 0, 0.5-ish}; tolerance + // is the full step. + assert!((back[3] - -1.0).abs() < 2.0 * tol2, "w got {}", back[3]); + } + + #[test] + fn normpacked64_roundtrip() { + let v = Vec128::from_f32x4(0.5, -0.25, 0.75, 0.5); + let packed = pack_normpacked64(v); + let back = unpack_normpacked64(packed).as_f32x4(); + let tol20 = tol_normalized(20); + let tol4 = tol_normalized(4); + assert!((back[0] - 0.5).abs() < tol20, "x got {}", back[0]); + assert!((back[1] - -0.25).abs() < tol20, "y got {}", back[1]); + assert!((back[2] - 0.75).abs() < tol20, "z got {}", back[2]); + assert!((back[3] - 0.5).abs() < tol4, "w got {}", back[3]); + } + + #[test] + fn float16_2_roundtrip_normals() { + // Half has ~3 decimal digits of precision. Pick values that + // survive conversion cleanly: powers of 2 + simple fractions. + let v = Vec128::from_f32x4(1.0, -2.5, 0.0, 0.0); + let packed = pack_float16_2(v); + let back = unpack_float16_2(packed).as_f32x4(); + assert_eq!(back[0], 1.0); + assert_eq!(back[1], -2.5); + assert_eq!(back[2], 0.0); + assert_eq!(back[3], 1.0); + } + + #[test] + fn float16_4_roundtrip_normals() { + let v = Vec128::from_f32x4(0.5, -3.0, 16.0, -0.125); + let packed = pack_float16_4(v); + let back = unpack_float16_4(packed).as_f32x4(); + assert_eq!(back[0], 0.5); + assert_eq!(back[1], -3.0); + assert_eq!(back[2], 16.0); + assert_eq!(back[3], -0.125); + } + + #[test] + fn float16_handles_zero_and_infinity() { + // Zero should survive. + assert_eq!(f16_bits_to_f32(f32_to_f16_bits(0.0)), 0.0); + assert_eq!(f16_bits_to_f32(f32_to_f16_bits(-0.0)).to_bits(), (-0.0f32).to_bits()); + // +inf. + let inf_back = f16_bits_to_f32(f32_to_f16_bits(f32::INFINITY)); + assert!(inf_back.is_infinite() && inf_back > 0.0); + // Overflow → +inf. + let overflow_back = f16_bits_to_f32(f32_to_f16_bits(65536.0)); + assert!(overflow_back.is_infinite()); + } + + #[test] + fn pack_type_enum_maps_canary_values() { + use D3dPackType::*; + assert!(matches!(D3dPackType::from_immediate(0), D3dColor)); + assert!(matches!(D3dPackType::from_immediate(1), NormShort2)); + assert!(matches!(D3dPackType::from_immediate(2), NormPacked32)); + assert!(matches!(D3dPackType::from_immediate(3), Float16_2)); + assert!(matches!(D3dPackType::from_immediate(4), NormShort4)); + assert!(matches!(D3dPackType::from_immediate(5), Float16_4)); + assert!(matches!(D3dPackType::from_immediate(6), NormPacked64)); + assert!(matches!(D3dPackType::from_immediate(7), Other(7))); + } +} diff --git a/crates/xenia-cpu/tests/disasm_goldens.rs b/crates/xenia-cpu/tests/disasm_goldens.rs new file mode 100644 index 0000000..6c39d54 --- /dev/null +++ b/crates/xenia-cpu/tests/disasm_goldens.rs @@ -0,0 +1,531 @@ +//! Assert-based goldens for the PPC disassembler. +//! +//! Each test owns an inline list of `(raw, addr, label)` cases. On a +//! normal run, the test reads the corresponding fixture JSON and asserts +//! that `format(decode(raw, addr))` reproduces every field exactly. On +//! first creation (fixture file missing) or with `REGEN_GOLDENS=1` set, +//! the test (re)writes the fixture from `format()` output. +//! +//! Workflow: +//! ```sh +//! cargo test -p xenia-cpu --test disasm_goldens # assert +//! REGEN_GOLDENS=1 cargo test -p xenia-cpu --test disasm_goldens # regen +//! ``` +//! +//! The hand-encoded test cases below cover the silent-bug regression +//! cases that lived in the old println-based `disasm_audit.rs` harness +//! (now deleted). + +use std::path::PathBuf; + +use serde::{Deserialize, Serialize}; + +use xenia_cpu::decoder::decode; +use xenia_cpu::disasm::format; + +#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize)] +struct GoldenRow { + label: String, + raw: String, + addr: String, + mnemonic: String, + operands: String, + #[serde(default, skip_serializing_if = "Option::is_none")] + ext_mnemonic: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + ext_operands: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + branch_target: Option, +} + +#[derive(Debug, Deserialize, Serialize)] +struct GoldenFile { + rows: Vec, +} + +fn fixture_path(name: &str) -> PathBuf { + PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("tests") + .join("golden") + .join(name) +} + +fn build_rows(cases: &[(u32, u32, &str)]) -> Vec { + cases + .iter() + .map(|&(raw, addr, label)| { + let d = decode(raw, addr); + let t = format(&d); + GoldenRow { + label: label.to_string(), + raw: format!("0x{raw:08X}"), + addr: format!("0x{addr:08X}"), + mnemonic: t.mnemonic, + operands: t.operands, + ext_mnemonic: t.ext_mnemonic, + ext_operands: t.ext_operands, + branch_target: t.branch_target.map(|t| format!("0x{t:08X}")), + } + }) + .collect() +} + +/// Compare what `format()` produces against the committed JSON snapshot. +/// Set `REGEN_GOLDENS=1` to overwrite the snapshot from current output. +/// Missing snapshot is treated as "first creation": writes and panics so +/// CI can't accidentally accept blank goldens. +fn assert_or_regen(fixture_name: &str, cases: &[(u32, u32, &str)]) { + let rows = build_rows(cases); + let path = fixture_path(fixture_name); + let regen = std::env::var("REGEN_GOLDENS").is_ok(); + + if regen || !path.exists() { + if let Some(parent) = path.parent() { + std::fs::create_dir_all(parent).unwrap(); + } + let serialized = serde_json::to_string_pretty(&GoldenFile { rows }).unwrap(); + std::fs::write(&path, serialized + "\n").unwrap(); + if !regen { + panic!( + "Generated fixture {} (was missing). Inspect, commit, then re-run.", + path.display() + ); + } + return; + } + + let src = std::fs::read_to_string(&path).unwrap(); + let golden: GoldenFile = serde_json::from_str(&src).unwrap(); + assert_eq!( + rows.len(), + golden.rows.len(), + "row count differs from {} (live={}, fixture={}). Run with REGEN_GOLDENS=1 if the test cases changed intentionally.", + path.display(), + rows.len(), + golden.rows.len() + ); + for (i, (got, expected)) in rows.iter().zip(golden.rows.iter()).enumerate() { + assert_eq!( + got, expected, + "row {} ({}) differs in {}\n live: {got:#?}\n fixture: {expected:#?}", + i, + expected.label, + path.display() + ); + } +} + +// ── Encoding helpers ──────────────────────────────────────────────────────── +// PPC bit numbering: bit 0 is MSB, bit 31 is LSB. Most helpers below emit +// instructions in canonical hand-readable form: opcode << 26 | . + +#[allow(clippy::too_many_arguments)] +fn xform_xo3(rd: u32, ra: u32, rb: u32, oe: u32, xo: u32, rc: u32) -> u32 { + (31 << 26) | (rd << 21) | (ra << 16) | (rb << 11) | (oe << 10) | (xo << 1) | rc +} + +fn xform_logic(rs: u32, ra: u32, rb: u32, xo: u32, rc: u32) -> u32 { + (31 << 26) | (rs << 21) | (ra << 16) | (rb << 11) | (xo << 1) | rc +} + +fn dform(op: u32, rt: u32, ra: u32, imm: i16) -> u32 { + (op << 26) | (rt << 21) | (ra << 16) | ((imm as u16) as u32) +} + +fn iform_b(target_disp: i32, aa: u32, lk: u32) -> u32 { + // I-form: opcode 18 | LI<<2 | AA<<1 | LK + let li = (target_disp as u32) & 0x03FF_FFFC; + (18 << 26) | li | (aa << 1) | lk +} + +fn bform_bc(bo: u32, bi: u32, target_disp: i32, aa: u32, lk: u32) -> u32 { + // B-form: opcode 16 | BO<<21 | BI<<16 | BD<<2 | AA<<1 | LK + let bd = (target_disp as u32) & 0x0000_FFFC; + (16 << 26) | (bo << 21) | (bi << 16) | bd | (aa << 1) | lk +} + +fn xlform_bclr(bo: u32, bi: u32, lk: u32) -> u32 { + // XL-form: opcode 19 | BO<<21 | BI<<16 | XO=16<<1 | LK + (19 << 26) | (bo << 21) | (bi << 16) | (16 << 1) | lk +} + +fn xlform_bcctr(bo: u32, bi: u32, lk: u32) -> u32 { + (19 << 26) | (bo << 21) | (bi << 16) | (528 << 1) | lk +} + +fn rlwinm(rs: u32, ra: u32, sh: u32, mb: u32, me: u32, rc: u32) -> u32 { + (21 << 26) | (rs << 21) | (ra << 16) | (sh << 11) | (mb << 6) | (me << 1) | rc +} + +fn rldicl(rs: u32, ra: u32, sh: u32, mb: u32, rc: u32) -> u32 { + // MD-form, op30 xo=0. sh split: bits 16-20 (high 5) + bit 30 (low bit). + // mb split: bits 21-25 (low 5) + bit 26 (high bit). + let sh_hi = (sh >> 1) & 0x1F; + let sh_lo = sh & 1; + let mb_lo = mb & 0x1F; + let mb_hi = (mb >> 5) & 1; + (30 << 26) + | (rs << 21) + | (ra << 16) + | (sh_hi << 11) + | (mb_lo << 6) + | (mb_hi << 5) + | (0 << 2) + | (sh_lo << 1) + | rc +} + +fn mfspr(rd: u32, spr: u32) -> u32 { + let spr_swapped = ((spr & 0x1F) << 5) | ((spr >> 5) & 0x1F); + (31 << 26) | (rd << 21) | (spr_swapped << 11) | (339 << 1) +} + +fn mtspr(rs: u32, spr: u32) -> u32 { + let spr_swapped = ((spr & 0x1F) << 5) | ((spr >> 5) & 0x1F); + (31 << 26) | (rs << 21) | (spr_swapped << 11) | (467 << 1) +} + +// ── Tests ─────────────────────────────────────────────────────────────────── + +#[test] +fn base_mnemonics() { + let cases: &[(u32, u32, &str)] = &[ + // X-form ALU (Rc and OE bits) + (xform_xo3(3, 4, 5, 0, 266, 0), 0x82000000, "add r3,r4,r5"), + (xform_xo3(3, 4, 5, 0, 266, 1), 0x82000000, "add. r3,r4,r5"), + (xform_xo3(3, 4, 5, 1, 266, 0), 0x82000000, "addo r3,r4,r5"), + (xform_xo3(3, 4, 5, 1, 266, 1), 0x82000000, "addo. r3,r4,r5"), + (xform_xo3(3, 4, 0, 0, 104, 0), 0x82000000, "neg r3,r4"), + (xform_xo3(3, 4, 5, 0, 235, 0), 0x82000000, "mullw r3,r4,r5"), + (xform_xo3(3, 4, 5, 0, 491, 0), 0x82000000, "divw r3,r4,r5"), + (xform_xo3(3, 4, 5, 0, 75, 1), 0x82000000, "mulhw. r3,r4,r5"), + (xform_xo3(3, 4, 5, 0, 11, 1), 0x82000000, "mulhwu. r3,r4,r5"), + (xform_xo3(3, 4, 5, 0, 233, 0), 0x82000000, "mulld r3,r4,r5"), + // X-form logical + (xform_logic(4, 3, 5, 28, 0), 0x82000000, "and r3,r4,r5"), + (xform_logic(4, 3, 5, 444, 0), 0x82000000, "or r3,r4,r5 (non-mr: rs!=rb)"), + (xform_logic(4, 3, 5, 316, 0), 0x82000000, "xor r3,r4,r5"), + (xform_logic(4, 3, 5, 124, 0), 0x82000000, "nor r3,r4,r5"), + (xform_logic(4, 3, 5, 476, 0), 0x82000000, "nand r3,r4,r5"), + (xform_logic(4, 3, 5, 284, 0), 0x82000000, "eqv r3,r4,r5"), + (xform_logic(4, 3, 5, 60, 0), 0x82000000, "andc r3,r4,r5"), + (xform_logic(4, 3, 5, 412, 0), 0x82000000, "orc r3,r4,r5"), + // X-form shift + (xform_logic(4, 3, 5, 24, 0), 0x82000000, "slw r3,r4,r5"), + (xform_logic(4, 3, 5, 536, 0), 0x82000000, "srw r3,r4,r5"), + (xform_logic(4, 3, 5, 792, 0), 0x82000000, "sraw r3,r4,r5"), + (xform_logic(4, 3, 5, 27, 0), 0x82000000, "sld r3,r4,r5"), + (xform_logic(4, 3, 5, 539, 0), 0x82000000, "srd r3,r4,r5"), + // srawi / sradi (immediate shifts) + ((31 << 26) | (4 << 21) | (3 << 16) | (16 << 11) | (824 << 1), 0x82000000, "srawi r3,r4,16"), + // Atomics + ((31 << 26) | (3 << 21) | (4 << 16) | (5 << 11) | (150 << 1) | 1, 0x82000000, "stwcx. r3,r4,r5"), + ((31 << 26) | (3 << 21) | (4 << 16) | (5 << 11) | (214 << 1) | 1, 0x82000000, "stdcx. r3,r4,r5"), + ((31 << 26) | (3 << 21) | (4 << 16) | (5 << 11) | (20 << 1), 0x82000000, "lwarx r3,r4,r5"), + ((31 << 26) | (3 << 21) | (4 << 16) | (5 << 11) | (84 << 1), 0x82000000, "ldarx r3,r4,r5"), + // Compares + (dform(11, 0, 3, 16), 0x82000000, "cmpwi cr0, r3, 16"), + (dform(11, 2 << 2, 3, 16), 0x82000000, "cmpwi cr2, r3, 16"), + (dform(10, 0, 3, 16), 0x82000000, "cmplwi cr0, r3, 16"), + ((31 << 26) | (3 << 16) | (4 << 11), 0x82000000, "cmpw r3,r4 in cr0"), + ((31 << 26) | (1 << 21) | (3 << 16) | (4 << 11), 0x82000000, "cmpd r3,r4"), + ((31 << 26) | (3 << 16) | (4 << 11) | (32 << 1), 0x82000000, "cmplw r3,r4"), + // D-form ALU/load/store + (dform(14, 3, 1, 16), 0x82000000, "addi r3, r1, 16"), + (dform(15, 3, 1, 0x100), 0x82000000, "addis r3, r1, 0x100 (ra!=0)"), + (dform(7, 3, 4, 5), 0x82000000, "mulli r3, r4, 5"), + (dform(8, 3, 4, 5), 0x82000000, "subfic r3, r4, 5"), + (dform(12, 3, 4, 16), 0x82000000, "addic r3, r4, 16"), + (dform(13, 3, 4, 16), 0x82000000, "addic. r3, r4, 16"), + (dform(24, 3, 4, 0x10), 0x82000000, "ori r4, r3, 0x10 (non-nop)"), + (dform(25, 3, 4, 0x10), 0x82000000, "oris r4, r3, 0x10"), + (dform(26, 3, 4, 0x10), 0x82000000, "xori r4, r3, 0x10"), + (dform(28, 3, 4, 0x10), 0x82000000, "andi. r4, r3, 0x10"), + // Loads/stores D-form + (dform(32, 5, 1, 0x20), 0x82000000, "lwz r5, 0x20(r1)"), + (dform(36, 5, 1, 0x20), 0x82000000, "stw r5, 0x20(r1)"), + (dform(34, 5, 1, 0x20), 0x82000000, "lbz r5, 0x20(r1)"), + (dform(40, 5, 1, 0x20), 0x82000000, "lhz r5, 0x20(r1)"), + (dform(48, 5, 1, 0x20), 0x82000000, "lfs f5, 0x20(r1)"), + (dform(50, 5, 1, 0x20), 0x82000000, "lfd f5, 0x20(r1)"), + (dform(54, 5, 1, 0x20), 0x82000000, "stfd f5, 0x20(r1)"), + // DS-form 64-bit loads + ((58u32 << 26) | (5 << 21) | (1 << 16) | 0x20, 0x82000000, "ld r5, 0x20(r1)"), + ((62u32 << 26) | (5 << 21) | (1 << 16) | 0x20, 0x82000000, "std r5, 0x20(r1)"), + // Sync / barrier (parameterless) + ((31 << 26) | (598 << 1), 0x82000000, "sync 0 (extends to sync)"), + ((19 << 26) | (150 << 1), 0x82000000, "isync"), + ((31 << 26) | (854 << 1), 0x82000000, "eieio"), + // Cache hints + ((31 << 26) | (1 << 16) | (2 << 11) | (54 << 1), 0x82000000, "dcbst r1, r2"), + ((31 << 26) | (1 << 16) | (2 << 11) | (86 << 1), 0x82000000, "dcbf r1, r2"), + ((31 << 26) | (1 << 16) | (2 << 11) | (278 << 1), 0x82000000, "dcbt r1, r2"), + ((31 << 26) | (1 << 16) | (2 << 11) | (1014 << 1), 0x82000000, "dcbz r1, r2"), + ((31 << 26) | (1 << 21) | (1 << 16) | (2 << 11) | (1014 << 1), 0x82000000, "dcbz128 r1, r2"), + // CR logical (without simplification triggers) + ((19 << 26) | (4 << 21) | (5 << 16) | (6 << 11) | (33 << 1), 0x82000000, "crnor 4,5,6 (no simplify)"), + ((19 << 26) | (4 << 21) | (5 << 16) | (6 << 11) | (257 << 1), 0x82000000, "crand 4,5,6"), + ((19 << 26) | (4 << 21) | (5 << 16) | (6 << 11) | (449 << 1), 0x82000000, "cror 4,5,6 (no simplify)"), + // Trap (no simplification: TO=11 doesn't match the table) + ((31 << 26) | (11 << 21) | (3 << 16) | (4 << 11) | (4 << 1), 0x82000000, "tw 11, r3, r4 (uncommon TO)"), + ((2u32 << 26) | (11 << 21) | (3 << 16) | (123u32 & 0xFFFF), 0x82000000, "tdi 11, r3, 123"), + // mtcr (extended): mtcrf 0xFF, r5 + ((31 << 26) | (5 << 21) | (0xFF << 12) | (144 << 1), 0x82000000, "mtcrf 0xFF, r5 → mtcr"), + // mfcr / mfmsr / mtmsr / mtmsrd + ((31 << 26) | (5 << 21) | (19 << 1), 0x82000000, "mfcr r5"), + ((31 << 26) | (5 << 21) | (83 << 1), 0x82000000, "mfmsr r5"), + ((31 << 26) | (5 << 21) | (146 << 1), 0x82000000, "mtmsr r5"), + ((31 << 26) | (5 << 21) | (178 << 1), 0x82000000, "mtmsrd r5"), + // FPU base + ((63u32 << 26) | (3 << 21) | (4 << 16) | (5 << 11) | (21 << 1), 0x82000000, "fadd f3, f4, f5"), + ((63u32 << 26) | (3 << 21) | (4 << 16) | (5 << 11) | (20 << 1), 0x82000000, "fsub f3, f4, f5"), + ((63u32 << 26) | (3 << 21) | (4 << 16) | (5 << 11) | (18 << 1), 0x82000000, "fdiv f3, f4, f5"), + ((63u32 << 26) | (3 << 21) | (5 << 21) | (5 << 11) | (25 << 1), 0x82000000, "fmul f3, f0, f5 (encoded)"), + ((63u32 << 26) | (3 << 21) | (4 << 16) | (40 << 1), 0x82000000, "fneg f3, f4"), + ((63u32 << 26) | (3 << 21) | (4 << 16) | (72 << 1), 0x82000000, "fmr f3, f4"), + // mtfsf — XFL form (Fix 1). FM at LSB bits 17-24 (PPC bits 7-14). + // Encoding: opcode 63 | FM<<17 | frB<<11 | XO=711<<1 | Rc. + ((63u32 << 26) | (0xFF << 17) | (5 << 11) | (711 << 1), 0x82000000, "mtfsf 0xFF, f5 (Rc=0)"), + ((63u32 << 26) | (0xFF << 17) | (5 << 11) | (711 << 1) | 1, 0x82000000, "mtfsf. 0xFF, f5 (Rc=1)"), + ]; + assert_or_regen("base_mnemonics.json", cases); +} + +#[test] +fn extended_mnemonics() { + let cases: &[(u32, u32, &str)] = &[ + // ori r0, r0, 0 → nop + (dform(24, 0, 0, 0), 0x82000000, "nop"), + // addi r3, r0, imm → li + (dform(14, 3, 0, 16), 0x82000000, "li r3, 16"), + (dform(14, 3, 0, -1), 0x82000000, "li r3, -1"), + // addi r3, r4, neg → subi + (dform(14, 3, 4, -16), 0x82000000, "subi r3, r4, 16"), + // addis r3, r0, imm → lis + (dform(15, 3, 0, 0x1234), 0x82000000, "lis r3, 0x1234"), + // addis r3, r4, neg → subis + (dform(15, 3, 4, -1), 0x82000000, "subis r3, r4, 0xFFFF"), + // or rA, rS, rS → mr + (xform_logic(4, 3, 4, 444, 0), 0x82000000, "mr r3, r4"), + (xform_logic(4, 3, 4, 444, 1), 0x82000000, "mr. r3, r4"), + // and rA, rS, rS → mr (also) + (xform_logic(4, 3, 4, 28, 0), 0x82000000, "mr (via and)"), + // nor rA, rS, rS → not + (xform_logic(4, 3, 4, 124, 0), 0x82000000, "not r3, r4"), + // subf → sub (operand swap) + (xform_xo3(3, 4, 5, 0, 40, 0), 0x82000000, "subf → sub r3, r5, r4"), + // rlwinm simplifications + (rlwinm(4, 3, 4, 0, 31 - 4, 0), 0x82000000, "slwi r3, r4, 4"), + (rlwinm(4, 3, 32 - 4, 4, 31, 0), 0x82000000, "srwi r3, r4, 4"), + (rlwinm(4, 3, 8, 0, 31, 0), 0x82000000, "rotlwi r3, r4, 8"), + (rlwinm(4, 3, 0, 4, 31, 0), 0x82000000, "clrlwi r3, r4, 4"), + (rlwinm(4, 3, 0, 0, 27, 0), 0x82000000, "clrrwi r3, r4, 4"), + (rlwinm(4, 3, 8, 0, 7, 0), 0x82000000, "extlwi r3, r4, 8, 8"), + // rlwinm with Rc + (rlwinm(4, 3, 4, 0, 31 - 4, 1), 0x82000000, "slwi. r3, r4, 4"), + // rlwinm Sylpheed regression + (rlwinm(11, 11, 0, 31, 31, 1), 0x82000000, "rlwinm. r11,r11,0,31,31 (no simplify)"), + // rldicl simplifications + (rldicl(4, 3, 0, 32, 0), 0x82000000, "clrldi r3, r4, 32"), + (rldicl(4, 3, 64u32 - 8, 8, 0), 0x82000000, "srdi r3, r4, 8"), + (rldicl(4, 3, 8, 0, 0), 0x82000000, "rotldi r3, r4, 8"), + // cmpi / cmpli → cmpwi/cmpdi/cmplwi/cmpldi + (dform(11, 0, 3, 16), 0x82000000, "cmpwi cr0, r3, 16"), + (dform(11, (1 << 21) | (2 << 23), 3, 16) | (1 << 21), 0x82000000, "cmpdi (L=1) variant"), + // bclr 20, 0 → blr + (xlform_bclr(20, 0, 0), 0x82000000, "blr"), + (xlform_bclr(20, 0, 1), 0x82000000, "blrl"), + // bcctr 20, 0 → bctr + (xlform_bcctr(20, 0, 0), 0x82000000, "bctr"), + (xlform_bcctr(20, 0, 1), 0x82000000, "bctrl"), + // bclr conditional + (xlform_bclr(12, 2, 0), 0x82000000, "beqlr (BO=12, BI=2 → cr0.eq true)"), + (xlform_bclr(4, 2, 0), 0x82000000, "bnelr"), + // bc with full BO/BI: branch always (BO=20) + (bform_bc(20, 0, 0x40, 0, 0), 0x82000000, "bc → b 0x82000040"), + (bform_bc(20, 0, 0x40, 0, 1), 0x82000000, "bc l → bl 0x82000040"), + // Conditional bc → beq/bne/etc + (bform_bc(12, 2, 0x40, 0, 0), 0x82000000, "bc 12,cr0.eq → beq 0x82000040"), + (bform_bc(4, 2, 0x40, 0, 0), 0x82000000, "bc 4,cr0.eq → bne 0x82000040"), + (bform_bc(12, 0, 0x40, 0, 0), 0x82000000, "bc 12,cr0.lt → blt 0x82000040"), + (bform_bc(4, 0, 0x40, 0, 0), 0x82000000, "bc 4,cr0.lt → bge 0x82000040"), + (bform_bc(12, 1, 0x40, 0, 0), 0x82000000, "bc 12,cr0.gt → bgt 0x82000040"), + (bform_bc(4, 1, 0x40, 0, 0), 0x82000000, "bc 4,cr0.gt → ble 0x82000040"), + // Conditional with non-zero CR field + (bform_bc(12, 2 + 8, 0x40, 0, 0), 0x82000000, "bc 12, cr2.eq → beq cr2, 0x...040"), + // bdnz / bdz (decrement-CTR branches) + (bform_bc(16, 0, 0x40, 0, 0), 0x82000000, "bdnz 0x82000040"), + (bform_bc(18, 0, 0x40, 0, 0), 0x82000000, "bdz 0x82000040"), + // I-form branches + (iform_b(0x40, 0, 0), 0x82000000, "b +0x40 → 0x82000040"), + (iform_b(0x40, 0, 1), 0x82000000, "bl +0x40 → 0x82000040"), + (iform_b(0x40, 1, 0), 0x82000000, "ba 0x40 absolute"), + (iform_b(0x40, 1, 1), 0x82000000, "bla 0x40 absolute"), + // Trap immediate simplifications + ((2u32 << 26) | (4 << 21) | (3 << 16) | (123u32 & 0xFFFF), 0x82000000, "tdeqi r3, 123"), + ((3u32 << 26) | (16 << 21) | (3 << 16) | (123u32 & 0xFFFF), 0x82000000, "twlti r3, 123"), + // mfspr → mflr / mfctr / mfxer + (mfspr(3, 8), 0x82000000, "mflr r3"), + (mfspr(3, 9), 0x82000000, "mfctr r3"), + (mfspr(3, 1), 0x82000000, "mfxer r3"), + // mtspr → mtlr / mtctr / mtxer + (mtspr(3, 8), 0x82000000, "mtlr r3"), + (mtspr(3, 9), 0x82000000, "mtctr r3"), + (mtspr(3, 1), 0x82000000, "mtxer r3"), + // crnor with same source bits → crnot + ((19 << 26) | (4 << 21) | (5 << 16) | (5 << 11) | (33 << 1), 0x82000000, "crnot 4, 5"), + // crxor with all same → crclr + ((19 << 26) | (4 << 21) | (4 << 16) | (4 << 11) | (193 << 1), 0x82000000, "crclr 4"), + // creqv with all same → crset + ((19 << 26) | (4 << 21) | (4 << 16) | (4 << 11) | (289 << 1), 0x82000000, "crset 4"), + // cror with same source bits → crmove + ((19 << 26) | (4 << 21) | (5 << 16) | (5 << 11) | (449 << 1), 0x82000000, "crmove 4, 5"), + // sync L=1 → lwsync + ((31 << 26) | (1 << 21) | (598 << 1), 0x82000000, "lwsync"), + // tw 31, 0, 0 → trap + ((31 << 26) | (31 << 21) | (4 << 1), 0x82000000, "trap"), + // Fix 2: bclr/bcctr with BO=20 and BI≠0 still emits blr/bctr ext. + // BO=20 ignores both CTR test and CR test, so BI is don't-care. + (xlform_bclr(20, 4, 0), 0x82000000, "blr (BO=20, BI=4 — BI is don't-care)"), + (xlform_bclr(20, 7, 1), 0x82000000, "blrl (BO=20, BI=7)"), + (xlform_bcctr(20, 4, 0), 0x82000000, "bctr (BO=20, BI=4)"), + // Fix 3: trap unsigned simplified mnemonics (TO=1, 2, 5, 6 — logical + // compare conditions). Register form (tw/td) and immediate (twi/tdi). + ((31u32 << 26) | (2 << 21) | (3 << 16) | (4 << 11) | (4 << 1), 0x82000000, "twllt r3, r4 (TO=2)"), + ((31u32 << 26) | (1 << 21) | (3 << 16) | (4 << 11) | (4 << 1), 0x82000000, "twlgt r3, r4 (TO=1)"), + ((31u32 << 26) | (5 << 21) | (3 << 16) | (4 << 11) | (68 << 1), 0x82000000, "tdlge r3, r4 (TO=5)"), + ((31u32 << 26) | (6 << 21) | (3 << 16) | (4 << 11) | (4 << 1), 0x82000000, "twlle r3, r4 (TO=6)"), + ((3u32 << 26) | (2 << 21) | (3 << 16) | (16u32 & 0xFFFF), 0x82000000, "twllti r3, 16"), + ((2u32 << 26) | (5 << 21) | (3 << 16) | (16u32 & 0xFFFF), 0x82000000, "tdlgei r3, 16"), + ]; + assert_or_regen("extended_mnemonics.json", cases); +} + +#[test] +fn vmx128_registers() { + // Standard VMX (op=4) — 5-bit registers v0..v31. Verifies that the + // low-register path renders correctly through the new formatter. + let std_vmx = [ + // vaddubm v3, v4, v5 : op=4, 3-op key=0 + ((4u32 << 26) | (3 << 21) | (4 << 16) | (5 << 11) | 0, 0x82000000, "vaddubm v3, v4, v5"), + // vaddfp v3, v4, v5 : op=4, vx=10 + ((4u32 << 26) | (3 << 21) | (4 << 16) | (5 << 11) | 10, 0x82000000, "vaddfp v3, v4, v5"), + // vand v3, v4, v5 : vx=1028 + ((4u32 << 26) | (3 << 21) | (4 << 16) | (5 << 11) | 1028, 0x82000000, "vand v3, v4, v5"), + // vor v3, v4, v5 : vx=1156 + ((4u32 << 26) | (3 << 21) | (4 << 16) | (5 << 11) | 1156, 0x82000000, "vor v3, v4, v5"), + // vxor v3, v4, v5 : vx=1220 + ((4u32 << 26) | (3 << 21) | (4 << 16) | (5 << 11) | 1220, 0x82000000, "vxor v3, v4, v5"), + // vsel v3, v4, v5, v6 : op=4, va_key=42 (4-op) + ((4u32 << 26) | (3 << 21) | (4 << 16) | (5 << 11) | (6 << 6) | 42, 0x82000000, "vsel v3,v4,v5,v6"), + // vperm v3, v4, v5, v6 : va_key=43 + ((4u32 << 26) | (3 << 21) | (4 << 16) | (5 << 11) | (6 << 6) | 43, 0x82000000, "vperm v3,v4,v5,v6"), + // vmaddfp v3, v4, v5, v6 : va_key=46 (operand swap: vd, va, vc, vb) + ((4u32 << 26) | (3 << 21) | (4 << 16) | (5 << 11) | (6 << 6) | 46, 0x82000000, "vmaddfp v3, v4, v6, v5 (swap)"), + // mfvscr v3 : vx=1540 + ((4u32 << 26) | (3 << 21) | 1540, 0x82000000, "mfvscr v3"), + // mtvscr v5 : vx=1604, vb=v5 + ((4u32 << 26) | (5 << 11) | 1604, 0x82000000, "mtvscr v5"), + ]; + + // VMX128 op=5 — uses vd128/va128/vb128 (7-bit registers, high bits at + // 21+22). These are the silent-bug-area encodings; we exercise low + // register indices here because the secondary-opcode key for op=5 + // includes bits 21-22, constraining vd128 high bits to 0 in this form. + // High-index examples for vd128 live in the op=6 series below. + let vmx128_op5 = [ + // vaddfp128 v3, v4, v5 : op=5, key2=0b000001 + ((5u32 << 26) | (3 << 21) | (4 << 16) | (5 << 11) | (0 << 6) | (1 << 0), 0x82000000, "vaddfp128 (encoded sloppily)"), + ]; + + // VMX128 op=6 — vrlimi128 has secondary key in bits 23-25 + 26-27, so + // bits 21-22 ARE the high bits of vd128 (canonical silent-bug-area). + // These instructions exercise vd128 = 32, 64, 96 — covering the bit-21 + // and bit-22 split that ppc.rs's old extractor (now deleted) miscoded. + let vrlimi128 = |vd: u32, vb: u32, imm: u32, z: u32| -> u32 { + // op=6, vd128 = bits 6-10 + bit 21 + bit 22, vb128 = bits 16-20 + bits 30+31, + // IMM = bits 11-15, Z = bits 24-25, key2 = (bits 23-25 << 4) | bits 26-27 = 0b1110001 + let vd_lo = vd & 0x1F; + let vd_b21 = (vd >> 5) & 1; + let vd_b22 = (vd >> 6) & 1; + let vb_lo = vb & 0x1F; + let vb_b30 = (vb >> 5) & 1; + let vb_b31 = (vb >> 6) & 1; + // bits 23-25 = 111, bits 26-27 = 00, bit 27 = 1 → key2 lower 4 bits = 0001 + // Encoded: bits 23-25 = 111, bits 26-27 = 00 are actually overlapping with z field (bits 24-25) + // The plan view: (bits 23 << 6) | (bits 24-25 << 4) | (bits 26-27 << 2) but the table uses different. + // Easiest: hand-encode known bit pattern matching decoder.rs's match: + // key2 = (extract_bits(code, 23, 25) << 4) | extract_bits(code, 26, 27) = 0b1110001 + // bits 23-25 = 111, bits 26-27 = 01 + // Bit positions 23-27 = 11101 (5 bits, MSB at 23). + // PPC bit 23 (LSB index 8): set + // PPC bit 24 (LSB index 7): set -- this is z bit 0 + // PPC bit 25 (LSB index 6): set -- this is z bit 1 + // PPC bit 26 (LSB index 5): unset + // PPC bit 27 (LSB index 4): set + // We let z = bits 24-25 stored with vd128 bits at 21-22. + // To preserve key2 = 0b1110001, we need bits 24-25 = 11, bit 26 = 0, bit 27 = 1. + // BUT bits 24-25 ARE the z field; if we set them = 11 the z value is 3. + // So Z is constrained for vrlimi128. Choose Z = 3 (matches Sylpheed examples). + let z3 = z & 0x3; + (6u32 << 26) + | (vd_lo << 21) + | (imm << 16) + | (vb_lo << 11) + | (vd_b21 << 10) // bit 21 (LSB pos 10) + | (vd_b22 << 9) // bit 22 (LSB pos 9) + | (1 << 8) // bit 23 + | (z3 << 6) // bits 24-25 + | (0 << 5) // bit 26 + | (1 << 4) // bit 27 + | (vb_b30 << 1) // bit 30 + | vb_b31 // bit 31 + }; + // Note: VMX128 op6 secondary keys constrain bits 21-23. For + // vrlimi128 (key2 = 0b1110001 over bits 21-23 + 26-27) the only + // valid vd128 range is 96..=127 — lower values change the secondary + // key into some other instruction. The cases below record what the + // disassembler emits for the borderline encodings, so a regression + // in either the lookup table or the formatter would surface here. + let vmx128_high = [ + // bits 21-22 = 00 → key2 ≠ vrlimi128 → decodes to vsrw128 (key5 + // branch). Locks current behavior; shows the silent-bug-area + // encoding constraint. + (vrlimi128(0, 12, 4, 3), 0x82000000, "encoding vd_hi=00: actually vsrw128"), + // bits 21-22 = 10 → still not vrlimi128. + (vrlimi128(32, 12, 4, 3), 0x82000000, "encoding vd_hi=10: actually vsrw128 v32"), + // bits 21-22 = 01 → key1 matches vpermwi128. + (vrlimi128(64, 12, 4, 3), 0x82000000, "encoding vd_hi=01: actually vpermwi128"), + // bits 21-22 = 11 → key2 matches vrlimi128 with vd128=96. + (vrlimi128(96, 12, 4, 3), 0x82000000, "vrlimi128 v96, v12, 4, 3 (real)"), + (vrlimi128(127, 127, 4, 3), 0x82000000, "vrlimi128 v127, v127, 4, 3 (real)"), + ]; + + // Fix 4: VMX128 multiply-add 4-operand layouts. Per canary, the addend + // is the VD register re-used; operand order differs between the three + // mnemonics. Encodings hand-built to satisfy decode_op5's key2 secondary + // opcode (vmaddfp128=0b001101, vmaddcfp128=0b010001, vnmsubfp128=0b010101) + // with bit 22=0 (forced by key2's high nibble) so vd128 high bit 1 = 0. + // vd128 low = 3 (bits 6-10); va128 = 3 | (bit29<<5) = 35; vb128 = 5. + // Distinct VD vs VA verifies the layout isn't trivially aliasing VD. + // + // layout (canary): + // vmaddfp128 VD, VA, VB, VD → "v3, v35, v5, v3" + // vmaddcfp128 VD, VA, VD, VB → "v3, v35, v3, v5" + // vnmsubfp128 VD, VA, VD, VB → "v3, v35, v3, v5" + let vmx128_4op = [ + // vmaddfp128: bits 24=1, 25=1, 27=1, bit 29=1 (VA high), VB=5 + (0x146028D4u32, 0x82000000, "vmaddfp128 v3, v35, v5, v3"), + // vmaddcfp128: bits 23=1, 27=1, bit 29=1, VB=5 + (0x14602914u32, 0x82000000, "vmaddcfp128 v3, v35, v3, v5"), + // vnmsubfp128: bits 23=1, 25=1, 27=1, bit 29=1, VB=5 + (0x14602954u32, 0x82000000, "vnmsubfp128 v3, v35, v3, v5"), + ]; + + let mut all = Vec::new(); + all.extend_from_slice(&std_vmx); + all.extend_from_slice(&vmx128_op5); + all.extend_from_slice(&vmx128_high); + all.extend_from_slice(&vmx128_4op); + assert_or_regen("vmx128_registers.json", &all); +} diff --git a/crates/xenia-cpu/tests/golden/base_mnemonics.json b/crates/xenia-cpu/tests/golden/base_mnemonics.json new file mode 100644 index 0000000..b0de088 --- /dev/null +++ b/crates/xenia-cpu/tests/golden/base_mnemonics.json @@ -0,0 +1,571 @@ +{ + "rows": [ + { + "label": "add r3,r4,r5", + "raw": "0x7C642A14", + "addr": "0x82000000", + "mnemonic": "add", + "operands": "r3, r4, r5" + }, + { + "label": "add. r3,r4,r5", + "raw": "0x7C642A15", + "addr": "0x82000000", + "mnemonic": "add.", + "operands": "r3, r4, r5" + }, + { + "label": "addo r3,r4,r5", + "raw": "0x7C642E14", + "addr": "0x82000000", + "mnemonic": "addo", + "operands": "r3, r4, r5" + }, + { + "label": "addo. r3,r4,r5", + "raw": "0x7C642E15", + "addr": "0x82000000", + "mnemonic": "addo.", + "operands": "r3, r4, r5" + }, + { + "label": "neg r3,r4", + "raw": "0x7C6400D0", + "addr": "0x82000000", + "mnemonic": "neg", + "operands": "r3, r4" + }, + { + "label": "mullw r3,r4,r5", + "raw": "0x7C6429D6", + "addr": "0x82000000", + "mnemonic": "mullw", + "operands": "r3, r4, r5" + }, + { + "label": "divw r3,r4,r5", + "raw": "0x7C642BD6", + "addr": "0x82000000", + "mnemonic": "divw", + "operands": "r3, r4, r5" + }, + { + "label": "mulhw. r3,r4,r5", + "raw": "0x7C642897", + "addr": "0x82000000", + "mnemonic": "mulhw.", + "operands": "r3, r4, r5" + }, + { + "label": "mulhwu. r3,r4,r5", + "raw": "0x7C642817", + "addr": "0x82000000", + "mnemonic": "mulhwu.", + "operands": "r3, r4, r5" + }, + { + "label": "mulld r3,r4,r5", + "raw": "0x7C6429D2", + "addr": "0x82000000", + "mnemonic": "mulld", + "operands": "r3, r4, r5" + }, + { + "label": "and r3,r4,r5", + "raw": "0x7C832838", + "addr": "0x82000000", + "mnemonic": "and", + "operands": "r3, r4, r5" + }, + { + "label": "or r3,r4,r5 (non-mr: rs!=rb)", + "raw": "0x7C832B78", + "addr": "0x82000000", + "mnemonic": "or", + "operands": "r3, r4, r5" + }, + { + "label": "xor r3,r4,r5", + "raw": "0x7C832A78", + "addr": "0x82000000", + "mnemonic": "xor", + "operands": "r3, r4, r5" + }, + { + "label": "nor r3,r4,r5", + "raw": "0x7C8328F8", + "addr": "0x82000000", + "mnemonic": "nor", + "operands": "r3, r4, r5" + }, + { + "label": "nand r3,r4,r5", + "raw": "0x7C832BB8", + "addr": "0x82000000", + "mnemonic": "nand", + "operands": "r3, r4, r5" + }, + { + "label": "eqv r3,r4,r5", + "raw": "0x7C832A38", + "addr": "0x82000000", + "mnemonic": "eqv", + "operands": "r3, r4, r5" + }, + { + "label": "andc r3,r4,r5", + "raw": "0x7C832878", + "addr": "0x82000000", + "mnemonic": "andc", + "operands": "r3, r4, r5" + }, + { + "label": "orc r3,r4,r5", + "raw": "0x7C832B38", + "addr": "0x82000000", + "mnemonic": "orc", + "operands": "r3, r4, r5" + }, + { + "label": "slw r3,r4,r5", + "raw": "0x7C832830", + "addr": "0x82000000", + "mnemonic": "slw", + "operands": "r3, r4, r5" + }, + { + "label": "srw r3,r4,r5", + "raw": "0x7C832C30", + "addr": "0x82000000", + "mnemonic": "srw", + "operands": "r3, r4, r5" + }, + { + "label": "sraw r3,r4,r5", + "raw": "0x7C832E30", + "addr": "0x82000000", + "mnemonic": "sraw", + "operands": "r3, r4, r5" + }, + { + "label": "sld r3,r4,r5", + "raw": "0x7C832836", + "addr": "0x82000000", + "mnemonic": "sld", + "operands": "r3, r4, r5" + }, + { + "label": "srd r3,r4,r5", + "raw": "0x7C832C36", + "addr": "0x82000000", + "mnemonic": "srd", + "operands": "r3, r4, r5" + }, + { + "label": "srawi r3,r4,16", + "raw": "0x7C838670", + "addr": "0x82000000", + "mnemonic": "srawi", + "operands": "r3, r4, 16" + }, + { + "label": "stwcx. r3,r4,r5", + "raw": "0x7C64292D", + "addr": "0x82000000", + "mnemonic": "stwcx.", + "operands": "r3, r4, r5" + }, + { + "label": "stdcx. r3,r4,r5", + "raw": "0x7C6429AD", + "addr": "0x82000000", + "mnemonic": "stdcx.", + "operands": "r3, r4, r5" + }, + { + "label": "lwarx r3,r4,r5", + "raw": "0x7C642828", + "addr": "0x82000000", + "mnemonic": "lwarx", + "operands": "r3, r4, r5" + }, + { + "label": "ldarx r3,r4,r5", + "raw": "0x7C6428A8", + "addr": "0x82000000", + "mnemonic": "ldarx", + "operands": "r3, r4, r5" + }, + { + "label": "cmpwi cr0, r3, 16", + "raw": "0x2C030010", + "addr": "0x82000000", + "mnemonic": "cmpi", + "operands": "0, r3, 16", + "ext_mnemonic": "cmpwi", + "ext_operands": "r3, 16" + }, + { + "label": "cmpwi cr2, r3, 16", + "raw": "0x2D030010", + "addr": "0x82000000", + "mnemonic": "cmpi", + "operands": "cr2, 0, r3, 16", + "ext_mnemonic": "cmpwi", + "ext_operands": "cr2, r3, 16" + }, + { + "label": "cmplwi cr0, r3, 16", + "raw": "0x28030010", + "addr": "0x82000000", + "mnemonic": "cmpli", + "operands": "0, r3, 0x10", + "ext_mnemonic": "cmplwi", + "ext_operands": "r3, 0x10" + }, + { + "label": "cmpw r3,r4 in cr0", + "raw": "0x7C032000", + "addr": "0x82000000", + "mnemonic": "cmp", + "operands": "0, r3, r4", + "ext_mnemonic": "cmpw", + "ext_operands": "r3, r4" + }, + { + "label": "cmpd r3,r4", + "raw": "0x7C232000", + "addr": "0x82000000", + "mnemonic": "cmp", + "operands": "1, r3, r4", + "ext_mnemonic": "cmpd", + "ext_operands": "r3, r4" + }, + { + "label": "cmplw r3,r4", + "raw": "0x7C032040", + "addr": "0x82000000", + "mnemonic": "cmpl", + "operands": "0, r3, r4", + "ext_mnemonic": "cmplw", + "ext_operands": "r3, r4" + }, + { + "label": "addi r3, r1, 16", + "raw": "0x38610010", + "addr": "0x82000000", + "mnemonic": "addi", + "operands": "r3, r1, 16" + }, + { + "label": "addis r3, r1, 0x100 (ra!=0)", + "raw": "0x3C610100", + "addr": "0x82000000", + "mnemonic": "addis", + "operands": "r3, r1, 0x100" + }, + { + "label": "mulli r3, r4, 5", + "raw": "0x1C640005", + "addr": "0x82000000", + "mnemonic": "mulli", + "operands": "r3, r4, 5" + }, + { + "label": "subfic r3, r4, 5", + "raw": "0x20640005", + "addr": "0x82000000", + "mnemonic": "subfic", + "operands": "r3, r4, 5" + }, + { + "label": "addic r3, r4, 16", + "raw": "0x30640010", + "addr": "0x82000000", + "mnemonic": "addic", + "operands": "r3, r4, 16" + }, + { + "label": "addic. r3, r4, 16", + "raw": "0x34640010", + "addr": "0x82000000", + "mnemonic": "addic.", + "operands": "r3, r4, 16" + }, + { + "label": "ori r4, r3, 0x10 (non-nop)", + "raw": "0x60640010", + "addr": "0x82000000", + "mnemonic": "ori", + "operands": "r4, r3, 0x10" + }, + { + "label": "oris r4, r3, 0x10", + "raw": "0x64640010", + "addr": "0x82000000", + "mnemonic": "oris", + "operands": "r4, r3, 0x10" + }, + { + "label": "xori r4, r3, 0x10", + "raw": "0x68640010", + "addr": "0x82000000", + "mnemonic": "xori", + "operands": "r4, r3, 0x10" + }, + { + "label": "andi. r4, r3, 0x10", + "raw": "0x70640010", + "addr": "0x82000000", + "mnemonic": "andi.", + "operands": "r4, r3, 0x10" + }, + { + "label": "lwz r5, 0x20(r1)", + "raw": "0x80A10020", + "addr": "0x82000000", + "mnemonic": "lwz", + "operands": "r5, 32(r1)" + }, + { + "label": "stw r5, 0x20(r1)", + "raw": "0x90A10020", + "addr": "0x82000000", + "mnemonic": "stw", + "operands": "r5, 32(r1)" + }, + { + "label": "lbz r5, 0x20(r1)", + "raw": "0x88A10020", + "addr": "0x82000000", + "mnemonic": "lbz", + "operands": "r5, 32(r1)" + }, + { + "label": "lhz r5, 0x20(r1)", + "raw": "0xA0A10020", + "addr": "0x82000000", + "mnemonic": "lhz", + "operands": "r5, 32(r1)" + }, + { + "label": "lfs f5, 0x20(r1)", + "raw": "0xC0A10020", + "addr": "0x82000000", + "mnemonic": "lfs", + "operands": "f5, 32(r1)" + }, + { + "label": "lfd f5, 0x20(r1)", + "raw": "0xC8A10020", + "addr": "0x82000000", + "mnemonic": "lfd", + "operands": "f5, 32(r1)" + }, + { + "label": "stfd f5, 0x20(r1)", + "raw": "0xD8A10020", + "addr": "0x82000000", + "mnemonic": "stfd", + "operands": "f5, 32(r1)" + }, + { + "label": "ld r5, 0x20(r1)", + "raw": "0xE8A10020", + "addr": "0x82000000", + "mnemonic": "ld", + "operands": "r5, 32(r1)" + }, + { + "label": "std r5, 0x20(r1)", + "raw": "0xF8A10020", + "addr": "0x82000000", + "mnemonic": "std", + "operands": "r5, 32(r1)" + }, + { + "label": "sync 0 (extends to sync)", + "raw": "0x7C0004AC", + "addr": "0x82000000", + "mnemonic": "sync", + "operands": "" + }, + { + "label": "isync", + "raw": "0x4C00012C", + "addr": "0x82000000", + "mnemonic": "isync", + "operands": "" + }, + { + "label": "eieio", + "raw": "0x7C0006AC", + "addr": "0x82000000", + "mnemonic": "eieio", + "operands": "" + }, + { + "label": "dcbst r1, r2", + "raw": "0x7C01106C", + "addr": "0x82000000", + "mnemonic": "dcbst", + "operands": "r1, r2" + }, + { + "label": "dcbf r1, r2", + "raw": "0x7C0110AC", + "addr": "0x82000000", + "mnemonic": "dcbf", + "operands": "r1, r2" + }, + { + "label": "dcbt r1, r2", + "raw": "0x7C01122C", + "addr": "0x82000000", + "mnemonic": "dcbt", + "operands": "r1, r2" + }, + { + "label": "dcbz r1, r2", + "raw": "0x7C0117EC", + "addr": "0x82000000", + "mnemonic": "dcbz", + "operands": "r1, r2" + }, + { + "label": "dcbz128 r1, r2", + "raw": "0x7C2117EC", + "addr": "0x82000000", + "mnemonic": "dcbz128", + "operands": "r1, r2" + }, + { + "label": "crnor 4,5,6 (no simplify)", + "raw": "0x4C853042", + "addr": "0x82000000", + "mnemonic": "crnor", + "operands": "4*cr1+lt, 4*cr1+gt, 4*cr1+eq" + }, + { + "label": "crand 4,5,6", + "raw": "0x4C853202", + "addr": "0x82000000", + "mnemonic": "crand", + "operands": "4*cr1+lt, 4*cr1+gt, 4*cr1+eq" + }, + { + "label": "cror 4,5,6 (no simplify)", + "raw": "0x4C853382", + "addr": "0x82000000", + "mnemonic": "cror", + "operands": "4*cr1+lt, 4*cr1+gt, 4*cr1+eq" + }, + { + "label": "tw 11, r3, r4 (uncommon TO)", + "raw": "0x7D632008", + "addr": "0x82000000", + "mnemonic": "tw", + "operands": "11, r3, r4" + }, + { + "label": "tdi 11, r3, 123", + "raw": "0x0963007B", + "addr": "0x82000000", + "mnemonic": "tdi", + "operands": "11, r3, 123" + }, + { + "label": "mtcrf 0xFF, r5 → mtcr", + "raw": "0x7CAFF120", + "addr": "0x82000000", + "mnemonic": "mtcrf", + "operands": "0xFF, r5", + "ext_mnemonic": "mtcr", + "ext_operands": "r5" + }, + { + "label": "mfcr r5", + "raw": "0x7CA00026", + "addr": "0x82000000", + "mnemonic": "mfcr", + "operands": "r5" + }, + { + "label": "mfmsr r5", + "raw": "0x7CA000A6", + "addr": "0x82000000", + "mnemonic": "mfmsr", + "operands": "r5" + }, + { + "label": "mtmsr r5", + "raw": "0x7CA00124", + "addr": "0x82000000", + "mnemonic": "mtmsr", + "operands": "r5" + }, + { + "label": "mtmsrd r5", + "raw": "0x7CA00164", + "addr": "0x82000000", + "mnemonic": "mtmsrd", + "operands": "r5" + }, + { + "label": "fadd f3, f4, f5", + "raw": "0xFC64282A", + "addr": "0x82000000", + "mnemonic": "fadd", + "operands": "f3, f4, f5" + }, + { + "label": "fsub f3, f4, f5", + "raw": "0xFC642828", + "addr": "0x82000000", + "mnemonic": "fsub", + "operands": "f3, f4, f5" + }, + { + "label": "fdiv f3, f4, f5", + "raw": "0xFC642824", + "addr": "0x82000000", + "mnemonic": "fdiv", + "operands": "f3, f4, f5" + }, + { + "label": "fmul f3, f0, f5 (encoded)", + "raw": "0xFCE02832", + "addr": "0x82000000", + "mnemonic": "fmul", + "operands": "f7, f0, f0" + }, + { + "label": "fneg f3, f4", + "raw": "0xFC640050", + "addr": "0x82000000", + "mnemonic": "fneg", + "operands": "f3, f0" + }, + { + "label": "fmr f3, f4", + "raw": "0xFC640090", + "addr": "0x82000000", + "mnemonic": "fmr", + "operands": "f3, f0" + }, + { + "label": "mtfsf 0xFF, f5 (Rc=0)", + "raw": "0xFDFE2D8E", + "addr": "0x82000000", + "mnemonic": "mtfsf", + "operands": "0xFF, f5" + }, + { + "label": "mtfsf. 0xFF, f5 (Rc=1)", + "raw": "0xFDFE2D8F", + "addr": "0x82000000", + "mnemonic": "mtfsf.", + "operands": "0xFF, f5" + } + ] +} diff --git a/crates/xenia-cpu/tests/golden/extended_mnemonics.json b/crates/xenia-cpu/tests/golden/extended_mnemonics.json new file mode 100644 index 0000000..d869109 --- /dev/null +++ b/crates/xenia-cpu/tests/golden/extended_mnemonics.json @@ -0,0 +1,621 @@ +{ + "rows": [ + { + "label": "nop", + "raw": "0x60000000", + "addr": "0x82000000", + "mnemonic": "ori", + "operands": "r0, r0, 0x0", + "ext_mnemonic": "nop", + "ext_operands": "" + }, + { + "label": "li r3, 16", + "raw": "0x38600010", + "addr": "0x82000000", + "mnemonic": "addi", + "operands": "r3, r0, 16", + "ext_mnemonic": "li", + "ext_operands": "r3, 16" + }, + { + "label": "li r3, -1", + "raw": "0x3860FFFF", + "addr": "0x82000000", + "mnemonic": "addi", + "operands": "r3, r0, -1", + "ext_mnemonic": "li", + "ext_operands": "r3, -1" + }, + { + "label": "subi r3, r4, 16", + "raw": "0x3864FFF0", + "addr": "0x82000000", + "mnemonic": "addi", + "operands": "r3, r4, -16", + "ext_mnemonic": "subi", + "ext_operands": "r3, r4, 16" + }, + { + "label": "lis r3, 0x1234", + "raw": "0x3C601234", + "addr": "0x82000000", + "mnemonic": "addis", + "operands": "r3, r0, 0x1234", + "ext_mnemonic": "lis", + "ext_operands": "r3, 0x1234" + }, + { + "label": "subis r3, r4, 0xFFFF", + "raw": "0x3C64FFFF", + "addr": "0x82000000", + "mnemonic": "addis", + "operands": "r3, r4, 0xFFFF", + "ext_mnemonic": "subis", + "ext_operands": "r3, r4, 0x1" + }, + { + "label": "mr r3, r4", + "raw": "0x7C832378", + "addr": "0x82000000", + "mnemonic": "or", + "operands": "r3, r4, r4", + "ext_mnemonic": "mr", + "ext_operands": "r3, r4" + }, + { + "label": "mr. r3, r4", + "raw": "0x7C832379", + "addr": "0x82000000", + "mnemonic": "or.", + "operands": "r3, r4, r4", + "ext_mnemonic": "mr.", + "ext_operands": "r3, r4" + }, + { + "label": "mr (via and)", + "raw": "0x7C832038", + "addr": "0x82000000", + "mnemonic": "and", + "operands": "r3, r4, r4", + "ext_mnemonic": "mr", + "ext_operands": "r3, r4" + }, + { + "label": "not r3, r4", + "raw": "0x7C8320F8", + "addr": "0x82000000", + "mnemonic": "nor", + "operands": "r3, r4, r4", + "ext_mnemonic": "not", + "ext_operands": "r3, r4" + }, + { + "label": "subf → sub r3, r5, r4", + "raw": "0x7C642850", + "addr": "0x82000000", + "mnemonic": "subf", + "operands": "r3, r4, r5", + "ext_mnemonic": "sub", + "ext_operands": "r3, r5, r4" + }, + { + "label": "slwi r3, r4, 4", + "raw": "0x54832036", + "addr": "0x82000000", + "mnemonic": "rlwinm", + "operands": "r3, r4, 4, 0, 27", + "ext_mnemonic": "slwi", + "ext_operands": "r3, r4, 4" + }, + { + "label": "srwi r3, r4, 4", + "raw": "0x5483E13E", + "addr": "0x82000000", + "mnemonic": "rlwinm", + "operands": "r3, r4, 28, 4, 31", + "ext_mnemonic": "srwi", + "ext_operands": "r3, r4, 4" + }, + { + "label": "rotlwi r3, r4, 8", + "raw": "0x5483403E", + "addr": "0x82000000", + "mnemonic": "rlwinm", + "operands": "r3, r4, 8, 0, 31", + "ext_mnemonic": "rotlwi", + "ext_operands": "r3, r4, 8" + }, + { + "label": "clrlwi r3, r4, 4", + "raw": "0x5483013E", + "addr": "0x82000000", + "mnemonic": "rlwinm", + "operands": "r3, r4, 0, 4, 31", + "ext_mnemonic": "clrlwi", + "ext_operands": "r3, r4, 4" + }, + { + "label": "clrrwi r3, r4, 4", + "raw": "0x54830036", + "addr": "0x82000000", + "mnemonic": "rlwinm", + "operands": "r3, r4, 0, 0, 27", + "ext_mnemonic": "clrrwi", + "ext_operands": "r3, r4, 4" + }, + { + "label": "extlwi r3, r4, 8, 8", + "raw": "0x5483400E", + "addr": "0x82000000", + "mnemonic": "rlwinm", + "operands": "r3, r4, 8, 0, 7", + "ext_mnemonic": "extlwi", + "ext_operands": "r3, r4, 8, 8" + }, + { + "label": "slwi. r3, r4, 4", + "raw": "0x54832037", + "addr": "0x82000000", + "mnemonic": "rlwinm.", + "operands": "r3, r4, 4, 0, 27", + "ext_mnemonic": "slwi.", + "ext_operands": "r3, r4, 4" + }, + { + "label": "rlwinm. r11,r11,0,31,31 (no simplify)", + "raw": "0x556B07FF", + "addr": "0x82000000", + "mnemonic": "rlwinm.", + "operands": "r11, r11, 0, 31, 31", + "ext_mnemonic": "clrlwi.", + "ext_operands": "r11, r11, 31" + }, + { + "label": "clrldi r3, r4, 32", + "raw": "0x78830020", + "addr": "0x82000000", + "mnemonic": "rldicl", + "operands": "r3, r4, 0, 32", + "ext_mnemonic": "clrldi", + "ext_operands": "r3, r4, 32" + }, + { + "label": "srdi r3, r4, 8", + "raw": "0x7883E200", + "addr": "0x82000000", + "mnemonic": "rldicl", + "operands": "r3, r4, 56, 8", + "ext_mnemonic": "srdi", + "ext_operands": "r3, r4, 8" + }, + { + "label": "rotldi r3, r4, 8", + "raw": "0x78832000", + "addr": "0x82000000", + "mnemonic": "rldicl", + "operands": "r3, r4, 8, 0", + "ext_mnemonic": "rotldi", + "ext_operands": "r3, r4, 8" + }, + { + "label": "cmpwi cr0, r3, 16", + "raw": "0x2C030010", + "addr": "0x82000000", + "mnemonic": "cmpi", + "operands": "0, r3, 16", + "ext_mnemonic": "cmpwi", + "ext_operands": "r3, 16" + }, + { + "label": "cmpdi (L=1) variant", + "raw": "0x2C230010", + "addr": "0x82000000", + "mnemonic": "cmpi", + "operands": "1, r3, 16", + "ext_mnemonic": "cmpdi", + "ext_operands": "r3, 16" + }, + { + "label": "blr", + "raw": "0x4E800020", + "addr": "0x82000000", + "mnemonic": "bclr", + "operands": "20, lt", + "ext_mnemonic": "blr", + "ext_operands": "" + }, + { + "label": "blrl", + "raw": "0x4E800021", + "addr": "0x82000000", + "mnemonic": "bclrl", + "operands": "20, lt", + "ext_mnemonic": "blrl", + "ext_operands": "" + }, + { + "label": "bctr", + "raw": "0x4E800420", + "addr": "0x82000000", + "mnemonic": "bcctr", + "operands": "20, lt", + "ext_mnemonic": "bctr", + "ext_operands": "" + }, + { + "label": "bctrl", + "raw": "0x4E800421", + "addr": "0x82000000", + "mnemonic": "bcctrl", + "operands": "20, lt", + "ext_mnemonic": "bctrl", + "ext_operands": "" + }, + { + "label": "beqlr (BO=12, BI=2 → cr0.eq true)", + "raw": "0x4D820020", + "addr": "0x82000000", + "mnemonic": "bclr", + "operands": "12, eq", + "ext_mnemonic": "beqlr", + "ext_operands": "" + }, + { + "label": "bnelr", + "raw": "0x4C820020", + "addr": "0x82000000", + "mnemonic": "bclr", + "operands": "4, eq", + "ext_mnemonic": "bnelr", + "ext_operands": "" + }, + { + "label": "bc → b 0x82000040", + "raw": "0x42800040", + "addr": "0x82000000", + "mnemonic": "bc", + "operands": "20, lt, 0x82000040", + "ext_mnemonic": "b", + "ext_operands": "0x82000040", + "branch_target": "0x82000040" + }, + { + "label": "bc l → bl 0x82000040", + "raw": "0x42800041", + "addr": "0x82000000", + "mnemonic": "bcl", + "operands": "20, lt, 0x82000040", + "ext_mnemonic": "bl", + "ext_operands": "0x82000040", + "branch_target": "0x82000040" + }, + { + "label": "bc 12,cr0.eq → beq 0x82000040", + "raw": "0x41820040", + "addr": "0x82000000", + "mnemonic": "bc", + "operands": "12, eq, 0x82000040", + "ext_mnemonic": "beq", + "ext_operands": "0x82000040", + "branch_target": "0x82000040" + }, + { + "label": "bc 4,cr0.eq → bne 0x82000040", + "raw": "0x40820040", + "addr": "0x82000000", + "mnemonic": "bc", + "operands": "4, eq, 0x82000040", + "ext_mnemonic": "bne", + "ext_operands": "0x82000040", + "branch_target": "0x82000040" + }, + { + "label": "bc 12,cr0.lt → blt 0x82000040", + "raw": "0x41800040", + "addr": "0x82000000", + "mnemonic": "bc", + "operands": "12, lt, 0x82000040", + "ext_mnemonic": "blt", + "ext_operands": "0x82000040", + "branch_target": "0x82000040" + }, + { + "label": "bc 4,cr0.lt → bge 0x82000040", + "raw": "0x40800040", + "addr": "0x82000000", + "mnemonic": "bc", + "operands": "4, lt, 0x82000040", + "ext_mnemonic": "bge", + "ext_operands": "0x82000040", + "branch_target": "0x82000040" + }, + { + "label": "bc 12,cr0.gt → bgt 0x82000040", + "raw": "0x41810040", + "addr": "0x82000000", + "mnemonic": "bc", + "operands": "12, gt, 0x82000040", + "ext_mnemonic": "bgt", + "ext_operands": "0x82000040", + "branch_target": "0x82000040" + }, + { + "label": "bc 4,cr0.gt → ble 0x82000040", + "raw": "0x40810040", + "addr": "0x82000000", + "mnemonic": "bc", + "operands": "4, gt, 0x82000040", + "ext_mnemonic": "ble", + "ext_operands": "0x82000040", + "branch_target": "0x82000040" + }, + { + "label": "bc 12, cr2.eq → beq cr2, 0x...040", + "raw": "0x418A0040", + "addr": "0x82000000", + "mnemonic": "bc", + "operands": "12, 4*cr2+eq, 0x82000040", + "ext_mnemonic": "beq", + "ext_operands": "cr2, 0x82000040", + "branch_target": "0x82000040" + }, + { + "label": "bdnz 0x82000040", + "raw": "0x42000040", + "addr": "0x82000000", + "mnemonic": "bc", + "operands": "16, lt, 0x82000040", + "ext_mnemonic": "bdnzge", + "ext_operands": "0x82000040", + "branch_target": "0x82000040" + }, + { + "label": "bdz 0x82000040", + "raw": "0x42400040", + "addr": "0x82000000", + "mnemonic": "bc", + "operands": "18, lt, 0x82000040", + "ext_mnemonic": "bdzge", + "ext_operands": "0x82000040", + "branch_target": "0x82000040" + }, + { + "label": "b +0x40 → 0x82000040", + "raw": "0x48000040", + "addr": "0x82000000", + "mnemonic": "b", + "operands": "0x82000040", + "branch_target": "0x82000040" + }, + { + "label": "bl +0x40 → 0x82000040", + "raw": "0x48000041", + "addr": "0x82000000", + "mnemonic": "bl", + "operands": "0x82000040", + "branch_target": "0x82000040" + }, + { + "label": "ba 0x40 absolute", + "raw": "0x48000042", + "addr": "0x82000000", + "mnemonic": "ba", + "operands": "0x00000040", + "branch_target": "0x00000040" + }, + { + "label": "bla 0x40 absolute", + "raw": "0x48000043", + "addr": "0x82000000", + "mnemonic": "bla", + "operands": "0x00000040", + "branch_target": "0x00000040" + }, + { + "label": "tdeqi r3, 123", + "raw": "0x0883007B", + "addr": "0x82000000", + "mnemonic": "tdi", + "operands": "4, r3, 123", + "ext_mnemonic": "tdeqi", + "ext_operands": "r3, 123" + }, + { + "label": "twlti r3, 123", + "raw": "0x0E03007B", + "addr": "0x82000000", + "mnemonic": "twi", + "operands": "16, r3, 123", + "ext_mnemonic": "twlti", + "ext_operands": "r3, 123" + }, + { + "label": "mflr r3", + "raw": "0x7C6802A6", + "addr": "0x82000000", + "mnemonic": "mfspr", + "operands": "r3, LR", + "ext_mnemonic": "mflr", + "ext_operands": "r3" + }, + { + "label": "mfctr r3", + "raw": "0x7C6902A6", + "addr": "0x82000000", + "mnemonic": "mfspr", + "operands": "r3, CTR", + "ext_mnemonic": "mfctr", + "ext_operands": "r3" + }, + { + "label": "mfxer r3", + "raw": "0x7C6102A6", + "addr": "0x82000000", + "mnemonic": "mfspr", + "operands": "r3, XER", + "ext_mnemonic": "mfxer", + "ext_operands": "r3" + }, + { + "label": "mtlr r3", + "raw": "0x7C6803A6", + "addr": "0x82000000", + "mnemonic": "mtspr", + "operands": "LR, r3", + "ext_mnemonic": "mtlr", + "ext_operands": "r3" + }, + { + "label": "mtctr r3", + "raw": "0x7C6903A6", + "addr": "0x82000000", + "mnemonic": "mtspr", + "operands": "CTR, r3", + "ext_mnemonic": "mtctr", + "ext_operands": "r3" + }, + { + "label": "mtxer r3", + "raw": "0x7C6103A6", + "addr": "0x82000000", + "mnemonic": "mtspr", + "operands": "XER, r3", + "ext_mnemonic": "mtxer", + "ext_operands": "r3" + }, + { + "label": "crnot 4, 5", + "raw": "0x4C852842", + "addr": "0x82000000", + "mnemonic": "crnor", + "operands": "4*cr1+lt, 4*cr1+gt, 4*cr1+gt", + "ext_mnemonic": "crnot", + "ext_operands": "4*cr1+lt, 4*cr1+gt" + }, + { + "label": "crclr 4", + "raw": "0x4C842182", + "addr": "0x82000000", + "mnemonic": "crxor", + "operands": "4*cr1+lt, 4*cr1+lt, 4*cr1+lt", + "ext_mnemonic": "crclr", + "ext_operands": "4*cr1+lt" + }, + { + "label": "crset 4", + "raw": "0x4C842242", + "addr": "0x82000000", + "mnemonic": "creqv", + "operands": "4*cr1+lt, 4*cr1+lt, 4*cr1+lt", + "ext_mnemonic": "crset", + "ext_operands": "4*cr1+lt" + }, + { + "label": "crmove 4, 5", + "raw": "0x4C852B82", + "addr": "0x82000000", + "mnemonic": "cror", + "operands": "4*cr1+lt, 4*cr1+gt, 4*cr1+gt", + "ext_mnemonic": "crmove", + "ext_operands": "4*cr1+lt, 4*cr1+gt" + }, + { + "label": "lwsync", + "raw": "0x7C2004AC", + "addr": "0x82000000", + "mnemonic": "sync", + "operands": "" + }, + { + "label": "trap", + "raw": "0x7FE00008", + "addr": "0x82000000", + "mnemonic": "tw", + "operands": "31, r0, r0", + "ext_mnemonic": "trap", + "ext_operands": "" + }, + { + "label": "blr (BO=20, BI=4 — BI is don't-care)", + "raw": "0x4E840020", + "addr": "0x82000000", + "mnemonic": "bclr", + "operands": "20, 4*cr1+lt", + "ext_mnemonic": "blr", + "ext_operands": "" + }, + { + "label": "blrl (BO=20, BI=7)", + "raw": "0x4E870021", + "addr": "0x82000000", + "mnemonic": "bclrl", + "operands": "20, 4*cr1+so", + "ext_mnemonic": "blrl", + "ext_operands": "" + }, + { + "label": "bctr (BO=20, BI=4)", + "raw": "0x4E840420", + "addr": "0x82000000", + "mnemonic": "bcctr", + "operands": "20, 4*cr1+lt", + "ext_mnemonic": "bctr", + "ext_operands": "" + }, + { + "label": "twllt r3, r4 (TO=2)", + "raw": "0x7C432008", + "addr": "0x82000000", + "mnemonic": "tw", + "operands": "2, r3, r4", + "ext_mnemonic": "twllt", + "ext_operands": "r3, r4" + }, + { + "label": "twlgt r3, r4 (TO=1)", + "raw": "0x7C232008", + "addr": "0x82000000", + "mnemonic": "tw", + "operands": "1, r3, r4", + "ext_mnemonic": "twlgt", + "ext_operands": "r3, r4" + }, + { + "label": "tdlge r3, r4 (TO=5)", + "raw": "0x7CA32088", + "addr": "0x82000000", + "mnemonic": "td", + "operands": "5, r3, r4", + "ext_mnemonic": "tdlge", + "ext_operands": "r3, r4" + }, + { + "label": "twlle r3, r4 (TO=6)", + "raw": "0x7CC32008", + "addr": "0x82000000", + "mnemonic": "tw", + "operands": "6, r3, r4", + "ext_mnemonic": "twlle", + "ext_operands": "r3, r4" + }, + { + "label": "twllti r3, 16", + "raw": "0x0C430010", + "addr": "0x82000000", + "mnemonic": "twi", + "operands": "2, r3, 16", + "ext_mnemonic": "twllti", + "ext_operands": "r3, 16" + }, + { + "label": "tdlgei r3, 16", + "raw": "0x08A30010", + "addr": "0x82000000", + "mnemonic": "tdi", + "operands": "5, r3, 16", + "ext_mnemonic": "tdlgei", + "ext_operands": "r3, 16" + } + ] +} diff --git a/crates/xenia-cpu/tests/golden/vmx128_registers.json b/crates/xenia-cpu/tests/golden/vmx128_registers.json new file mode 100644 index 0000000..1d072cd --- /dev/null +++ b/crates/xenia-cpu/tests/golden/vmx128_registers.json @@ -0,0 +1,137 @@ +{ + "rows": [ + { + "label": "vaddubm v3, v4, v5", + "raw": "0x10642800", + "addr": "0x82000000", + "mnemonic": "vaddubm", + "operands": "v3, v4, v5" + }, + { + "label": "vaddfp v3, v4, v5", + "raw": "0x1064280A", + "addr": "0x82000000", + "mnemonic": "vaddfp", + "operands": "v3, v4, v5" + }, + { + "label": "vand v3, v4, v5", + "raw": "0x10642C04", + "addr": "0x82000000", + "mnemonic": "vand", + "operands": "v3, v4, v5" + }, + { + "label": "vor v3, v4, v5", + "raw": "0x10642C84", + "addr": "0x82000000", + "mnemonic": "vor", + "operands": "v3, v4, v5" + }, + { + "label": "vxor v3, v4, v5", + "raw": "0x10642CC4", + "addr": "0x82000000", + "mnemonic": "vxor", + "operands": "v3, v4, v5" + }, + { + "label": "vsel v3,v4,v5,v6", + "raw": "0x106429AA", + "addr": "0x82000000", + "mnemonic": "vsel", + "operands": "v3, v4, v5, v6" + }, + { + "label": "vperm v3,v4,v5,v6", + "raw": "0x106429AB", + "addr": "0x82000000", + "mnemonic": "vperm", + "operands": "v3, v4, v5, v6" + }, + { + "label": "vmaddfp v3, v4, v6, v5 (swap)", + "raw": "0x106429AE", + "addr": "0x82000000", + "mnemonic": "vmaddfp", + "operands": "v3, v4, v6, v5" + }, + { + "label": "mfvscr v3", + "raw": "0x10600604", + "addr": "0x82000000", + "mnemonic": "mfvscr", + "operands": "v3" + }, + { + "label": "mtvscr v5", + "raw": "0x10002E44", + "addr": "0x82000000", + "mnemonic": "mtvscr", + "operands": "v5" + }, + { + "label": "vaddfp128 (encoded sloppily)", + "raw": "0x14642801", + "addr": "0x82000000", + "mnemonic": "vperm128", + "operands": "v3, v3, v5, 0" + }, + { + "label": "encoding vd_hi=00: actually vsrw128", + "raw": "0x180461D0", + "addr": "0x82000000", + "mnemonic": "vsrw128", + "operands": "v0, v0, v12" + }, + { + "label": "encoding vd_hi=10: actually vsrw128 v32", + "raw": "0x180465D0", + "addr": "0x82000000", + "mnemonic": "vsrw128", + "operands": "v32, v0, v12" + }, + { + "label": "encoding vd_hi=01: actually vpermwi128", + "raw": "0x180463D0", + "addr": "0x82000000", + "mnemonic": "vpermwi128", + "operands": "v64, v12, 0xE4" + }, + { + "label": "vrlimi128 v96, v12, 4, 3 (real)", + "raw": "0x180467D0", + "addr": "0x82000000", + "mnemonic": "vrlimi128", + "operands": "v96, v12, 4, 3" + }, + { + "label": "vrlimi128 v127, v127, 4, 3 (real)", + "raw": "0x1BE4FFD3", + "addr": "0x82000000", + "mnemonic": "vrlimi128", + "operands": "v127, v95, 4, 3" + }, + { + "label": "vmaddfp128 v3, v35, v5, v3", + "raw": "0x146028D4", + "addr": "0x82000000", + "mnemonic": "vmaddfp128", + "operands": "v3, v35, v5, v3" + }, + { + "label": "vmaddcfp128 v3, v35, v3, v5", + "raw": "0x14602914", + "addr": "0x82000000", + "mnemonic": "vmaddcfp128", + "operands": "v3, v35, v3, v5" + }, + { + "label": "vnmsubfp128 v3, v35, v3, v5", + "raw": "0x14602954", + "addr": "0x82000000", + "mnemonic": "vnmsubfp128", + "operands": "v3, v35, v3, v5" + } + ] +}