xenia-cpu: VMX128, FPSCR, decoder split, scheduler, decode/block caches

Split the monolithic interpreter into cohesive modules: dedicated decoder (decoder.rs) producing 8-byte DecodedInstr; opcode tables (opcode.rs); explicit traps (trap.rs); FPSCR helpers (fpscr.rs); overflow/carry helpers (overflow.rs); a 4 KiB-page-versioned decode cache and basic-block cache (block_cache.rs); and a full VMX/VMX128 implementation (vmx.rs) covering AltiVec + Xenon's 128-bit extensions. Add the parallel-execution substrate behind --parallel: a 7-party phaser (phaser.rs) for round-based barrier sync, ReservationTable (reservation.rs) for guest LL/SC, and the per-HW-thread scheduler core (scheduler.rs) that owns ThreadRefs, runqueues, and pending IRQs. Disassembler is now the single source of truth: disasm.rs gains the full base + extended + VMX128 mnemonic set, with golden JSON fixtures and a disasm_goldens test suite. Add a custom-harness interpreter throughput bench (`harness = false`, no criterion dependency). context.rs grows the per-thread state the new modules need (reservation slot, FPSCR, vector regs). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-01 16:27:43 +02:00
parent e9b2b57a44
commit c36cca14f9
20 changed files with 12284 additions and 458 deletions
--- a/crates/xenia-cpu/benches/interpreter.rs
+++ b/crates/xenia-cpu/benches/interpreter.rs
@@ -0,0 +1,194 @@
+//! Interpreter throughput micro-benchmarks.
+//!
+//! Custom `harness = false` main — no extra dev-deps. Run via
+//! `cargo bench -p xenia-cpu` (add `--bench interpreter` for this target only).
+//!
+//! Three workloads, each measuring `step_cached` throughput in MIPS:
+//!
+//!   - `tight_alu_loop`  — pure dispatch + ALU + decode-cache hit.
+//!   - `loadstore_loop`  — alternating `lwz`/`stw` against main RAM. Stresses
+//!                         the word load/store path and `find_mmio` dispatch.
+//!   - `mmio_storm`      — same shape as `loadstore_loop` but the address is
+//!                         in a registered MMIO aperture. Sanity-checks that
+//!                         MMIO reads and writes still reach their callbacks.
+//!
+//! These are not statistically rigorous — no warmup, no variance — they're
+//! just enough to detect 2x-class wins or regressions on the perf-track
+//! changes (MMIO fast-reject, threaded dispatch, block cache). Numbers go
+//! into commit messages; there is no automated baseline file.
+
+use std::sync::atomic::{AtomicU32, AtomicU64, Ordering};
+use std::sync::Arc;
+use std::time::Instant;
+
+use xenia_cpu::context::PpcContext;
+use xenia_cpu::decoder::DecodeCache;
+use xenia_cpu::interpreter::{step_cached, StepResult};
+use xenia_memory::{GuestMemory, MemoryAccess, MmioRegion};
+use xenia_memory::page_table::MemoryProtect;
+
+// PPC instruction encoders — minimal subset needed by the benches.
+
+#[inline]
+fn enc_addi(rd: u32, ra: u32, simm: i16) -> u32 {
+    u32::from(simm as u16) | (ra << 16) | (rd << 21) | (14 << 26)
+}
+
+#[inline]
+fn enc_lwz(rd: u32, ra: u32, d: i16) -> u32 {
+    u32::from(d as u16) | (ra << 16) | (rd << 21) | (32 << 26)
+}
+
+#[inline]
+fn enc_stw(rs: u32, ra: u32, d: i16) -> u32 {
+    u32::from(d as u16) | (ra << 16) | (rs << 21) | (36 << 26)
+}
+
+/// Create a `GuestMemory` with one read+write region at `code_base`; panics on failure.
+fn make_mem(code_base: u32, code_size: u32) -> GuestMemory {
+    let mut guest = GuestMemory::new().expect("reserve 4GB");
+    let read_write = MemoryProtect::READ | MemoryProtect::WRITE;
+    guest.alloc(code_base, code_size, read_write).expect("alloc bench region");
+    guest
+}
+
+/// Store `instrs` as consecutive 32-bit words, the first one at `base`.
+fn write_program(mem: &GuestMemory, base: u32, instrs: &[u32]) {
+    for (slot, word) in instrs.iter().copied().enumerate() {
+        mem.write_u32(base + (slot as u32 * 4), word);
+    }
+}
+
+/// Execute `total_instrs` calls to `step_cached` over the `n`-instruction
+/// program at `base`, resetting PC to `base` once it reaches or passes the
+/// end of the program. Returns the wall time spent in the stepping loop.
+fn run_loop(
+    ctx: &mut PpcContext,
+    mem: &GuestMemory,
+    cache: &mut DecodeCache,
+    base: u32,
+    n: u32,
+    total_instrs: u64,
+) -> std::time::Duration {
+    let limit = base + n * 4;
+    ctx.pc = base;
+    let started = Instant::now();
+    for _step in 0..total_instrs {
+        let version = mem.page_version(ctx.pc);
+        let outcome = step_cached(ctx, mem, cache, version);
+        debug_assert!(matches!(outcome, StepResult::Continue));
+        if limit <= ctx.pc {
+            ctx.pc = base;
+        }
+    }
+    started.elapsed()
+}
+
+/// Print one aligned result row: label, instruction count, seconds, and MIPS.
+fn report(label: &str, total_instrs: u64, elapsed: std::time::Duration) {
+    let seconds = elapsed.as_secs_f64();
+    println!(
+        "{:<24} {:>12} instrs in {:>7.3}s = {:>7.2} MIPS",
+        label, total_instrs, seconds, (total_instrs as f64) / seconds / 1.0e6
+    );
+}
+
+/// Benchmark: `N` back-to-back `addi r3, r3, 1` instructions run for `TOTAL` steps.
+fn bench_tight_alu_loop() {
+    const BASE: u32 = 0x1000;
+    const N: u32 = 256;
+    const TOTAL: u64 = 50_000_000;
+
+    let mem = make_mem(BASE, 0x1000);
+    // 256 × `addi r3, r3, 1` — register-immediate ALU op, no memory touch
+    // beyond instruction fetch.
+    let prog = vec![enc_addi(3, 3, 1); N as usize];
+    write_program(&mem, BASE, &prog);
+
+    let mut ctx = PpcContext::new();
+    let mut cache = DecodeCache::new();
+    let elapsed = run_loop(&mut ctx, &mem, &mut cache, BASE, N, TOTAL);
+    report("tight_alu_loop", TOTAL, elapsed);
+}
+
+/// Benchmark: alternating `stw`/`lwz` against a single word at `DATA_BASE`,
+/// which lies inside the same allocated region as the code.
+fn bench_loadstore_loop() {
+    const CODE_BASE: u32 = 0x1000;
+    const DATA_BASE: u32 = 0x2000;
+    const N: u32 = 256;
+    const TOTAL: u64 = 30_000_000;
+
+    let mem = make_mem(CODE_BASE, 0x2000);
+    // 128 pairs of `stw r3, 0(r4); lwz r5, 0(r4)` — exercises the word
+    // load/store path through `read_u32`/`write_u32` (incl. `find_mmio`).
+    let prog = [enc_stw(3, 4, 0), enc_lwz(5, 4, 0)].repeat((N / 2) as usize);
+    write_program(&mem, CODE_BASE, &prog);
+
+    let mut ctx = PpcContext::new();
+    // r3 is the value stored; r4 is the base address used by both instructions.
+    ctx.gpr[3] = 0xDEAD_BEEF;
+    ctx.gpr[4] = DATA_BASE as u64;
+    let mut cache = DecodeCache::new();
+
+    let elapsed = run_loop(&mut ctx, &mem, &mut cache, CODE_BASE, N, TOTAL);
+    report("loadstore_loop", TOTAL, elapsed);
+}
+
+/// Benchmark: the `stw`/`lwz` pair loop aimed at a registered MMIO aperture,
+/// with counters verifying that every access reached the callbacks.
+fn bench_mmio_storm() {
+    const CODE_BASE: u32 = 0x1000;
+    const MMIO_BASE: u32 = 0xEA00_0000;
+    const N: u32 = 64;
+    // MMIO is slower per access — keep total smaller so the bench stays
+    // under a few seconds.
+    const TOTAL: u64 = 2_000_000;
+
+    let mut mem = make_mem(CODE_BASE, 0x1000);
+
+    // Counters shared with the callbacks, which are `move` closures.
+    let writes = Arc::new(AtomicU64::new(0));
+    let reads = Arc::new(AtomicU32::new(0));
+    let (write_hits, read_hits) = (Arc::clone(&writes), Arc::clone(&reads));
+    mem.add_mmio_region(MmioRegion {
+        base_address: MMIO_BASE,
+        mask: 0xFFFF_0000,
+        size: 0x0001_0000,
+        read_callback: Box::new(move |_a| {
+            read_hits.fetch_add(1, Ordering::Relaxed);
+            0
+        }),
+        write_callback: Box::new(move |_a, _v| {
+            write_hits.fetch_add(1, Ordering::Relaxed);
+        }),
+    });
+
+    let prog = [enc_stw(3, 4, 0), enc_lwz(5, 4, 0)].repeat((N / 2) as usize);
+    write_program(&mem, CODE_BASE, &prog);
+
+    let mut ctx = PpcContext::new();
+    ctx.gpr[3] = 0x1234_5678;
+    ctx.gpr[4] = MMIO_BASE as u64;
+    let mut cache = DecodeCache::new();
+
+    let elapsed = run_loop(&mut ctx, &mem, &mut cache, CODE_BASE, N, TOTAL);
+    report("mmio_storm", TOTAL, elapsed);
+
+    // Sanity assertions — fail loudly if a refactor silently breaks MMIO
+    // dispatch. The program alternates one store and one load, so each
+    // callback must have run for exactly half of the executed steps.
+    let w = writes.load(Ordering::Relaxed);
+    let r = reads.load(Ordering::Relaxed);
+    assert_eq!(w, TOTAL / 2, "expected MMIO writes to be dispatched");
+    assert_eq!(u64::from(r), TOTAL / 2, "expected MMIO reads to be dispatched");
+}
+
+/// Entry point: print the build profile, then run every bench in a fixed order.
+fn main() {
+    println!("xenia-cpu interpreter bench");
+    println!("  build: {}", if cfg!(debug_assertions) { "debug" } else { "release" });
+    let benches = [bench_tight_alu_loop, bench_loadstore_loop, bench_mmio_storm];
+    benches.iter().for_each(|bench| bench());
+}