xenia-cpu: VMX128, FPSCR, decoder split, scheduler, decode/block caches

Split the monolithic interpreter into cohesive modules: dedicated
decoder (decoder.rs) producing 8-byte DecodedInstr; opcode tables
(opcode.rs); explicit traps (trap.rs); FPSCR helpers (fpscr.rs);
overflow/carry helpers (overflow.rs); a 4 KiB-page-versioned decode
cache and basic-block cache (block_cache.rs); and a full VMX/VMX128
implementation (vmx.rs) covering AltiVec + Xenon's 128-bit extensions.

Add the parallel-execution substrate behind --parallel: a 7-party
phaser (phaser.rs) for round-based barrier sync, ReservationTable
(reservation.rs) for guest LL/SC, and the per-HW-thread scheduler
core (scheduler.rs) that owns ThreadRefs, runqueues, and pending IRQs.

The disassembler is now the single source of truth: disasm.rs gains the
full base + extended + VMX128 mnemonic set, with golden JSON fixtures
and a disasm_goldens test suite. Add a criterion-style interpreter
bench (custom `harness = false` main, no extra dev-deps). context.rs
grows the per-thread state the new modules need (reservation slot,
FPSCR, vector regs).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
MechaCat02
2026-05-01 16:27:43 +02:00
parent e9b2b57a44
commit c36cca14f9
20 changed files with 12284 additions and 458 deletions

View File

@@ -0,0 +1,194 @@
//! Interpreter throughput micro-benchmarks.
//!
//! Custom `harness = false` main — no extra dev-deps. Run via
//! `cargo bench -p xenia-cpu` (or `cargo run --release --bench interpreter`).
//!
//! Three workloads, each measuring `step_cached` throughput in MIPS:
//!
//! - `tight_alu_loop` — pure dispatch + ALU + decode-cache hit.
//! - `loadstore_loop` — alternating `stw`/`lwz` against main RAM. Stresses
//! the 32-bit load/store path and `find_mmio` dispatch.
//! - `mmio_storm` — same shape as `loadstore_loop` but the address is
//! in a registered MMIO aperture. Sanity-checks that
//! MMIO reads and writes still dispatch correctly.
//!
//! These are not statistically rigorous — no warmup, no variance — they're
//! just enough to detect 2x-class wins or regressions on the perf-track
//! changes (MMIO fast-reject, threaded dispatch, block cache). Numbers go
//! into commit messages; there is no automated baseline file.
use std::sync::atomic::{AtomicU32, AtomicU64, Ordering};
use std::sync::Arc;
use std::time::Instant;
use xenia_cpu::context::PpcContext;
use xenia_cpu::decoder::DecodeCache;
use xenia_cpu::interpreter::{step_cached, StepResult};
use xenia_memory::{GuestMemory, MemoryAccess, MmioRegion};
use xenia_memory::page_table::MemoryProtect;
// PPC instruction encoders — minimal subset needed by the benches.
/// Encode `addi rd, ra, simm` as a raw 32-bit PPC instruction word.
///
/// Layout: primary opcode 14 in the top six bits, `rd` shifted to bit 21,
/// `ra` shifted to bit 16, and the immediate's 16-bit two's-complement
/// pattern in the low half. Register numbers are not range-checked; a
/// value of 32 or more spills into the neighbouring field.
#[inline]
fn enc_addi(rd: u32, ra: u32, simm: i16) -> u32 {
    const OPCD: u32 = 14;
    // Reinterpret the signed immediate as its 16-bit pattern, then
    // zero-extend so it cannot disturb the upper fields.
    let imm_field = u32::from(simm as u16);
    let dest_field = rd << 21;
    let src_field = ra << 16;
    (OPCD << 26) | dest_field | src_field | imm_field
}
/// Encode `lwz rd, d(ra)` as a raw 32-bit PPC instruction word.
///
/// Layout: primary opcode 32 in the top six bits, `rd` shifted to bit 21,
/// `ra` shifted to bit 16, and the displacement's 16-bit two's-complement
/// pattern in the low half. Register numbers are not range-checked.
#[inline]
fn enc_lwz(rd: u32, ra: u32, d: i16) -> u32 {
    const OPCD: u32 = 32;
    // Signed displacement -> raw 16-bit pattern -> zero-extended word.
    let disp_field = u32::from(d as u16);
    let dest_field = rd << 21;
    let base_field = ra << 16;
    (OPCD << 26) | dest_field | base_field | disp_field
}
/// Encode `stw rs, d(ra)` as a raw 32-bit PPC instruction word.
///
/// Layout: primary opcode 36 in the top six bits, `rs` shifted to bit 21,
/// `ra` shifted to bit 16, and the displacement's 16-bit two's-complement
/// pattern in the low half. Register numbers are not range-checked.
#[inline]
fn enc_stw(rs: u32, ra: u32, d: i16) -> u32 {
    const OPCD: u32 = 36;
    // Signed displacement -> raw 16-bit pattern -> zero-extended word.
    let disp_field = u32::from(d as u16);
    let src_field = rs << 21;
    let base_field = ra << 16;
    (OPCD << 26) | src_field | base_field | disp_field
}
/// Create a fresh `GuestMemory` and allocate a single read+write region of
/// `code_size` bytes at `code_base`; the benches place both their program
/// and their data inside that region.
///
/// Panics (via `expect`) if either the memory reservation or the
/// allocation reports an error.
fn make_mem(code_base: u32, code_size: u32) -> GuestMemory {
    let mut guest = GuestMemory::new().expect("reserve 4GB");
    let protection = MemoryProtect::READ | MemoryProtect::WRITE;
    guest
        .alloc(code_base, code_size, protection)
        .expect("alloc bench region");
    guest
}
/// Store `instrs` into guest memory as consecutive 32-bit words, the
/// first at `base` and each following one 4 bytes further on.
fn write_program(mem: &GuestMemory, base: u32, instrs: &[u32]) {
    instrs.iter().enumerate().for_each(|(idx, &word)| {
        // Word `idx` lives `idx * 4` bytes past the start of the program.
        let addr = base + (idx as u32 * 4);
        mem.write_u32(addr, word);
    });
}
/// Execute exactly `total_instrs` calls to `step_cached` over the
/// `n`-instruction program located at `base`, and return how long the
/// whole run took.
///
/// `ctx.pc` is set to `base` before timing starts. After every step, a PC
/// at or beyond the end of the program (`base + n * 4`) is reset to
/// `base`, so the program repeats until the step budget is used up.
///
/// The step result is only checked by a `debug_assert!`, so builds without
/// debug assertions do not notice a result other than
/// `StepResult::Continue`.
fn run_loop(
    ctx: &mut PpcContext,
    mem: &GuestMemory,
    cache: &mut DecodeCache,
    base: u32,
    n: u32,
    total_instrs: u64,
) -> std::time::Duration {
    let program_end = base + n * 4;
    ctx.pc = base;

    let started = Instant::now();
    let mut remaining = total_instrs;
    while remaining != 0 {
        // The page version is looked up for the current PC on every step
        // and handed to `step_cached` together with the decode cache.
        let version = mem.page_version(ctx.pc);
        let outcome = step_cached(ctx, mem, cache, version);
        debug_assert!(matches!(outcome, StepResult::Continue));

        // Ran off the end of the program: start over from the top.
        if ctx.pc >= program_end {
            ctx.pc = base;
        }
        remaining -= 1;
    }
    started.elapsed()
}
/// Print one result line for a workload: its label, the number of
/// instructions executed, the elapsed time in seconds, and the resulting
/// throughput in millions of instructions per second.
fn report(label: &str, total_instrs: u64, elapsed: std::time::Duration) {
    let seconds = elapsed.as_secs_f64();
    let instrs_per_sec = (total_instrs as f64) / seconds;
    let mips = instrs_per_sec / 1.0e6;
    println!(
        "{:<24} {:>12} instrs in {:>7.3}s = {:>7.2} MIPS",
        label, total_instrs, seconds, mips
    );
}
/// Workload: a straight-line run of `addi` instructions, repeated until
/// `TOTAL` steps have executed, then reported as `tight_alu_loop`.
fn bench_tight_alu_loop() {
    const BASE: u32 = 0x1000;
    const N: u32 = 256;
    const TOTAL: u64 = 50_000_000;

    // Setup goes through `make_mem`; afterwards only shared access is
    // needed, so the binding is immutable and borrowed as `&mem`.
    let mem = make_mem(BASE, 0x1000);

    // 256 × `addi r3, r3, 1` — register-immediate arithmetic with no
    // load/store instructions, so the only guest memory the program
    // touches is its own instruction words.
    let prog = vec![enc_addi(3, 3, 1); N as usize];
    write_program(&mem, BASE, &prog);

    let mut ctx = PpcContext::new();
    let mut cache = DecodeCache::new();
    let elapsed = run_loop(&mut ctx, &mem, &mut cache, BASE, N, TOTAL);
    report("tight_alu_loop", TOTAL, elapsed);
}
/// Workload: repeated `stw`/`lwz` pairs against an address in ordinary
/// guest RAM, run for `TOTAL` steps and reported as `loadstore_loop`.
fn bench_loadstore_loop() {
    const CODE_BASE: u32 = 0x1000;
    const DATA_BASE: u32 = 0x2000;
    const N: u32 = 256;
    const TOTAL: u64 = 30_000_000;

    // One 0x2000-byte region starting at CODE_BASE holds both the program
    // and the data word at DATA_BASE. Only shared access is needed after
    // setup, so the binding is immutable and borrowed as `&mem`.
    let mem = make_mem(CODE_BASE, 0x2000);

    // 128 pairs of `stw r3, 0(r4); lwz r5, 0(r4)` — exercises the word
    // load/store path through `read_u32`/`write_u32` (incl. `find_mmio`).
    let prog = [enc_stw(3, 4, 0), enc_lwz(5, 4, 0)].repeat((N / 2) as usize);
    write_program(&mem, CODE_BASE, &prog);

    let mut ctx = PpcContext::new();
    ctx.gpr[3] = 0xDEAD_BEEF; // value stored by every `stw`
    ctx.gpr[4] = u64::from(DATA_BASE); // base register for both accesses
    let mut cache = DecodeCache::new();
    let elapsed = run_loop(&mut ctx, &mem, &mut cache, CODE_BASE, N, TOTAL);
    report("loadstore_loop", TOTAL, elapsed);
}
/// Workload: the same `stw`/`lwz` pair loop as `loadstore_loop`, but with
/// the base register pointing at a registered MMIO region whose callbacks
/// count how often they are invoked.
///
/// After reporting the timing as `mmio_storm`, the function asserts that
/// each callback ran exactly `TOTAL / 2` times.
///
/// # Panics
///
/// Panics if either callback count differs from `TOTAL / 2`.
fn bench_mmio_storm() {
    const CODE_BASE: u32 = 0x1000;
    const MMIO_BASE: u32 = 0xEA00_0000;
    const N: u32 = 64;
    // MMIO is slower per access — keep total smaller so the bench stays
    // under a few seconds.
    const TOTAL: u64 = 2_000_000;

    // `mem` stays `mut` only for the `add_mmio_region` call below; the
    // helpers afterwards need nothing more than a shared borrow.
    let mut mem = make_mem(CODE_BASE, 0x1000);

    // Counters shared between this function and the boxed callbacks.
    let writes = Arc::new(AtomicU64::new(0));
    let reads = Arc::new(AtomicU32::new(0));
    let write_counter = Arc::clone(&writes);
    let read_counter = Arc::clone(&reads);
    mem.add_mmio_region(MmioRegion {
        base_address: MMIO_BASE,
        mask: 0xFFFF_0000,
        size: 0x0001_0000,
        read_callback: Box::new(move |_a| {
            read_counter.fetch_add(1, Ordering::Relaxed);
            0
        }),
        write_callback: Box::new(move |_a, _v| {
            write_counter.fetch_add(1, Ordering::Relaxed);
        }),
    });

    // N / 2 pairs of `stw r3, 0(r4); lwz r5, 0(r4)`.
    let prog = [enc_stw(3, 4, 0), enc_lwz(5, 4, 0)].repeat((N / 2) as usize);
    write_program(&mem, CODE_BASE, &prog);

    let mut ctx = PpcContext::new();
    ctx.gpr[3] = 0x1234_5678; // value stored by every `stw`
    ctx.gpr[4] = u64::from(MMIO_BASE); // base register -> MMIO region
    let mut cache = DecodeCache::new();
    let elapsed = run_loop(&mut ctx, &mem, &mut cache, CODE_BASE, N, TOTAL);
    report("mmio_storm", TOTAL, elapsed);

    // Sanity assertions — fail loudly if a refactor silently breaks MMIO
    // dispatch. The program strictly alternates store/load and both N and
    // TOTAL are even, so each callback should fire for half of the steps.
    let w = writes.load(Ordering::Relaxed);
    let r = reads.load(Ordering::Relaxed);
    assert_eq!(w, TOTAL / 2, "expected MMIO writes to be dispatched");
    assert_eq!(u64::from(r), TOTAL / 2, "expected MMIO reads to be dispatched");
}
fn main() {
println!("xenia-cpu interpreter bench");
println!(" build: {}", if cfg!(debug_assertions) { "debug" } else { "release" });
bench_tight_alu_loop();
bench_loadstore_loop();
bench_mmio_storm();
}