xenia-cpu: VMX128, FPSCR, decoder split, scheduler, decode/block caches
Split the monolithic interpreter into cohesive modules: dedicated decoder (decoder.rs) producing 8-byte DecodedInstr; opcode tables (opcode.rs); explicit traps (trap.rs); FPSCR helpers (fpscr.rs); overflow/carry helpers (overflow.rs); a 4 KiB-page-versioned decode cache and basic-block cache (block_cache.rs); and a full VMX/VMX128 implementation (vmx.rs) covering AltiVec + Xenon's 128-bit extensions. Add the parallel-execution substrate behind --parallel: a 7-party phaser (phaser.rs) for round-based barrier sync, ReservationTable (reservation.rs) for guest LL/SC, and the per-HW-thread scheduler core (scheduler.rs) that owns ThreadRefs, runqueues, and pending IRQs. Disassembler is now the single source of truth: disasm.rs gains the full base + extended + VMX128 mnemonic set, with golden JSON fixtures and a disasm_goldens test suite. Add a criterion-style interpreter bench. context.rs grows the per-thread state the new modules need (reservation slot, FPSCR, vector regs). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
194
crates/xenia-cpu/benches/interpreter.rs
Normal file
194
crates/xenia-cpu/benches/interpreter.rs
Normal file
@@ -0,0 +1,194 @@
|
||||
//! Interpreter throughput micro-benchmarks.
|
||||
//!
|
||||
//! Custom `harness = false` main — no extra dev-deps. Run via
|
||||
//! `cargo bench -p xenia-cpu` (or `cargo run --release --bench interpreter`).
|
||||
//!
|
||||
//! Three workloads, each measuring `step_cached` throughput in MIPS:
|
||||
//!
|
||||
//! - `tight_alu_loop` — pure dispatch + ALU + decode-cache hit.
|
||||
//! - `loadstore_loop` — alternating `lwz`/`stw` against main RAM. Stresses
|
||||
//! every load/store path and `find_mmio` dispatch.
|
||||
//! - `mmio_storm` — same shape as `loadstore_loop` but the address is
|
||||
//! in a registered MMIO aperture. Sanity-checks that
|
||||
//! MMIO writes still dispatch correctly.
|
||||
//!
|
||||
//! These are not statistically rigorous — no warmup, no variance — they're
|
||||
//! just enough to detect 2x-class wins or regressions on the perf-track
|
||||
//! changes (MMIO fast-reject, threaded dispatch, block cache). Numbers go
|
||||
//! into commit messages; there is no automated baseline file.
|
||||
|
||||
use std::sync::atomic::{AtomicU32, AtomicU64, Ordering};
|
||||
use std::sync::Arc;
|
||||
use std::time::Instant;
|
||||
|
||||
use xenia_cpu::context::PpcContext;
|
||||
use xenia_cpu::decoder::DecodeCache;
|
||||
use xenia_cpu::interpreter::{step_cached, StepResult};
|
||||
use xenia_memory::{GuestMemory, MemoryAccess, MmioRegion};
|
||||
use xenia_memory::page_table::MemoryProtect;
|
||||
|
||||
// PPC instruction encoders — minimal subset needed by the benches.
|
||||
|
||||
/// Encode `addi rd, ra, simm` as a raw 32-bit PPC instruction word.
///
/// Layout: primary opcode 14 in the top six bits, `rd` shifted to bit 21,
/// `ra` shifted to bit 16, and the two's-complement bit pattern of `simm`
/// in the low 16 bits. Register numbers are not masked, so callers must
/// pass values in `0..=31`.
#[inline]
fn enc_addi(rd: u32, ra: u32, simm: i16) -> u32 {
    const PRIMARY_OPCODE: u32 = 14;
    // Reinterpret the signed immediate as its 16-bit pattern so it is not
    // sign-extended into the upper fields.
    let immediate = u32::from(simm as u16);
    (PRIMARY_OPCODE << 26) | (rd << 21) | (ra << 16) | immediate
}
|
||||
|
||||
/// Encode `lwz rd, d(ra)` as a raw 32-bit PPC instruction word.
///
/// Layout: primary opcode 32 in the top six bits, `rd` shifted to bit 21,
/// `ra` shifted to bit 16, and the two's-complement bit pattern of the
/// displacement `d` in the low 16 bits. Register numbers are not masked.
#[inline]
fn enc_lwz(rd: u32, ra: u32, d: i16) -> u32 {
    const PRIMARY_OPCODE: u32 = 32;
    // Keep only the 16-bit pattern of the displacement (no sign extension).
    let displacement = u32::from(d as u16);
    (PRIMARY_OPCODE << 26) | (rd << 21) | (ra << 16) | displacement
}
|
||||
|
||||
/// Encode `stw rs, d(ra)` as a raw 32-bit PPC instruction word.
///
/// Layout: primary opcode 36 in the top six bits, `rs` shifted to bit 21,
/// `ra` shifted to bit 16, and the two's-complement bit pattern of the
/// displacement `d` in the low 16 bits. Register numbers are not masked.
#[inline]
fn enc_stw(rs: u32, ra: u32, d: i16) -> u32 {
    const PRIMARY_OPCODE: u32 = 36;
    // Keep only the 16-bit pattern of the displacement (no sign extension).
    let displacement = u32::from(d as u16);
    (PRIMARY_OPCODE << 26) | (rs << 21) | (ra << 16) | displacement
}
|
||||
|
||||
/// Set up a `GuestMemory` with one writable region for code+data.
|
||||
fn make_mem(code_base: u32, code_size: u32) -> GuestMemory {
|
||||
let mut mem = GuestMemory::new().expect("reserve 4GB");
|
||||
mem.alloc(code_base, code_size, MemoryProtect::READ | MemoryProtect::WRITE)
|
||||
.expect("alloc bench region");
|
||||
mem
|
||||
}
|
||||
|
||||
/// Write a sequence of raw PPC instructions starting at `base`.
|
||||
fn write_program(mem: &GuestMemory, base: u32, instrs: &[u32]) {
|
||||
for (i, &raw) in instrs.iter().enumerate() {
|
||||
mem.write_u32(base + (i as u32 * 4), raw);
|
||||
}
|
||||
}
|
||||
|
||||
/// Run `total_instrs` interpreter steps over a program of length `n`,
|
||||
/// wrapping PC back to `base` whenever it falls off the end. Returns the
|
||||
/// elapsed wall time.
|
||||
fn run_loop(
|
||||
ctx: &mut PpcContext,
|
||||
mem: &GuestMemory,
|
||||
cache: &mut DecodeCache,
|
||||
base: u32,
|
||||
n: u32,
|
||||
total_instrs: u64,
|
||||
) -> std::time::Duration {
|
||||
let end = base + n * 4;
|
||||
ctx.pc = base;
|
||||
let t0 = Instant::now();
|
||||
for _ in 0..total_instrs {
|
||||
let pv = mem.page_version(ctx.pc);
|
||||
let r = step_cached(ctx, mem, cache, pv);
|
||||
debug_assert!(matches!(r, StepResult::Continue));
|
||||
if ctx.pc >= end {
|
||||
ctx.pc = base;
|
||||
}
|
||||
}
|
||||
t0.elapsed()
|
||||
}
|
||||
|
||||
/// Print one result line to stdout: the workload label, the instruction
/// count, the elapsed time in seconds, and the throughput in MIPS
/// (instructions per second divided by one million).
fn report(label: &str, total_instrs: u64, elapsed: std::time::Duration) {
    let seconds = elapsed.as_secs_f64();
    let instrs_per_second = (total_instrs as f64) / seconds;
    let mips = instrs_per_second / 1.0e6;
    let line = format!(
        "{:<24} {:>12} instrs in {:>7.3}s = {:>7.2} MIPS",
        label, total_instrs, seconds, mips
    );
    println!("{}", line);
}
|
||||
|
||||
fn bench_tight_alu_loop() {
|
||||
const BASE: u32 = 0x1000;
|
||||
const N: u32 = 256;
|
||||
const TOTAL: u64 = 50_000_000;
|
||||
|
||||
let mut mem = make_mem(BASE, 0x1000);
|
||||
// 256 × `addi r3, r3, 1` — pure register-register, no memory touch
|
||||
// beyond instruction fetch.
|
||||
let prog: Vec<u32> = (0..N).map(|_| enc_addi(3, 3, 1)).collect();
|
||||
write_program(&mut mem, BASE, &prog);
|
||||
|
||||
let mut ctx = PpcContext::new();
|
||||
let mut cache = DecodeCache::new();
|
||||
|
||||
let elapsed = run_loop(&mut ctx, &mut mem, &mut cache, BASE, N, TOTAL);
|
||||
report("tight_alu_loop", TOTAL, elapsed);
|
||||
}
|
||||
|
||||
fn bench_loadstore_loop() {
|
||||
const CODE_BASE: u32 = 0x1000;
|
||||
const DATA_BASE: u32 = 0x2000;
|
||||
const N: u32 = 256;
|
||||
const TOTAL: u64 = 30_000_000;
|
||||
|
||||
let mut mem = make_mem(CODE_BASE, 0x2000);
|
||||
// 128 pairs of `stw r3, 0(r4); lwz r5, 0(r4)` — exercises every
|
||||
// load/store path through `read_u32`/`write_u32` (incl. `find_mmio`).
|
||||
let mut prog = Vec::with_capacity(N as usize);
|
||||
for _ in 0..(N / 2) {
|
||||
prog.push(enc_stw(3, 4, 0));
|
||||
prog.push(enc_lwz(5, 4, 0));
|
||||
}
|
||||
write_program(&mut mem, CODE_BASE, &prog);
|
||||
|
||||
let mut ctx = PpcContext::new();
|
||||
ctx.gpr[3] = 0xDEAD_BEEF;
|
||||
ctx.gpr[4] = DATA_BASE as u64;
|
||||
let mut cache = DecodeCache::new();
|
||||
|
||||
let elapsed = run_loop(&mut ctx, &mut mem, &mut cache, CODE_BASE, N, TOTAL);
|
||||
report("loadstore_loop", TOTAL, elapsed);
|
||||
}
|
||||
|
||||
fn bench_mmio_storm() {
|
||||
const CODE_BASE: u32 = 0x1000;
|
||||
const MMIO_BASE: u32 = 0xEA00_0000;
|
||||
const N: u32 = 64;
|
||||
// MMIO is slower per access — keep total smaller so the bench stays
|
||||
// under a few seconds.
|
||||
const TOTAL: u64 = 2_000_000;
|
||||
|
||||
let mut mem = make_mem(CODE_BASE, 0x1000);
|
||||
|
||||
let writes = Arc::new(AtomicU64::new(0));
|
||||
let reads = Arc::new(AtomicU32::new(0));
|
||||
let writes_clone = writes.clone();
|
||||
let reads_clone = reads.clone();
|
||||
mem.add_mmio_region(MmioRegion {
|
||||
base_address: MMIO_BASE,
|
||||
mask: 0xFFFF_0000,
|
||||
size: 0x0001_0000,
|
||||
read_callback: Box::new(move |_a| {
|
||||
reads_clone.fetch_add(1, Ordering::Relaxed);
|
||||
0
|
||||
}),
|
||||
write_callback: Box::new(move |_a, _v| {
|
||||
writes_clone.fetch_add(1, Ordering::Relaxed);
|
||||
}),
|
||||
});
|
||||
|
||||
let mut prog = Vec::with_capacity(N as usize);
|
||||
for _ in 0..(N / 2) {
|
||||
prog.push(enc_stw(3, 4, 0));
|
||||
prog.push(enc_lwz(5, 4, 0));
|
||||
}
|
||||
write_program(&mut mem, CODE_BASE, &prog);
|
||||
|
||||
let mut ctx = PpcContext::new();
|
||||
ctx.gpr[3] = 0x1234_5678;
|
||||
ctx.gpr[4] = MMIO_BASE as u64;
|
||||
let mut cache = DecodeCache::new();
|
||||
|
||||
let elapsed = run_loop(&mut ctx, &mut mem, &mut cache, CODE_BASE, N, TOTAL);
|
||||
report("mmio_storm", TOTAL, elapsed);
|
||||
|
||||
// Sanity assertions — silently catch a refactor that breaks MMIO dispatch.
|
||||
let w = writes.load(Ordering::Relaxed);
|
||||
let r = reads.load(Ordering::Relaxed);
|
||||
assert_eq!(w, TOTAL / 2, "expected MMIO writes to be dispatched");
|
||||
assert_eq!(r as u64, TOTAL / 2, "expected MMIO reads to be dispatched");
|
||||
}
|
||||
|
||||
fn main() {
|
||||
println!("xenia-cpu interpreter bench");
|
||||
println!(" build: {}", if cfg!(debug_assertions) { "debug" } else { "release" });
|
||||
bench_tight_alu_loop();
|
||||
bench_loadstore_loop();
|
||||
bench_mmio_storm();
|
||||
}
|
||||
Reference in New Issue
Block a user