Files
xenia-rs/crates/xenia-cpu/benches/interpreter.rs
MechaCat02 c36cca14f9 xenia-cpu: VMX128, FPSCR, decoder split, scheduler, decode/block caches
Split the monolithic interpreter into cohesive modules: dedicated
decoder (decoder.rs) producing 8-byte DecodedInstr; opcode tables
(opcode.rs); explicit traps (trap.rs); FPSCR helpers (fpscr.rs);
overflow/carry helpers (overflow.rs); a 4 KiB-page-versioned decode
cache and basic-block cache (block_cache.rs); and a full VMX/VMX128
implementation (vmx.rs) covering AltiVec + Xenon's 128-bit extensions.

Add the parallel-execution substrate behind --parallel: a 7-party
phaser (phaser.rs) for round-based barrier sync, ReservationTable
(reservation.rs) for guest LL/SC, and the per-HW-thread scheduler
core (scheduler.rs) that owns ThreadRefs, runqueues, and pending IRQs.

Disassembler is now the single source of truth: disasm.rs gains the
full base + extended + VMX128 mnemonic set, with golden JSON fixtures
and a disasm_goldens test suite. Add a criterion-style interpreter
bench. context.rs grows the per-thread state the new modules need
(reservation slot, FPSCR, vector regs).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-01 16:27:43 +02:00

195 lines
6.5 KiB
Rust
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
//! Interpreter throughput micro-benchmarks.
//!
//! Custom `harness = false` main — no extra dev-deps. Run via
//! `cargo bench -p xenia-cpu` (or `cargo bench -p xenia-cpu --bench interpreter`
//! to run only this bench target).
//!
//! Three workloads, each measuring `step_cached` throughput in MIPS:
//!
//! - `tight_alu_loop` — pure dispatch + ALU + decode-cache hit.
//! - `loadstore_loop` — alternating `lwz`/`stw` against main RAM. Stresses
//! every load/store path and `find_mmio` dispatch.
//! - `mmio_storm` — same shape as `loadstore_loop` but the address is
//! in a registered MMIO aperture. Sanity-checks that
//! MMIO writes still dispatch correctly.
//!
//! These are not statistically rigorous — no warmup, no variance — they're
//! just enough to detect 2x-class wins or regressions on the perf-track
//! changes (MMIO fast-reject, threaded dispatch, block cache). Numbers go
//! into commit messages; there is no automated baseline file.
use std::sync::atomic::{AtomicU32, AtomicU64, Ordering};
use std::sync::Arc;
use std::time::Instant;
use xenia_cpu::context::PpcContext;
use xenia_cpu::decoder::DecodeCache;
use xenia_cpu::interpreter::{step_cached, StepResult};
use xenia_memory::{GuestMemory, MemoryAccess, MmioRegion};
use xenia_memory::page_table::MemoryProtect;
// PPC instruction encoders — minimal subset needed by the benches.
/// Encode `addi rd, ra, simm` as a raw 32-bit instruction word.
///
/// Primary opcode 14 sits in the top six bits, `rd` and `ra` are shifted
/// into place without masking, and the signed immediate is placed in the
/// low 16 bits as its two's-complement bit pattern.
#[inline]
fn enc_addi(rd: u32, ra: u32, simm: i16) -> u32 {
    const OPCODE: u32 = 14;
    // Reinterpret the signed immediate as its raw 16-bit pattern, then widen.
    let imm = u32::from(simm as u16);
    (OPCODE << 26) | (rd << 21) | (ra << 16) | imm
}
/// Encode `lwz rd, d(ra)` as a raw 32-bit instruction word.
///
/// Primary opcode 32 sits in the top six bits, `rd` and `ra` are shifted
/// into place without masking, and the signed displacement is placed in
/// the low 16 bits as its two's-complement bit pattern.
#[inline]
fn enc_lwz(rd: u32, ra: u32, d: i16) -> u32 {
    const OPCODE: u32 = 32;
    // Reinterpret the signed displacement as its raw 16-bit pattern.
    let disp = u32::from(d as u16);
    (OPCODE << 26) | (rd << 21) | (ra << 16) | disp
}
/// Encode `stw rs, d(ra)` as a raw 32-bit instruction word.
///
/// Primary opcode 36 sits in the top six bits, `rs` and `ra` are shifted
/// into place without masking, and the signed displacement is placed in
/// the low 16 bits as its two's-complement bit pattern.
#[inline]
fn enc_stw(rs: u32, ra: u32, d: i16) -> u32 {
    const OPCODE: u32 = 36;
    // Reinterpret the signed displacement as its raw 16-bit pattern.
    let disp = u32::from(d as u16);
    (OPCODE << 26) | (rs << 21) | (ra << 16) | disp
}
/// Create a `GuestMemory` and allocate one read/write region of
/// `code_size` bytes at `code_base`, used by the benches for code and data.
///
/// Panics if creating the memory or allocating the region fails.
fn make_mem(code_base: u32, code_size: u32) -> GuestMemory {
    let read_write = MemoryProtect::READ | MemoryProtect::WRITE;
    let mut guest = GuestMemory::new().expect("reserve 4GB");
    guest
        .alloc(code_base, code_size, read_write)
        .expect("alloc bench region");
    guest
}
/// Store the raw PPC instruction words in `instrs` into `mem`, one per
/// 4-byte slot, with the first word landing at `base`.
fn write_program(mem: &GuestMemory, base: u32, instrs: &[u32]) {
    instrs.iter().enumerate().for_each(|(slot, &word)| {
        // Each instruction occupies four bytes.
        let offset = slot as u32 * 4;
        mem.write_u32(base + offset, word);
    });
}
/// Execute exactly `total_instrs` calls to `step_cached` over a program of
/// `n` instructions that starts at `base`.
///
/// PC is set to `base` before timing starts. After every step, PC is reset
/// to `base` once it has reached or passed `base + n * 4`. Each step looks
/// up the page version for the current PC and hands it to `step_cached`.
///
/// Returns the wall-clock time spent in the stepping loop.
fn run_loop(
    ctx: &mut PpcContext,
    mem: &GuestMemory,
    cache: &mut DecodeCache,
    base: u32,
    n: u32,
    total_instrs: u64,
) -> std::time::Duration {
    // First address past the last instruction of the program.
    let wrap_at = base + n * 4;
    ctx.pc = base;

    let started = Instant::now();
    let mut remaining = total_instrs;
    while remaining > 0 {
        let version = mem.page_version(ctx.pc);
        let outcome = step_cached(ctx, mem, cache, version);
        // Only checked in debug builds, so the release hot loop carries no
        // extra branch.
        debug_assert!(matches!(outcome, StepResult::Continue));
        if ctx.pc >= wrap_at {
            ctx.pc = base;
        }
        remaining -= 1;
    }
    started.elapsed()
}
/// Print one result line for a bench: its label, the instruction count,
/// the elapsed seconds, and the derived throughput in MIPS (millions of
/// instructions per second).
fn report(label: &str, total_instrs: u64, elapsed: std::time::Duration) {
    let secs = elapsed.as_secs_f64();
    let instrs_per_sec = (total_instrs as f64) / secs;
    let mips = instrs_per_sec / 1.0e6;
    println!(
        "{:<24} {:>12} instrs in {:>7.3}s = {:>7.2} MIPS",
        label, total_instrs, secs, mips
    );
}
/// Bench: a 256-instruction program of identical `addi r3, r3, 1`, stepped
/// 50 million times and reported as `tight_alu_loop`.
fn bench_tight_alu_loop() {
    const BASE: u32 = 0x1000;
    const N: u32 = 256;
    const TOTAL: u64 = 50_000_000;

    // Every slot holds the same `addi r3, r3, 1`, so the program itself
    // issues no loads or stores.
    let program = vec![enc_addi(3, 3, 1); N as usize];

    let mem = make_mem(BASE, 0x1000);
    write_program(&mem, BASE, &program);

    let mut ctx = PpcContext::new();
    let mut cache = DecodeCache::new();
    let elapsed = run_loop(&mut ctx, &mem, &mut cache, BASE, N, TOTAL);
    report("tight_alu_loop", TOTAL, elapsed);
}
/// Bench: 128 `stw`/`lwz` pairs hitting one word of ordinary guest RAM,
/// stepped 30 million times and reported as `loadstore_loop`.
fn bench_loadstore_loop() {
    const CODE_BASE: u32 = 0x1000;
    const DATA_BASE: u32 = 0x2000;
    const N: u32 = 256;
    const TOTAL: u64 = 30_000_000;

    // One region covers both the code at CODE_BASE and the data word at
    // DATA_BASE.
    let mem = make_mem(CODE_BASE, 0x2000);

    // `stw r3, 0(r4); lwz r5, 0(r4)` repeated N / 2 times: a store
    // immediately followed by a load of the same address.
    let program = [enc_stw(3, 4, 0), enc_lwz(5, 4, 0)].repeat((N / 2) as usize);
    write_program(&mem, CODE_BASE, &program);

    let mut ctx = PpcContext::new();
    ctx.gpr[3] = 0xDEAD_BEEF; // value stored by every `stw`
    ctx.gpr[4] = DATA_BASE as u64; // address register for both accesses

    let mut cache = DecodeCache::new();
    let elapsed = run_loop(&mut ctx, &mem, &mut cache, CODE_BASE, N, TOTAL);
    report("loadstore_loop", TOTAL, elapsed);
}
/// Bench: 32 `stw`/`lwz` pairs aimed at a registered MMIO region, stepped
/// 2 million times and reported as `mmio_storm`.
///
/// The region's callbacks count reads and writes. After the run, both
/// counts must equal `TOTAL / 2`, otherwise the bench panics.
fn bench_mmio_storm() {
    const CODE_BASE: u32 = 0x1000;
    const MMIO_BASE: u32 = 0xEA00_0000;
    const N: u32 = 64;
    // Kept much smaller than the RAM benches so this one still finishes
    // within a few seconds despite the slower per-access MMIO path.
    const TOTAL: u64 = 2_000_000;

    let mut mem = make_mem(CODE_BASE, 0x1000);

    // Shared counters: one handle is moved into each callback, the other
    // stays here for the final check.
    let writes = Arc::new(AtomicU64::new(0));
    let reads = Arc::new(AtomicU32::new(0));
    let write_counter = Arc::clone(&writes);
    let read_counter = Arc::clone(&reads);

    mem.add_mmio_region(MmioRegion {
        base_address: MMIO_BASE,
        mask: 0xFFFF_0000,
        size: 0x0001_0000,
        read_callback: Box::new(move |_a| {
            read_counter.fetch_add(1, Ordering::Relaxed);
            0
        }),
        write_callback: Box::new(move |_a, _v| {
            write_counter.fetch_add(1, Ordering::Relaxed);
        }),
    });

    // `stw r3, 0(r4); lwz r5, 0(r4)` repeated N / 2 times, with r4
    // pointing at the MMIO base.
    let program = [enc_stw(3, 4, 0), enc_lwz(5, 4, 0)].repeat((N / 2) as usize);
    write_program(&mem, CODE_BASE, &program);

    let mut ctx = PpcContext::new();
    ctx.gpr[3] = 0x1234_5678;
    ctx.gpr[4] = MMIO_BASE as u64;

    let mut cache = DecodeCache::new();
    let elapsed = run_loop(&mut ctx, &mem, &mut cache, CODE_BASE, N, TOTAL);
    report("mmio_storm", TOTAL, elapsed);

    // Half of the executed instructions are stores and half are loads, so
    // each callback must have fired TOTAL / 2 times. This catches a
    // refactor that silently stops dispatching MMIO accesses.
    let observed_writes = writes.load(Ordering::Relaxed);
    let observed_reads = reads.load(Ordering::Relaxed);
    let expected = TOTAL / 2;
    assert_eq!(observed_writes, expected, "expected MMIO writes to be dispatched");
    assert_eq!(
        observed_reads as u64, expected,
        "expected MMIO reads to be dispatched"
    );
}
fn main() {
println!("xenia-cpu interpreter bench");
println!(" build: {}", if cfg!(debug_assertions) { "debug" } else { "release" });
bench_tight_alu_loop();
bench_loadstore_loop();
bench_mmio_storm();
}