//! Interpreter throughput micro-benchmarks. //! //! Custom `harness = false` main — no extra dev-deps. Run via //! `cargo bench -p xenia-cpu` (or `cargo run --release --bench interpreter`). //! //! Three workloads, each measuring `step_cached` throughput in MIPS: //! //! - `tight_alu_loop` — pure dispatch + ALU + decode-cache hit. //! - `loadstore_loop` — alternating `lwz`/`stw` against main RAM. Stresses //! every load/store path and `find_mmio` dispatch. //! - `mmio_storm` — same shape as `loadstore_loop` but the address is //! in a registered MMIO aperture. Sanity-checks that //! MMIO writes still dispatch correctly. //! //! These are not statistically rigorous — no warmup, no variance — they're //! just enough to detect 2x-class wins or regressions on the perf-track //! changes (MMIO fast-reject, threaded dispatch, block cache). Numbers go //! into commit messages; there is no automated baseline file. use std::sync::atomic::{AtomicU32, AtomicU64, Ordering}; use std::sync::Arc; use std::time::Instant; use xenia_cpu::context::PpcContext; use xenia_cpu::decoder::DecodeCache; use xenia_cpu::interpreter::{step_cached, StepResult}; use xenia_memory::{GuestMemory, MemoryAccess, MmioRegion}; use xenia_memory::page_table::MemoryProtect; // PPC instruction encoders — minimal subset needed by the benches. #[inline] fn enc_addi(rd: u32, ra: u32, simm: i16) -> u32 { (14 << 26) | (rd << 21) | (ra << 16) | (simm as u16 as u32) } #[inline] fn enc_lwz(rd: u32, ra: u32, d: i16) -> u32 { (32 << 26) | (rd << 21) | (ra << 16) | (d as u16 as u32) } #[inline] fn enc_stw(rs: u32, ra: u32, d: i16) -> u32 { (36 << 26) | (rs << 21) | (ra << 16) | (d as u16 as u32) } /// Set up a `GuestMemory` with one writable region for code+data. fn make_mem(code_base: u32, code_size: u32) -> GuestMemory { let mut mem = GuestMemory::new().expect("reserve 4GB"); mem.alloc(code_base, code_size, MemoryProtect::READ | MemoryProtect::WRITE) .expect("alloc bench region"); mem } /// Write a sequence of raw PPC instructions starting at `base`. fn write_program(mem: &GuestMemory, base: u32, instrs: &[u32]) { for (i, &raw) in instrs.iter().enumerate() { mem.write_u32(base + (i as u32 * 4), raw); } } /// Run `total_instrs` interpreter steps over a program of length `n`, /// wrapping PC back to `base` whenever it falls off the end. Returns the /// elapsed wall time. fn run_loop( ctx: &mut PpcContext, mem: &GuestMemory, cache: &mut DecodeCache, base: u32, n: u32, total_instrs: u64, ) -> std::time::Duration { let end = base + n * 4; ctx.pc = base; let t0 = Instant::now(); for _ in 0..total_instrs { let pv = mem.page_version(ctx.pc); let r = step_cached(ctx, mem, cache, pv); debug_assert!(matches!(r, StepResult::Continue)); if ctx.pc >= end { ctx.pc = base; } } t0.elapsed() } fn report(label: &str, total_instrs: u64, elapsed: std::time::Duration) { let secs = elapsed.as_secs_f64(); let mips = (total_instrs as f64) / secs / 1.0e6; println!( "{:<24} {:>12} instrs in {:>7.3}s = {:>7.2} MIPS", label, total_instrs, secs, mips ); } fn bench_tight_alu_loop() { const BASE: u32 = 0x1000; const N: u32 = 256; const TOTAL: u64 = 50_000_000; let mut mem = make_mem(BASE, 0x1000); // 256 × `addi r3, r3, 1` — pure register-register, no memory touch // beyond instruction fetch. let prog: Vec = (0..N).map(|_| enc_addi(3, 3, 1)).collect(); write_program(&mut mem, BASE, &prog); let mut ctx = PpcContext::new(); let mut cache = DecodeCache::new(); let elapsed = run_loop(&mut ctx, &mut mem, &mut cache, BASE, N, TOTAL); report("tight_alu_loop", TOTAL, elapsed); } fn bench_loadstore_loop() { const CODE_BASE: u32 = 0x1000; const DATA_BASE: u32 = 0x2000; const N: u32 = 256; const TOTAL: u64 = 30_000_000; let mut mem = make_mem(CODE_BASE, 0x2000); // 128 pairs of `stw r3, 0(r4); lwz r5, 0(r4)` — exercises every // load/store path through `read_u32`/`write_u32` (incl. `find_mmio`). let mut prog = Vec::with_capacity(N as usize); for _ in 0..(N / 2) { prog.push(enc_stw(3, 4, 0)); prog.push(enc_lwz(5, 4, 0)); } write_program(&mut mem, CODE_BASE, &prog); let mut ctx = PpcContext::new(); ctx.gpr[3] = 0xDEAD_BEEF; ctx.gpr[4] = DATA_BASE as u64; let mut cache = DecodeCache::new(); let elapsed = run_loop(&mut ctx, &mut mem, &mut cache, CODE_BASE, N, TOTAL); report("loadstore_loop", TOTAL, elapsed); } fn bench_mmio_storm() { const CODE_BASE: u32 = 0x1000; const MMIO_BASE: u32 = 0xEA00_0000; const N: u32 = 64; // MMIO is slower per access — keep total smaller so the bench stays // under a few seconds. const TOTAL: u64 = 2_000_000; let mut mem = make_mem(CODE_BASE, 0x1000); let writes = Arc::new(AtomicU64::new(0)); let reads = Arc::new(AtomicU32::new(0)); let writes_clone = writes.clone(); let reads_clone = reads.clone(); mem.add_mmio_region(MmioRegion { base_address: MMIO_BASE, mask: 0xFFFF_0000, size: 0x0001_0000, read_callback: Box::new(move |_a| { reads_clone.fetch_add(1, Ordering::Relaxed); 0 }), write_callback: Box::new(move |_a, _v| { writes_clone.fetch_add(1, Ordering::Relaxed); }), }); let mut prog = Vec::with_capacity(N as usize); for _ in 0..(N / 2) { prog.push(enc_stw(3, 4, 0)); prog.push(enc_lwz(5, 4, 0)); } write_program(&mut mem, CODE_BASE, &prog); let mut ctx = PpcContext::new(); ctx.gpr[3] = 0x1234_5678; ctx.gpr[4] = MMIO_BASE as u64; let mut cache = DecodeCache::new(); let elapsed = run_loop(&mut ctx, &mut mem, &mut cache, CODE_BASE, N, TOTAL); report("mmio_storm", TOTAL, elapsed); // Sanity assertions — silently catch a refactor that breaks MMIO dispatch. let w = writes.load(Ordering::Relaxed); let r = reads.load(Ordering::Relaxed); assert_eq!(w, TOTAL / 2, "expected MMIO writes to be dispatched"); assert_eq!(r as u64, TOTAL / 2, "expected MMIO reads to be dispatched"); } fn main() { println!("xenia-cpu interpreter bench"); println!(" build: {}", if cfg!(debug_assertions) { "debug" } else { "release" }); bench_tight_alu_loop(); bench_loadstore_loop(); bench_mmio_storm(); }