Files
xenia-rs/crates/xenia-cpu/src/block_cache.rs
MechaCat02 c36cca14f9 xenia-cpu: VMX128, FPSCR, decoder split, scheduler, decode/block caches
Split the monolithic interpreter into cohesive modules: dedicated
decoder (decoder.rs) producing 8-byte DecodedInstr; opcode tables
(opcode.rs); explicit traps (trap.rs); FPSCR helpers (fpscr.rs);
overflow/carry helpers (overflow.rs); a 4 KiB-page-versioned decode
cache and basic-block cache (block_cache.rs); and a full VMX/VMX128
implementation (vmx.rs) covering AltiVec + Xenon's 128-bit extensions.

Add the parallel-execution substrate behind --parallel: a 7-party
phaser (phaser.rs) for round-based barrier sync, ReservationTable
(reservation.rs) for guest LL/SC, and the per-HW-thread scheduler
core (scheduler.rs) that owns ThreadRefs, runqueues, and pending IRQs.

Disassembler is now the single source of truth: disasm.rs gains the
full base + extended + VMX128 mnemonic set, with golden JSON fixtures
and a disasm_goldens test suite. Add a criterion-style interpreter
bench. context.rs grows the per-thread state the new modules need
(reservation slot, FPSCR, vector regs).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-01 16:27:43 +02:00

424 lines
16 KiB
Rust

//! Tier-4 perf — basic-block cache for the PPC interpreter.
//!
//! `DecodeCache` (in [`crate::decoder`]) caches one decoded instruction
//! per slot, indexed by PC. The hot loop still pays the per-instruction
//! cost of fetching the raw word, hashing the PC into a slot, and
//! comparing tags. For straight-line code — common in the asset/inflate
//! loops where Sylpheed boot is currently CPU-bound — the savings of
//! batching N decoded instructions per slot lookup are linear in block
//! length.
//!
//! ## Shape
//!
//! A `DecodedBlock` is a contiguous run of decoded instructions starting
//! at `start_pc`, ending at the first *block terminator* (any branch,
//! `sc`, trap, or `Invalid`) or at one of two safety limits:
//!
//! - [`MAX_BLOCK_INSTRS`] caps memory growth and re-build cost.
//! - The 4 KiB guest page boundary. A block is always fully contained
//! inside a single 4 KiB guest page, so `mem.page_version(start_pc)`
//! alone is sufficient to detect any code-page rewrite that should
//! invalidate the block. Without this rule the cache would have to
//! check every spanned page on every hit, which would erase the win.
//!
//! ## Invalidation
//!
//! Each block stamps the page version at build time. On lookup, if
//! `mem.page_version(start_pc)` differs from `block.page_version`, the
//! slot is rebuilt. Same mechanism `DecodeCache` uses, just at
//! block granularity.
//!
//! ## Debugger semantics
//!
//! Block dispatch is **opt-in** by the caller. The hot loop in
//! `xenia-app/src/main.rs` selects the per-instruction path whenever
//! `Debugger::wants_hooks()` is true or any `--trace-*` flag is set.
//! That's how single-step, breakpoints, in-memory trace, instruction
//! trace, and branch trace continue to observe every PC: the block
//! cache simply never runs in those modes.
use crate::decoder::{decode, DecodedInstr};
use xenia_memory::MemoryAccess;
/// Number of slots in the direct-mapped block cache: 64 Ki entries,
/// the same shape as [`crate::decoder::DECODE_CACHE_SIZE`]. A block is
/// placed in the slot selected by the low 16 bits of `start_pc >> 2`.
/// With Sylpheed-class workloads the slot collision rate is reported
/// to be negligible.
const BLOCK_CACHE_SIZE: usize = 0x1_0000;
/// Bit mask that reduces `start_pc >> 2` to a valid slot index. Relies
/// on [`BLOCK_CACHE_SIZE`] being a power of two.
const BLOCK_CACHE_MASK: u32 = BLOCK_CACHE_SIZE as u32 - 1;
/// Upper bound on the number of instructions stored in one block.
///
/// The cap bounds the worst-case memory footprint of the cache and
/// limits how much work a rebuild costs after a code page is bumped.
/// 32 is generous for most basic blocks: the real-world average across
/// Sylpheed boot is reported as roughly 6 instructions between branches.
pub const MAX_BLOCK_INSTRS: usize = 32;
/// log2 of the guest page size (4 KiB pages).
const GUEST_PAGE_SHIFT: u32 = 12;
/// Guest page size in bytes. Duplicated here so `xenia-cpu` does not
/// have to pull in `xenia-memory::heap` internals; it must stay in sync
/// with the memory crate. Both values refer to the architectural
/// PowerPC 4 KiB page granule, so this constant is locked.
const GUEST_PAGE_SIZE: u32 = 1 << GUEST_PAGE_SHIFT;
/// Mask that clears the in-page offset bits of a guest address,
/// leaving the base address of its 4 KiB page.
const GUEST_PAGE_MASK: u32 = !0 << GUEST_PAGE_SHIFT;
/// A single cached basic block.
///
/// Blocks are owned by [`BlockCache`]. The interpreter receives a
/// `&DecodedBlock` from [`BlockCache::lookup_or_build`]; that borrow
/// remains valid until the next `lookup_or_build` touching the same
/// slot.
#[derive(Debug)]
pub struct DecodedBlock {
    /// Guest PC of the first instruction in the block. Doubles as the
    /// tag compared against the requested PC on lookup.
    pub start_pc: u32,
    /// Guest PC just past the final instruction in `instrs`, i.e. the
    /// last instruction's `addr` plus 4. This holds whether or not the
    /// block ended on a terminator. Handy for tracing / disassembly.
    pub end_pc: u32,
    /// Value of `mem.page_version(start_pc)` captured when the block
    /// was built. A different value at lookup time invalidates the
    /// block. One value is enough because a block never leaves the
    /// page it starts in.
    pub page_version: u64,
    /// The decoded instructions, in execution order. Never empty after
    /// a successful build: `MAX_BLOCK_INSTRS >= 1` and the build walk
    /// always pushes the first decoded word.
    pub instrs: Vec<DecodedInstr>,
}
/// Outcome of probing one slot during `lookup_or_build`. Private to
/// this module.
enum CacheStatus {
    /// The slot holds a block for `pc` whose recorded page version
    /// equals `mem.page_version(pc)`; it can be returned untouched.
    Hit,
    /// The slot holds a block for `pc`, but the page version has moved
    /// on since it was built; rebuild it and count an invalidation.
    Stale,
    /// The slot is empty or belongs to a different `start_pc`; build a
    /// new block and count a miss.
    Miss,
}
/// Direct-mapped cache of decoded basic blocks.
///
/// A single instance is shared across all HW slots, since block
/// contents depend only on the PC and are read-only once filled. The
/// type is not thread-safe: like `DecodeCache`, it is owned by the
/// single scheduler thread.
pub struct BlockCache {
    /// One optional heap-allocated block per cache slot.
    slots: Box<[Option<Box<DecodedBlock>>]>,
    /// Lookups served from an existing, still-valid block.
    hits: u64,
    /// Lookups that had to build a block (stale rebuilds included).
    misses: u64,
    /// Lookups that found a matching block with an outdated page version.
    invalidations: u64,
}
impl Default for BlockCache {
fn default() -> Self {
Self::new()
}
}
impl BlockCache {
    /// Create an empty cache: every slot is `None` and all counters
    /// start at zero.
    pub fn new() -> Self {
        // `Option<Box<T>>` is niche-optimized to pointer size, so on a
        // 64-bit host the 64 K empty slots cost ~512 KiB of cold
        // storage. Live blocks sit on the heap beyond that.
        let slots: Box<[Option<Box<DecodedBlock>>]> = std::iter::repeat_with(|| None)
            .take(BLOCK_CACHE_SIZE)
            .collect();
        Self {
            slots,
            hits: 0,
            misses: 0,
            invalidations: 0,
        }
    }

    /// Number of lookups answered from an already-cached, valid block.
    pub fn hits(&self) -> u64 {
        self.hits
    }

    /// Number of lookups that built a block, including stale rebuilds.
    pub fn misses(&self) -> u64 {
        self.misses
    }

    /// Number of lookups that found a block for the right PC whose page
    /// version was out of date.
    pub fn invalidations(&self) -> u64 {
        self.invalidations
    }

    /// Fetch the block that starts at `pc`, building it first when the
    /// slot is empty, holds a different PC, or holds a stale block.
    ///
    /// The returned reference borrows from the cache, so it stays
    /// valid until the next `lookup_or_build` call.
    pub fn lookup_or_build(&mut self, pc: u32, mem: &dyn MemoryAccess) -> &DecodedBlock {
        let slot_index = ((pc >> 2) & BLOCK_CACHE_MASK) as usize;
        let live_version = mem.page_version(pc);

        // Step 1: classify the slot. Only a plain enum value leaves
        // this match, so the shared borrow of `self.slots` is over
        // before the counters and the slot are mutated below.
        let status = match self.slots[slot_index].as_deref() {
            None => CacheStatus::Miss,
            Some(cached) if cached.start_pc != pc => CacheStatus::Miss,
            Some(cached) if cached.page_version != live_version => CacheStatus::Stale,
            Some(_) => CacheStatus::Hit,
        };

        // Step 2: update the statistics and decide whether to refill.
        let needs_rebuild = match status {
            CacheStatus::Hit => {
                self.hits += 1;
                false
            }
            CacheStatus::Stale => {
                // A stale block counts as both an invalidation and a miss.
                self.invalidations += 1;
                self.misses += 1;
                true
            }
            CacheStatus::Miss => {
                self.misses += 1;
                true
            }
        };

        // Step 3: refill the slot if required.
        if needs_rebuild {
            let fresh = build_block(pc, mem, live_version);
            self.slots[slot_index] = Some(Box::new(fresh));
        }

        // The slot is populated on every path: a hit left the existing
        // block in place, and a miss/stale just stored a new one.
        self.slots[slot_index]
            .as_deref()
            .expect("block freshly built or hit")
    }
}
/// Decode instructions sequentially from `start_pc` and gather them
/// into a [`DecodedBlock`] stamped with `page_version`.
///
/// Decoding stops after the first instruction for which any of these
/// holds:
/// - its opcode reports `terminates_block()` (the terminator itself is
///   kept as the block's last instruction),
/// - the block now holds [`MAX_BLOCK_INSTRS`] instructions,
/// - the following PC lies in a different 4 KiB guest page than
///   `start_pc`.
///
/// At least one instruction is always decoded, so the returned block
/// is never empty.
fn build_block(start_pc: u32, mem: &dyn MemoryAccess, page_version: u64) -> DecodedBlock {
    let home_page = start_pc & GUEST_PAGE_MASK;
    let mut instrs = Vec::<DecodedInstr>::with_capacity(8);
    let mut pc = start_pc;

    loop {
        let instr = decode(mem.read_u32(pc), pc);
        // Ask before the push: the instruction is moved into the vector.
        let is_terminator = instr.opcode.terminates_block();
        instrs.push(instr);

        let following = pc.wrapping_add(4);
        let keep_going = !is_terminator
            && instrs.len() < MAX_BLOCK_INSTRS
            && (following & GUEST_PAGE_MASK) == home_page;
        if !keep_going {
            break;
        }
        pc = following;
    }

    let end_pc = instrs
        .last()
        .expect("build pushes at least one instruction")
        .addr
        .wrapping_add(4);

    DecodedBlock {
        start_pc,
        end_pc,
        page_version,
        instrs,
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::opcode::PpcOpcode;
    use std::cell::Cell;

    /// Flat 64 KiB guest memory with big-endian multi-byte accesses.
    ///
    /// Mirrors `interpreter::tests::TestMem`, but is defined locally so
    /// the block-cache tests do not depend on interpreter internals.
    struct BlockTestMem {
        data: Box<[Cell<u8>]>,
        version_a: u64,
        version_b: u64,
        // Base address of the one page that reports `version_b`; all
        // other pages report `version_a`. Lets the invalidation test
        // model an out-of-band page-version bump without going through
        // write_*.
        bumped_page: Cell<Option<u32>>,
    }

    impl BlockTestMem {
        fn new() -> Self {
            Self {
                data: std::iter::repeat_with(|| Cell::new(0))
                    .take(0x10000)
                    .collect(),
                version_a: 1,
                version_b: 2,
                bumped_page: Cell::new(None),
            }
        }

        /// Read the byte located `offset` bytes past `base`.
        fn byte(&self, base: u32, offset: usize) -> u8 {
            self.data[base as usize + offset].get()
        }

        /// Store `bytes` at consecutive addresses starting at `addr`.
        fn store(&self, addr: u32, bytes: &[u8]) {
            let base = addr as usize;
            for (offset, byte) in bytes.iter().enumerate() {
                self.data[base + offset].set(*byte);
            }
        }

        /// Place the instruction word `raw` at `addr`, big-endian.
        /// Does not touch the page version.
        fn put(&self, addr: u32, raw: u32) {
            self.store(addr, &raw.to_be_bytes());
        }
    }

    impl MemoryAccess for BlockTestMem {
        fn read_u8(&self, a: u32) -> u8 {
            self.byte(a, 0)
        }

        fn read_u16(&self, a: u32) -> u16 {
            u16::from_be_bytes([self.byte(a, 0), self.byte(a, 1)])
        }

        fn read_u32(&self, a: u32) -> u32 {
            u32::from_be_bytes([
                self.byte(a, 0),
                self.byte(a, 1),
                self.byte(a, 2),
                self.byte(a, 3),
            ])
        }

        fn read_u64(&self, a: u32) -> u64 {
            u64::from_be_bytes([
                self.byte(a, 0),
                self.byte(a, 1),
                self.byte(a, 2),
                self.byte(a, 3),
                self.byte(a, 4),
                self.byte(a, 5),
                self.byte(a, 6),
                self.byte(a, 7),
            ])
        }

        fn write_u8(&self, a: u32, v: u8) {
            self.store(a, &[v]);
        }

        fn write_u16(&self, a: u32, v: u16) {
            self.store(a, &v.to_be_bytes());
        }

        fn write_u32(&self, a: u32, v: u32) {
            self.store(a, &v.to_be_bytes());
        }

        fn write_u64(&self, a: u32, v: u64) {
            self.store(a, &v.to_be_bytes());
        }

        fn translate(&self, _: u32) -> Option<*const u8> {
            None
        }

        fn translate_mut(&self, _: u32) -> Option<*mut u8> {
            None
        }

        fn page_version(&self, addr: u32) -> u64 {
            let page = addr & GUEST_PAGE_MASK;
            match self.bumped_page.get() {
                Some(bumped) if bumped == page => self.version_b,
                _ => self.version_a,
            }
        }
    }

    // PPC encodings: only the handful these tests need.

    /// `addi rd, ra, simm` (primary opcode 14).
    fn enc_addi(rd: u32, ra: u32, simm: i16) -> u32 {
        let imm = simm as u16 as u32;
        (14 << 26) | (rd << 21) | (ra << 16) | imm
    }

    /// `b 0`: branch to self (LI=0). Opcode=18, AA=0, LK=0.
    fn enc_b_self() -> u32 {
        18 << 26
    }

    /// Raw word 0 (primary opcode 0), which the decoder maps to
    /// `Invalid`.
    fn enc_unimplemented() -> u32 {
        0
    }

    #[test]
    fn block_built_to_terminator() {
        let mem = BlockTestMem::new();
        for addr in [0x100u32, 0x104, 0x108].iter() {
            mem.put(*addr, enc_addi(3, 3, 1));
        }
        mem.put(0x10C, enc_b_self()); // terminator

        let mut cache = BlockCache::new();
        let block = cache.lookup_or_build(0x100, &mem);

        assert_eq!(block.start_pc, 0x100);
        assert_eq!(block.instrs.len(), 4);
        assert_eq!(block.instrs.last().unwrap().opcode, PpcOpcode::bx);
        assert_eq!(block.end_pc, 0x110);
    }

    #[test]
    fn block_stops_at_page_boundary() {
        // Start at 0x1FFC: the next PC (0x2000) belongs to a different
        // 4 KiB page, so the block must hold exactly one instruction.
        let mem = BlockTestMem::new();
        mem.put(0x1FFC, enc_addi(3, 3, 1));
        mem.put(0x2000, enc_addi(3, 3, 1));

        let mut cache = BlockCache::new();
        let block = cache.lookup_or_build(0x1FFC, &mem);

        assert_eq!(block.instrs.len(), 1);
        assert_eq!(block.end_pc, 0x2000);
    }

    #[test]
    fn block_stops_at_max_len() {
        // 64 back-to-back non-terminators on a single page: the block
        // must be clamped to MAX_BLOCK_INSTRS.
        let mem = BlockTestMem::new();
        for addr in (0x100u32..).step_by(4).take(64) {
            mem.put(addr, enc_addi(3, 3, 1));
        }

        let mut cache = BlockCache::new();
        let block = cache.lookup_or_build(0x100, &mem);

        assert_eq!(block.instrs.len(), MAX_BLOCK_INSTRS);
        assert_eq!(block.end_pc, 0x100 + (MAX_BLOCK_INSTRS as u32) * 4);
    }

    #[test]
    fn block_stops_at_invalid_opcode() {
        // An `Invalid` decode is treated as a block terminator so the
        // per-instruction Unimplemented path is preserved.
        let mem = BlockTestMem::new();
        mem.put(0x100, enc_addi(3, 3, 1));
        mem.put(0x104, enc_unimplemented());
        mem.put(0x108, enc_addi(3, 3, 1));

        let mut cache = BlockCache::new();
        let block = cache.lookup_or_build(0x100, &mem);

        assert_eq!(block.instrs.len(), 2);
        assert_eq!(block.instrs.last().unwrap().opcode, PpcOpcode::Invalid);
    }

    #[test]
    fn block_invalidates_on_page_version_bump() {
        let mem = BlockTestMem::new();
        mem.put(0x100, enc_addi(3, 3, 1));
        mem.put(0x104, enc_b_self());
        let mut cache = BlockCache::new();

        // Cold lookup: one miss, no hits.
        let _ = cache.lookup_or_build(0x100, &mem);
        assert_eq!(cache.misses(), 1);
        assert_eq!(cache.hits(), 0);

        // Identical lookup: served from the cache.
        let _ = cache.lookup_or_build(0x100, &mem);
        assert_eq!(cache.hits(), 1);
        assert_eq!(cache.invalidations(), 0);

        // Bump the version of the page containing 0x100; the next
        // lookup must invalidate and rebuild.
        mem.bumped_page.set(Some(0x100 & GUEST_PAGE_MASK));
        let _ = cache.lookup_or_build(0x100, &mem);
        assert_eq!(cache.invalidations(), 1);
        assert_eq!(cache.misses(), 2);
    }

    #[test]
    fn block_hit_returns_same_contents() {
        // Sanity check: a hit returns the ORIGINAL decoded stream even
        // after the underlying bytes are poked without a version bump.
        // (No real workload would do this; it only confirms that cached
        // data is returned rather than memory being re-read.)
        let mem = BlockTestMem::new();
        mem.put(0x100, enc_addi(3, 3, 7));
        mem.put(0x104, enc_b_self());
        let mut cache = BlockCache::new();

        let first_simm = cache.lookup_or_build(0x100, &mem).instrs[0].simm16();

        // Overwrite the first instruction; `put` leaves the page
        // version untouched (test-only path).
        mem.put(0x100, enc_addi(3, 3, 99));

        let cached_simm = cache.lookup_or_build(0x100, &mem).instrs[0].simm16();

        assert_eq!(first_simm, 7);
        assert_eq!(cached_simm, 7, "cache must serve original decoded form");
    }
}