xenia-cpu: VMX128, FPSCR, decoder split, scheduler, decode/block caches
Split the monolithic interpreter into cohesive modules: dedicated decoder (decoder.rs) producing 8-byte DecodedInstr; opcode tables (opcode.rs); explicit traps (trap.rs); FPSCR helpers (fpscr.rs); overflow/carry helpers (overflow.rs); a 4 KiB-page-versioned decode cache and basic-block cache (block_cache.rs); and a full VMX/VMX128 implementation (vmx.rs) covering AltiVec + Xenon's 128-bit extensions. Add the parallel-execution substrate behind --parallel: a 7-party phaser (phaser.rs) for round-based barrier sync, ReservationTable (reservation.rs) for guest LL/SC, and the per-HW-thread scheduler core (scheduler.rs) that owns ThreadRefs, runqueues, and pending IRQs. Disassembler is now the single source of truth: disasm.rs gains the full base + extended + VMX128 mnemonic set, with golden JSON fixtures and a disasm_goldens test suite. Add a criterion-style interpreter bench. context.rs grows the per-thread state the new modules need (reservation slot, FPSCR, vector regs). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
423
crates/xenia-cpu/src/block_cache.rs
Normal file
423
crates/xenia-cpu/src/block_cache.rs
Normal file
@@ -0,0 +1,423 @@
|
||||
//! Tier-4 perf — basic-block cache for the PPC interpreter.
//!
//! `DecodeCache` (in [`crate::decoder`]) caches one decoded instruction
//! per slot, indexed by PC. The hot loop still pays the per-instruction
//! cost of fetching the raw word, hashing the PC into a slot, and
//! comparing tags. For straight-line code — common in the asset/inflate
//! loops where Sylpheed boot is currently CPU-bound — the savings of
//! batching N decoded instructions per slot lookup are linear in block
//! length.
//!
//! ## Shape
//!
//! A `DecodedBlock` is a contiguous run of decoded instructions starting
//! at `start_pc`, ending at the first *block terminator* (any branch,
//! `sc`, trap, or `Invalid`) or at one of two safety limits:
//!
//! - [`MAX_BLOCK_INSTRS`] caps memory growth and re-build cost.
//! - A 4 KiB page-boundary stop. A block is fully contained inside a
//!   single 4 KiB guest page; that means `mem.page_version(start_pc)`
//!   is sufficient to detect any code-page rewrite that should
//!   invalidate the block. Without this rule the cache would have to
//!   walk every spanned page on every hit, which would erase the win.
//!
//! ## Invalidation
//!
//! Each block stamps the page version at build time. On lookup, if
//! `mem.page_version(start_pc)` differs from `block.page_version`, the
//! slot is rebuilt. This is the same mechanism `DecodeCache` uses, just
//! at block granularity.
//!
//! ## Debugger semantics
//!
//! Block dispatch is **opt-in** by the caller. The hot loop in
//! `xenia-app/src/main.rs` selects the per-instruction path whenever
//! `Debugger::wants_hooks()` is true or any `--trace-*` flag is set.
//! That's how single-step, breakpoints, in-memory trace, instruction
//! trace, and branch trace continue to observe every PC: the block
//! cache simply never runs in those modes.
|
||||
|
||||
use crate::decoder::{decode, DecodedInstr};
|
||||
use xenia_memory::MemoryAccess;
|
||||
|
||||
/// Number of slots in the direct-mapped block cache: 64 K, selected by
/// the low 16 bits of `start_pc >> 2`. Mirrors the shape of
/// [`crate::decoder::DECODE_CACHE_SIZE`]. With Sylpheed-class workloads
/// the slot collision rate is negligible.
const BLOCK_CACHE_SIZE: usize = 64 * 1024;

/// Bit mask that reduces `start_pc >> 2` to a slot index.
const BLOCK_CACHE_MASK: u32 = BLOCK_CACHE_SIZE as u32 - 1;

/// Upper bound on the number of instructions stored in one block.
///
/// Bounds the worst-case memory footprint and the cost of rebuilding a
/// block after its code page is bumped. 32 is generous for most basic
/// blocks (real-world average across Sylpheed boot is ~6 instructions
/// between branches).
pub const MAX_BLOCK_INSTRS: usize = 32;

/// Guest page size in bytes.
///
/// Duplicated here so `xenia-cpu` does not have to reach into
/// `xenia-memory::heap` internals; it must stay in sync with the memory
/// crate. Both refer to the architectural PowerPC 4 KiB page granule, so
/// this constant is locked.
const GUEST_PAGE_SIZE: u32 = 4 * 1024;

/// Mask that keeps the page-base bits of a guest address and clears the
/// offset within the page.
const GUEST_PAGE_MASK: u32 = !(GUEST_PAGE_SIZE - 1);
|
||||
|
||||
/// One cached basic block. Owned by [`BlockCache`]; a `&DecodedBlock`
|
||||
/// is handed to the interpreter via [`BlockCache::lookup_or_build`] and
|
||||
/// stays valid until the next `lookup_or_build` on the same slot.
|
||||
#[derive(Debug)]
|
||||
pub struct DecodedBlock {
|
||||
/// Guest PC at which this block starts. Used as the slot tag.
|
||||
pub start_pc: u32,
|
||||
/// Guest PC immediately after the last instruction in `instrs`.
|
||||
/// Equal to `instrs.last().addr + 4` whether or not the block
|
||||
/// ended on a terminator. Useful for tracing / disassembly.
|
||||
pub end_pc: u32,
|
||||
/// `mem.page_version(start_pc)` at build time. Mismatch on lookup
|
||||
/// invalidates the block. Single value because every block is
|
||||
/// page-bounded by construction.
|
||||
pub page_version: u64,
|
||||
/// Decoded instructions in execution order. Always non-empty after
|
||||
/// a successful build (`MAX_BLOCK_INSTRS >= 1` and the build walk
|
||||
/// pushes the first decoded word unconditionally).
|
||||
pub instrs: Vec<DecodedInstr>,
|
||||
}
|
||||
|
||||
/// Outcome of probing one slot during `lookup_or_build`. Private to this
/// module.
enum CacheStatus {
    /// The slot holds a block tagged with `pc` whose recorded page
    /// version equals `mem.page_version(pc)`; it is served unchanged.
    Hit,
    /// The slot holds a block tagged with `pc`, but the page version has
    /// moved on since it was built. The block is rebuilt and
    /// `invalidations` is bumped.
    Stale,
    /// The slot is empty, or its block is tagged with a different
    /// `start_pc`. A new block is built and `misses` is bumped.
    Miss,
}
|
||||
|
||||
/// Direct-mapped block cache. One instance shared across all HW slots
|
||||
/// (block contents are PC-only and read-only after fill). Not
|
||||
/// thread-safe — owner is the single scheduler thread, same as
|
||||
/// `DecodeCache`.
|
||||
pub struct BlockCache {
|
||||
slots: Box<[Option<Box<DecodedBlock>>]>,
|
||||
hits: u64,
|
||||
misses: u64,
|
||||
invalidations: u64,
|
||||
}
|
||||
|
||||
impl Default for BlockCache {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
impl BlockCache {
|
||||
pub fn new() -> Self {
|
||||
// `Option<Box<T>>` is a niche-optimized 8-byte slot; 64 K of
|
||||
// them cost ~512 KiB of cold storage. Live blocks beyond that
|
||||
// sit on the heap.
|
||||
let mut v: Vec<Option<Box<DecodedBlock>>> = Vec::with_capacity(BLOCK_CACHE_SIZE);
|
||||
v.resize_with(BLOCK_CACHE_SIZE, || None);
|
||||
Self {
|
||||
slots: v.into_boxed_slice(),
|
||||
hits: 0,
|
||||
misses: 0,
|
||||
invalidations: 0,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn hits(&self) -> u64 {
|
||||
self.hits
|
||||
}
|
||||
pub fn misses(&self) -> u64 {
|
||||
self.misses
|
||||
}
|
||||
pub fn invalidations(&self) -> u64 {
|
||||
self.invalidations
|
||||
}
|
||||
|
||||
/// Return the cached block starting at `pc`, building it if absent
|
||||
/// or stale. The returned reference is borrowed from the cache and
|
||||
/// stays valid until the next `lookup_or_build` call.
|
||||
pub fn lookup_or_build(&mut self, pc: u32, mem: &dyn MemoryAccess) -> &DecodedBlock {
|
||||
let idx = ((pc >> 2) & BLOCK_CACHE_MASK) as usize;
|
||||
let cur_pv = mem.page_version(pc);
|
||||
|
||||
// Phase 1: classify the slot. Borrow ends before fill so the
|
||||
// mutable update below doesn't conflict.
|
||||
let status = match &self.slots[idx] {
|
||||
Some(b) if b.start_pc == pc && b.page_version == cur_pv => CacheStatus::Hit,
|
||||
Some(b) if b.start_pc == pc => CacheStatus::Stale,
|
||||
_ => CacheStatus::Miss,
|
||||
};
|
||||
|
||||
// Phase 2: fill on miss/stale, account.
|
||||
match status {
|
||||
CacheStatus::Hit => {
|
||||
self.hits += 1;
|
||||
}
|
||||
CacheStatus::Stale => {
|
||||
self.invalidations += 1;
|
||||
self.misses += 1;
|
||||
let block = build_block(pc, mem, cur_pv);
|
||||
self.slots[idx] = Some(Box::new(block));
|
||||
}
|
||||
CacheStatus::Miss => {
|
||||
self.misses += 1;
|
||||
let block = build_block(pc, mem, cur_pv);
|
||||
self.slots[idx] = Some(Box::new(block));
|
||||
}
|
||||
}
|
||||
|
||||
// Slot is guaranteed populated at this point — Hit returned a
|
||||
// pre-existing block, Miss/Stale just wrote a new one.
|
||||
self.slots[idx]
|
||||
.as_deref()
|
||||
.expect("block freshly built or hit")
|
||||
}
|
||||
}
|
||||
|
||||
/// Walk forward from `pc`, decoding instructions and collecting them
|
||||
/// into a `DecodedBlock`. The walk stops on the first of:
|
||||
/// - a [`PpcOpcode::terminates_block`] true (the terminator IS
|
||||
/// included as the last instruction),
|
||||
/// - reaching [`MAX_BLOCK_INSTRS`],
|
||||
/// - the next PC would cross a 4 KiB guest page boundary.
|
||||
fn build_block(start_pc: u32, mem: &dyn MemoryAccess, page_version: u64) -> DecodedBlock {
|
||||
let mut instrs: Vec<DecodedInstr> = Vec::with_capacity(8);
|
||||
let page_base = start_pc & GUEST_PAGE_MASK;
|
||||
let mut cur = start_pc;
|
||||
|
||||
loop {
|
||||
let raw = mem.read_u32(cur);
|
||||
let decoded = decode(raw, cur);
|
||||
let terminates = decoded.opcode.terminates_block();
|
||||
instrs.push(decoded);
|
||||
|
||||
if terminates {
|
||||
break;
|
||||
}
|
||||
if instrs.len() >= MAX_BLOCK_INSTRS {
|
||||
break;
|
||||
}
|
||||
let next = cur.wrapping_add(4);
|
||||
if (next & GUEST_PAGE_MASK) != page_base {
|
||||
break;
|
||||
}
|
||||
cur = next;
|
||||
}
|
||||
|
||||
let last = instrs.last().expect("build pushes at least one instruction");
|
||||
let end_pc = last.addr.wrapping_add(4);
|
||||
|
||||
DecodedBlock {
|
||||
start_pc,
|
||||
end_pc,
|
||||
page_version,
|
||||
instrs,
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use crate::opcode::PpcOpcode;

    use std::cell::Cell;

    /// Flat 64 KiB guest memory with big-endian accessors.
    ///
    /// Patterned after `interpreter::tests::TestMem`, but kept local so
    /// these tests do not depend on interpreter internals.
    struct BlockTestMem {
        bytes: Box<[Cell<u8>]>,
        /// Version reported for every page except the bumped one.
        base_version: u64,
        /// Version reported for the page recorded in `bumped_page`.
        bumped_version: u64,
        /// Base address of the page that reports `bumped_version`.
        /// Lets the invalidation test simulate an out-of-band version
        /// bump without routing through the `write_*` methods.
        bumped_page: Cell<Option<u32>>,
    }

    impl BlockTestMem {
        fn new() -> Self {
            Self {
                bytes: vec![0u8; 0x10000].into_iter().map(Cell::new).collect(),
                base_version: 1,
                bumped_version: 2,
                bumped_page: Cell::new(None),
            }
        }

        /// Copy `src` into memory starting at `addr`, one byte at a time.
        fn store(&self, addr: u32, src: &[u8]) {
            let base = addr as usize;
            for (offset, byte) in src.iter().enumerate() {
                self.bytes[base + offset].set(*byte);
            }
        }

        /// Read `width` bytes starting at `addr` and combine them as a
        /// big-endian unsigned integer.
        fn load_be(&self, addr: u32, width: usize) -> u64 {
            let base = addr as usize;
            (0..width).fold(0u64, |acc, offset| {
                (acc << 8) | u64::from(self.bytes[base + offset].get())
            })
        }

        /// Place one raw instruction word at `addr` (big-endian).
        fn put(&self, addr: u32, raw: u32) {
            self.store(addr, &raw.to_be_bytes());
        }
    }

    impl MemoryAccess for BlockTestMem {
        fn read_u8(&self, a: u32) -> u8 {
            self.bytes[a as usize].get()
        }
        fn read_u16(&self, a: u32) -> u16 {
            self.load_be(a, 2) as u16
        }
        fn read_u32(&self, a: u32) -> u32 {
            self.load_be(a, 4) as u32
        }
        fn read_u64(&self, a: u32) -> u64 {
            self.load_be(a, 8)
        }
        fn write_u8(&self, a: u32, v: u8) {
            self.bytes[a as usize].set(v);
        }
        fn write_u16(&self, a: u32, v: u16) {
            self.store(a, &v.to_be_bytes());
        }
        fn write_u32(&self, a: u32, v: u32) {
            self.store(a, &v.to_be_bytes());
        }
        fn write_u64(&self, a: u32, v: u64) {
            self.store(a, &v.to_be_bytes());
        }
        fn translate(&self, _: u32) -> Option<*const u8> {
            None
        }
        fn translate_mut(&self, _: u32) -> Option<*mut u8> {
            None
        }
        fn page_version(&self, addr: u32) -> u64 {
            match self.bumped_page.get() {
                Some(page) if page == addr & GUEST_PAGE_MASK => self.bumped_version,
                _ => self.base_version,
            }
        }
    }

    // Just enough PPC encodings for the tests below.

    /// `addi rd, ra, simm` — primary opcode 14.
    fn enc_addi(rd: u32, ra: u32, simm: i16) -> u32 {
        let imm = u32::from(simm as u16);
        (14 << 26) | (rd << 21) | (ra << 16) | imm
    }

    /// `b 0` — branch to self (LI=0). Opcode=18, AA=0, LK=0.
    fn enc_b_self() -> u32 {
        18 << 26
    }

    /// Raw word 0 (primary opcode 0), which the decoder maps to `Invalid`.
    fn enc_unimplemented() -> u32 {
        0
    }

    #[test]
    fn block_built_to_terminator() {
        let mem = BlockTestMem::new();
        for k in 0..3u32 {
            mem.put(0x100 + 4 * k, enc_addi(3, 3, 1));
        }
        mem.put(0x10C, enc_b_self()); // terminator

        let mut cache = BlockCache::new();
        let block = cache.lookup_or_build(0x100, &mem);

        assert_eq!(block.start_pc, 0x100);
        assert_eq!(block.instrs.len(), 4);
        assert_eq!(block.instrs.last().unwrap().opcode, PpcOpcode::bx);
        assert_eq!(block.end_pc, 0x110);
    }

    #[test]
    fn block_stops_at_page_boundary() {
        // Starting at 0x1FFC, the following PC (0x2000) belongs to the
        // next 4 KiB page, so the block may hold only one instruction.
        let mem = BlockTestMem::new();
        for addr in [0x1FFCu32, 0x2000] {
            mem.put(addr, enc_addi(3, 3, 1));
        }

        let mut cache = BlockCache::new();
        let block = cache.lookup_or_build(0x1FFC, &mem);

        assert_eq!(block.instrs.len(), 1);
        assert_eq!(block.end_pc, 0x2000);
    }

    #[test]
    fn block_stops_at_max_len() {
        // 64 back-to-back non-terminators inside one page: the block
        // must be cut off at MAX_BLOCK_INSTRS.
        let mem = BlockTestMem::new();
        (0..64u32).for_each(|k| mem.put(0x100 + k * 4, enc_addi(3, 3, 1)));

        let mut cache = BlockCache::new();
        let block = cache.lookup_or_build(0x100, &mem);

        assert_eq!(block.instrs.len(), MAX_BLOCK_INSTRS);
        assert_eq!(block.end_pc, 0x100 + (MAX_BLOCK_INSTRS as u32) * 4);
    }

    #[test]
    fn block_stops_at_invalid_opcode() {
        // An instruction decoded as `Invalid` ends the block, which keeps
        // the per-instruction Unimplemented path intact.
        let mem = BlockTestMem::new();
        let program = [enc_addi(3, 3, 1), enc_unimplemented(), enc_addi(3, 3, 1)];
        for (k, word) in program.iter().enumerate() {
            mem.put(0x100 + 4 * k as u32, *word);
        }

        let mut cache = BlockCache::new();
        let block = cache.lookup_or_build(0x100, &mem);

        assert_eq!(block.instrs.len(), 2);
        assert_eq!(block.instrs.last().unwrap().opcode, PpcOpcode::Invalid);
    }

    #[test]
    fn block_invalidates_on_page_version_bump() {
        let mem = BlockTestMem::new();
        mem.put(0x100, enc_addi(3, 3, 1));
        mem.put(0x104, enc_b_self());
        let mut cache = BlockCache::new();

        // First lookup builds the block.
        let _ = cache.lookup_or_build(0x100, &mem);
        assert_eq!(cache.misses(), 1);
        assert_eq!(cache.hits(), 0);

        // Repeating the lookup is served from the cache.
        let _ = cache.lookup_or_build(0x100, &mem);
        assert_eq!(cache.hits(), 1);
        assert_eq!(cache.invalidations(), 0);

        // Advance the version of the page holding 0x100; the next lookup
        // has to discard the block and build it again.
        mem.bumped_page.set(Some(0x100 & GUEST_PAGE_MASK));
        let _ = cache.lookup_or_build(0x100, &mem);
        assert_eq!(cache.invalidations(), 1);
        assert_eq!(cache.misses(), 2);
    }

    #[test]
    fn block_hit_returns_same_contents() {
        // Sanity check that a hit serves the ORIGINAL decoded stream even
        // after the backing bytes change without a version bump. No real
        // workload does this; it only proves the data comes from the
        // cache and is not re-read from memory.
        let mem = BlockTestMem::new();
        mem.put(0x100, enc_addi(3, 3, 7));
        mem.put(0x104, enc_b_self());
        let mut cache = BlockCache::new();

        let first_simm = cache.lookup_or_build(0x100, &mem).instrs[0].simm16();

        // Overwrite the first word; `put` leaves the page version alone
        // (test-only path).
        mem.put(0x100, enc_addi(3, 3, 99));

        let cached_simm = cache.lookup_or_build(0x100, &mem).instrs[0].simm16();
        assert_eq!(first_simm, 7);
        assert_eq!(cached_simm, 7, "cache must serve original decoded form");
    }
}
|
||||
Reference in New Issue
Block a user