//! Tier-4 perf — basic-block cache for the PPC interpreter. //! //! `DecodeCache` (in [`crate::decoder`]) caches one decoded instruction //! per slot, indexed by PC. The hot loop still pays the per-instruction //! cost of fetching the raw word, hashing the PC into a slot, and //! comparing tags. For straight-line code — common in the asset/inflate //! loops where Sylpheed boot is currently CPU-bound — the savings of //! batching N decoded instructions per slot lookup are linear in block //! length. //! //! ## Shape //! //! A `DecodedBlock` is a contiguous run of decoded instructions starting //! at `start_pc`, ending at the first *block terminator* (any branch, //! `sc`, trap, or `Invalid`) or at one of two safety limits: //! //! - [`MAX_BLOCK_INSTRS`] caps memory growth and re-build cost. //! - 4 KiB page boundary stop. A block is fully contained inside a //! single 4 KiB guest page; that means `mem.page_version(start_pc)` //! is sufficient to detect any code-page rewrite that should //! invalidate the block. Without this rule the cache would have to //! walk every spanned page on every hit, which would erase the win. //! //! ## Invalidation //! //! Each block stamps the page version at build time. On lookup, if //! `mem.page_version(start_pc)` differs from `block.page_version`, the //! slot is rebuilt. Same mechanism `DecodeCache` uses, just at //! block granularity. //! //! ## Debugger semantics //! //! Block dispatch is **opt-in** by the caller. The hot loop in //! `xenia-app/src/main.rs` selects the per-instruction path whenever //! `Debugger::wants_hooks()` is true or any `--trace-*` flag is set. //! That's how single-step, breakpoints, in-memory trace, instruction //! trace, and branch trace continue to observe every PC: the block //! cache simply never runs in those modes. use crate::decoder::{decode, DecodedInstr}; use xenia_memory::MemoryAccess; /// Direct-mapped block-cache slot count. Same shape as /// [`crate::decoder::DECODE_CACHE_SIZE`] — 64 K slots indexed by the /// low 16 bits of `start_pc >> 2`. With Sylpheed-class workloads the /// slot collision rate is negligible. const BLOCK_CACHE_SIZE: usize = 1 << 16; const BLOCK_CACHE_MASK: u32 = (BLOCK_CACHE_SIZE - 1) as u32; /// Hard cap on instructions per block. Keeps the worst-case memory /// footprint bounded and limits the rebuild cost when a code page /// gets bumped. 32 instructions is generous for most basic blocks /// (real-world average across Sylpheed boot is ~6 between branches). pub const MAX_BLOCK_INSTRS: usize = 32; /// Guest page size — duplicated here to avoid pulling /// `xenia-memory::heap` internals into `xenia-cpu`. Must stay in sync /// with the memory crate. Both refer to the architectural PowerPC 4 KiB /// page granule, so this constant is locked. const GUEST_PAGE_SIZE: u32 = 4096; const GUEST_PAGE_MASK: u32 = !(GUEST_PAGE_SIZE - 1); /// One cached basic block. Owned by [`BlockCache`]; a `&DecodedBlock` /// is handed to the interpreter via [`BlockCache::lookup_or_build`] and /// stays valid until the next `lookup_or_build` on the same slot. #[derive(Debug)] pub struct DecodedBlock { /// Guest PC at which this block starts. Used as the slot tag. pub start_pc: u32, /// Guest PC immediately after the last instruction in `instrs`. /// Equal to `instrs.last().addr + 4` whether or not the block /// ended on a terminator. Useful for tracing / disassembly. pub end_pc: u32, /// `mem.page_version(start_pc)` at build time. Mismatch on lookup /// invalidates the block. Single value because every block is /// page-bounded by construction. pub page_version: u64, /// Decoded instructions in execution order. Always non-empty after /// a successful build (`MAX_BLOCK_INSTRS >= 1` and the build walk /// pushes the first decoded word unconditionally). pub instrs: Vec, } /// Per-slot status from a `lookup_or_build` probe. Internal only. enum CacheStatus { /// Block at this slot matches `pc` and the page version at build /// time matches `mem.page_version(pc)` — return as-is. Hit, /// Block at this slot matched `pc` but the page version has /// advanced — rebuild and bump `invalidations`. Stale, /// Slot is empty or holds a block keyed at a different `start_pc`. /// Build a fresh block and bump `misses`. Miss, } /// Direct-mapped block cache. One instance shared across all HW slots /// (block contents are PC-only and read-only after fill). Not /// thread-safe — owner is the single scheduler thread, same as /// `DecodeCache`. pub struct BlockCache { slots: Box<[Option>]>, hits: u64, misses: u64, invalidations: u64, } impl Default for BlockCache { fn default() -> Self { Self::new() } } impl BlockCache { pub fn new() -> Self { // `Option>` is a niche-optimized 8-byte slot; 64 K of // them cost ~512 KiB of cold storage. Live blocks beyond that // sit on the heap. let mut v: Vec>> = Vec::with_capacity(BLOCK_CACHE_SIZE); v.resize_with(BLOCK_CACHE_SIZE, || None); Self { slots: v.into_boxed_slice(), hits: 0, misses: 0, invalidations: 0, } } pub fn hits(&self) -> u64 { self.hits } pub fn misses(&self) -> u64 { self.misses } pub fn invalidations(&self) -> u64 { self.invalidations } /// Return the cached block starting at `pc`, building it if absent /// or stale. The returned reference is borrowed from the cache and /// stays valid until the next `lookup_or_build` call. pub fn lookup_or_build(&mut self, pc: u32, mem: &dyn MemoryAccess) -> &DecodedBlock { let idx = ((pc >> 2) & BLOCK_CACHE_MASK) as usize; let cur_pv = mem.page_version(pc); // Phase 1: classify the slot. Borrow ends before fill so the // mutable update below doesn't conflict. let status = match &self.slots[idx] { Some(b) if b.start_pc == pc && b.page_version == cur_pv => CacheStatus::Hit, Some(b) if b.start_pc == pc => CacheStatus::Stale, _ => CacheStatus::Miss, }; // Phase 2: fill on miss/stale, account. match status { CacheStatus::Hit => { self.hits += 1; } CacheStatus::Stale => { self.invalidations += 1; self.misses += 1; let block = build_block(pc, mem, cur_pv); self.slots[idx] = Some(Box::new(block)); } CacheStatus::Miss => { self.misses += 1; let block = build_block(pc, mem, cur_pv); self.slots[idx] = Some(Box::new(block)); } } // Slot is guaranteed populated at this point — Hit returned a // pre-existing block, Miss/Stale just wrote a new one. self.slots[idx] .as_deref() .expect("block freshly built or hit") } } /// Walk forward from `pc`, decoding instructions and collecting them /// into a `DecodedBlock`. The walk stops on the first of: /// - a [`PpcOpcode::terminates_block`] true (the terminator IS /// included as the last instruction), /// - reaching [`MAX_BLOCK_INSTRS`], /// - the next PC would cross a 4 KiB guest page boundary. fn build_block(start_pc: u32, mem: &dyn MemoryAccess, page_version: u64) -> DecodedBlock { let mut instrs: Vec = Vec::with_capacity(8); let page_base = start_pc & GUEST_PAGE_MASK; let mut cur = start_pc; loop { let raw = mem.read_u32(cur); let decoded = decode(raw, cur); let terminates = decoded.opcode.terminates_block(); instrs.push(decoded); if terminates { break; } if instrs.len() >= MAX_BLOCK_INSTRS { break; } let next = cur.wrapping_add(4); if (next & GUEST_PAGE_MASK) != page_base { break; } cur = next; } let last = instrs.last().expect("build pushes at least one instruction"); let end_pc = last.addr.wrapping_add(4); DecodedBlock { start_pc, end_pc, page_version, instrs, } } #[cfg(test)] mod tests { use super::*; use crate::opcode::PpcOpcode; use std::cell::Cell; /// 64 KiB byte-array memory, big-endian word reads. /// Mirrors `interpreter::tests::TestMem` but lives here so block_cache /// tests don't depend on interpreter internals. struct BlockTestMem { data: Box<[Cell]>, version_a: u64, version_b: u64, // Address of the page whose version is `version_b` instead of // `version_a`. Used to model an out-of-band page-version bump in // the invalidation test without going through write_*. bumped_page: Cell>, } impl BlockTestMem { fn new() -> Self { Self { data: (0..0x10000u32).map(|_| Cell::new(0)).collect(), version_a: 1, version_b: 2, bumped_page: Cell::new(None), } } fn put(&self, addr: u32, raw: u32) { let a = addr as usize; for (i, byte) in raw.to_be_bytes().iter().enumerate() { self.data[a + i].set(*byte); } } } impl MemoryAccess for BlockTestMem { fn read_u8(&self, a: u32) -> u8 { self.data[a as usize].get() } fn read_u16(&self, a: u32) -> u16 { let i = a as usize; u16::from_be_bytes([self.data[i].get(), self.data[i + 1].get()]) } fn read_u32(&self, a: u32) -> u32 { let i = a as usize; u32::from_be_bytes([ self.data[i].get(), self.data[i + 1].get(), self.data[i + 2].get(), self.data[i + 3].get(), ]) } fn read_u64(&self, a: u32) -> u64 { let i = a as usize; u64::from_be_bytes([ self.data[i].get(), self.data[i + 1].get(), self.data[i + 2].get(), self.data[i + 3].get(), self.data[i + 4].get(), self.data[i + 5].get(), self.data[i + 6].get(), self.data[i + 7].get(), ]) } fn write_u8(&self, a: u32, v: u8) { self.data[a as usize].set(v); } fn write_u16(&self, a: u32, v: u16) { let i = a as usize; let b = v.to_be_bytes(); self.data[i].set(b[0]); self.data[i + 1].set(b[1]); } fn write_u32(&self, a: u32, v: u32) { let i = a as usize; for (k, byte) in v.to_be_bytes().iter().enumerate() { self.data[i + k].set(*byte); } } fn write_u64(&self, a: u32, v: u64) { let i = a as usize; for (k, byte) in v.to_be_bytes().iter().enumerate() { self.data[i + k].set(*byte); } } fn translate(&self, _: u32) -> Option<*const u8> { None } fn translate_mut(&self, _: u32) -> Option<*mut u8> { None } fn page_version(&self, addr: u32) -> u64 { if Some(addr & GUEST_PAGE_MASK) == self.bumped_page.get() { self.version_b } else { self.version_a } } } // PPC encodings — minimal subset for these tests. fn enc_addi(rd: u32, ra: u32, simm: i16) -> u32 { (14 << 26) | (rd << 21) | (ra << 16) | (simm as u16 as u32) } fn enc_b_self() -> u32 { // b 0 — branch to self (LI=0). Opcode=18, AA=0, LK=0. 18 << 26 } fn enc_unimplemented() -> u32 { // Use opcode 0 raw = 0; decoder maps to Invalid. 0 } #[test] fn block_built_to_terminator() { let mem = BlockTestMem::new(); mem.put(0x100, enc_addi(3, 3, 1)); mem.put(0x104, enc_addi(3, 3, 1)); mem.put(0x108, enc_addi(3, 3, 1)); mem.put(0x10C, enc_b_self()); // terminator let mut bc = BlockCache::new(); let b = bc.lookup_or_build(0x100, &mem); assert_eq!(b.start_pc, 0x100); assert_eq!(b.instrs.len(), 4); assert_eq!(b.instrs.last().unwrap().opcode, PpcOpcode::bx); assert_eq!(b.end_pc, 0x110); } #[test] fn block_stops_at_page_boundary() { // Build from 0x1FFC. The next PC (0x2000) is in a different // 4 KiB page — block must contain only the one instruction. let mem = BlockTestMem::new(); mem.put(0x1FFC, enc_addi(3, 3, 1)); mem.put(0x2000, enc_addi(3, 3, 1)); let mut bc = BlockCache::new(); let b = bc.lookup_or_build(0x1FFC, &mem); assert_eq!(b.instrs.len(), 1); assert_eq!(b.end_pc, 0x2000); } #[test] fn block_stops_at_max_len() { // 64 consecutive non-terminator instructions on one page — // block must clamp at MAX_BLOCK_INSTRS. let mem = BlockTestMem::new(); for i in 0..64u32 { mem.put(0x100 + i * 4, enc_addi(3, 3, 1)); } let mut bc = BlockCache::new(); let b = bc.lookup_or_build(0x100, &mem); assert_eq!(b.instrs.len(), MAX_BLOCK_INSTRS); assert_eq!(b.end_pc, 0x100 + (MAX_BLOCK_INSTRS as u32) * 4); } #[test] fn block_stops_at_invalid_opcode() { // Decoder mapping `Invalid` is treated as a block terminator // so the per-instruction Unimplemented path is preserved. let mem = BlockTestMem::new(); mem.put(0x100, enc_addi(3, 3, 1)); mem.put(0x104, enc_unimplemented()); mem.put(0x108, enc_addi(3, 3, 1)); let mut bc = BlockCache::new(); let b = bc.lookup_or_build(0x100, &mem); assert_eq!(b.instrs.len(), 2); assert_eq!(b.instrs.last().unwrap().opcode, PpcOpcode::Invalid); } #[test] fn block_invalidates_on_page_version_bump() { let mem = BlockTestMem::new(); mem.put(0x100, enc_addi(3, 3, 1)); mem.put(0x104, enc_b_self()); let mut bc = BlockCache::new(); let _ = bc.lookup_or_build(0x100, &mem); assert_eq!(bc.misses(), 1); assert_eq!(bc.hits(), 0); // Same call → hit. let _ = bc.lookup_or_build(0x100, &mem); assert_eq!(bc.hits(), 1); assert_eq!(bc.invalidations(), 0); // Bump the page version on the page containing 0x100. Next // lookup must invalidate and rebuild. mem.bumped_page.set(Some(0x100 & GUEST_PAGE_MASK)); let _ = bc.lookup_or_build(0x100, &mem); assert_eq!(bc.invalidations(), 1); assert_eq!(bc.misses(), 2); } #[test] fn block_hit_returns_same_contents() { // Sanity: cache hit returns a block whose contents reflect the // ORIGINAL instruction stream, even after a non-version-bumping // poke to the underlying bytes. (No real workload would do // this, but it confirms we're returning cached data, not // re-reading.) let mem = BlockTestMem::new(); mem.put(0x100, enc_addi(3, 3, 7)); mem.put(0x104, enc_b_self()); let mut bc = BlockCache::new(); let first_simm = bc.lookup_or_build(0x100, &mem).instrs[0].simm16(); // Rewrite without bumping version (test-only path). let bytes = enc_addi(3, 3, 99).to_be_bytes(); for (i, b) in bytes.iter().enumerate() { mem.data[0x100 + i].set(*b); } let cached_simm = bc.lookup_or_build(0x100, &mem).instrs[0].simm16(); assert_eq!(first_simm, 7); assert_eq!(cached_simm, 7, "cache must serve original decoded form"); } }