xenia-rs/crates/xenia-cpu/src/block_cache.rs

//! Tier-4 perf — basic-block cache for the PPC interpreter.
//!
//! `DecodeCache` (in [`crate::decoder`]) caches one decoded instruction
//! per slot, indexed by PC. The hot loop still pays the per-instruction
//! cost of fetching the raw word, hashing the PC into a slot, and
//! comparing tags. For straight-line code — common in the asset/inflate
//! loops where Sylpheed boot is currently CPU-bound — the savings of
//! batching N decoded instructions per slot lookup are linear in block
//! length.
//!
//! ## Shape
//!
//! A `DecodedBlock` is a contiguous run of decoded instructions starting
//! at `start_pc`, ending at the first *block terminator* (any branch,
//! `sc`, trap, or `Invalid`) or at one of two safety limits:
//!
//!  - [`MAX_BLOCK_INSTRS`] caps memory growth and re-build cost.
//!  - 4 KiB page boundary stop. A block is fully contained inside a
//!    single 4 KiB guest page; that means `mem.page_version(start_pc)`
//!    is sufficient to detect any code-page rewrite that should
//!    invalidate the block. Without this rule the cache would have to
//!    walk every spanned page on every hit, which would erase the win.
//!
//! ## Invalidation
//!
//! Each block stamps the page version at build time. On lookup, if
//! `mem.page_version(start_pc)` differs from `block.page_version`, the
//! slot is rebuilt. Same mechanism `DecodeCache` uses, just at
//! block granularity.
//!
//! ## Debugger semantics
//!
//! Block dispatch is **opt-in** by the caller. The hot loop in
//! `xenia-app/src/main.rs` selects the per-instruction path whenever
//! `Debugger::wants_hooks()` is true or any `--trace-*` flag is set.
//! That's how single-step, breakpoints, in-memory trace, instruction
//! trace, and branch trace continue to observe every PC: the block
//! cache simply never runs in those modes.

use crate::decoder::{decode, DecodedInstr};
use xenia_memory::MemoryAccess;

/// Direct-mapped block-cache slot count. Same shape as
/// [`crate::decoder::DECODE_CACHE_SIZE`] — 64 K slots indexed by the
/// low 16 bits of `start_pc >> 2`. With Sylpheed-class workloads the
/// slot collision rate is negligible.
const BLOCK_CACHE_SIZE: usize = 1 << 16;
/// Mask that reduces `start_pc >> 2` to a slot index. Only correct
/// while `BLOCK_CACHE_SIZE` is a power of two.
const BLOCK_CACHE_MASK: u32 = (BLOCK_CACHE_SIZE - 1) as u32;

/// Hard cap on instructions per block. Keeps the worst-case memory
/// footprint bounded and limits the rebuild cost when a code page
/// gets bumped. 32 instructions is generous for most basic blocks
/// (real-world average across Sylpheed boot is ~6 between branches).
///
/// The build walk checks this limit after pushing each instruction,
/// so a block never holds more than this many entries; a terminator,
/// when present, counts toward the limit.
pub const MAX_BLOCK_INSTRS: usize = 32;

/// Guest page size — duplicated here to avoid pulling
/// `xenia-memory::heap` internals into `xenia-cpu`. Must stay in sync
/// with the memory crate. Both refer to the architectural PowerPC 4 KiB
/// page granule, so this constant is locked.
const GUEST_PAGE_SIZE: u32 = 4096;
/// Clears the in-page offset bits of a guest address, leaving the
/// base address of the 4 KiB page that contains it. Two addresses are
/// on the same page exactly when they agree under this mask.
const GUEST_PAGE_MASK: u32 = !(GUEST_PAGE_SIZE - 1);

/// One cached basic block. Owned by [`BlockCache`]; a `&DecodedBlock`
/// is handed to the interpreter via [`BlockCache::lookup_or_build`].
/// That borrow is tied to the cache's `&mut self`, so it has to be
/// released before `lookup_or_build` can be called again. The stored
/// block is replaced when a later lookup rebuilds its slot.
#[derive(Debug)]
pub struct DecodedBlock {
    /// Guest PC at which this block starts. Used as the slot tag.
    pub start_pc: u32,
    /// Guest PC immediately after the last instruction in `instrs`.
    /// Equal to `instrs.last().addr + 4` (computed with wrapping
    /// arithmetic) whether or not the block ended on a terminator.
    /// Useful for tracing / disassembly.
    pub end_pc: u32,
    /// `mem.page_version(start_pc)` at build time. Mismatch on lookup
    /// invalidates the block. Single value because every block is
    /// page-bounded by construction.
    pub page_version: u64,
    /// Decoded instructions in execution order. Always non-empty after
    /// a successful build (`MAX_BLOCK_INSTRS >= 1` and the build walk
    /// pushes the first decoded word unconditionally), and never longer
    /// than `MAX_BLOCK_INSTRS`.
    pub instrs: Vec<DecodedInstr>,
}

/// Per-slot status from a `lookup_or_build` probe. Internal only.
enum CacheStatus {
    /// Block at this slot matches `pc` and the page version at build
    /// time matches `mem.page_version(pc)` — return as-is and bump
    /// `hits`.
    Hit,
    /// Block at this slot matched `pc` but its stamped page version
    /// differs from the current one — rebuild and bump both
    /// `invalidations` and `misses`.
    Stale,
    /// Slot is empty or holds a block keyed at a different `start_pc`.
    /// Build a fresh block and bump `misses`.
    Miss,
}

/// Direct-mapped block cache. One instance shared across all HW slots
/// (block contents are PC-only and read-only after fill). Not
/// thread-safe — owner is the single scheduler thread, same as
/// `DecodeCache`.
pub struct BlockCache {
    /// Slot table of `BLOCK_CACHE_SIZE` entries, indexed by
    /// `(pc >> 2) & BLOCK_CACHE_MASK`. `None` marks a slot that has
    /// never been filled; a filled slot holds the most recently built
    /// block that mapped to it.
    slots: Box<[Option<Box<DecodedBlock>>]>,
    /// Lookups served from an existing block whose tag and page
    /// version both matched.
    hits: u64,
    /// Lookups that had to build a block — both plain misses and
    /// stale rebuilds.
    misses: u64,
    /// Lookups that found a block for the requested PC but with a
    /// different page version than the current one. Every such lookup
    /// is also counted in `misses`.
    invalidations: u64,
}

impl Default for BlockCache {
    fn default() -> Self {
        Self::new()
    }
}

impl BlockCache {
    /// Create an empty cache: every slot vacant and all counters zero.
    pub fn new() -> Self {
        // A vacant slot is an `Option<Box<_>>`, which the null-pointer
        // niche keeps pointer-sized (8 bytes on 64-bit hosts), so the
        // 64 K-entry table costs roughly 512 KiB up front. The blocks
        // themselves are heap-allocated only once they are built.
        let slots: Box<[Option<Box<DecodedBlock>>]> =
            (0..BLOCK_CACHE_SIZE).map(|_| None).collect();
        Self {
            slots,
            hits: 0,
            misses: 0,
            invalidations: 0,
        }
    }

    /// Number of lookups answered from an already-cached, current block.
    pub fn hits(&self) -> u64 {
        self.hits
    }

    /// Number of lookups that built a block (stale rebuilds included).
    pub fn misses(&self) -> u64 {
        self.misses
    }

    /// Number of lookups that rebuilt a block because its page version
    /// no longer matched.
    pub fn invalidations(&self) -> u64 {
        self.invalidations
    }

    /// Fetch the block that starts at `pc`, decoding it from `mem` when
    /// the slot is vacant, holds a block for a different PC, or holds a
    /// block stamped with a page version other than the current one.
    ///
    /// The returned reference borrows from the cache, so it must be
    /// dropped before `lookup_or_build` is called again.
    pub fn lookup_or_build(&mut self, pc: u32, mem: &dyn MemoryAccess) -> &DecodedBlock {
        let slot_index = ((pc >> 2) & BLOCK_CACHE_MASK) as usize;
        let live_version = mem.page_version(pc);

        // Step 1: classify the slot using a short-lived shared borrow,
        // which ends before the slot is overwritten below.
        let status = match self.slots[slot_index].as_deref() {
            None => CacheStatus::Miss,
            Some(cached) if cached.start_pc != pc => CacheStatus::Miss,
            Some(cached) if cached.page_version != live_version => CacheStatus::Stale,
            Some(_) => CacheStatus::Hit,
        };

        // Step 2: update the counters and decide whether to rebuild.
        // A stale block counts as both an invalidation and a miss.
        let needs_build = match status {
            CacheStatus::Hit => {
                self.hits += 1;
                false
            }
            CacheStatus::Stale => {
                self.invalidations += 1;
                self.misses += 1;
                true
            }
            CacheStatus::Miss => {
                self.misses += 1;
                true
            }
        };

        // Step 3: decode a replacement block and install it in the slot.
        if needs_build {
            let rebuilt = build_block(pc, mem, live_version);
            self.slots[slot_index] = Some(Box::new(rebuilt));
        }

        // The slot cannot be vacant here: a hit found an occupant and
        // every other path just stored one.
        self.slots[slot_index]
            .as_deref()
            .expect("block freshly built or hit")
    }
}

/// Decode a straight-line run of guest instructions beginning at
/// `start_pc` and package it as a [`DecodedBlock`] stamped with
/// `page_version`.
///
/// At least one instruction is always decoded. After each instruction
/// is appended, the walk ends as soon as one of these holds (checked
/// in this order):
///   1. the instruction's opcode reports `terminates_block()` — the
///      terminator is kept as the block's final entry;
///   2. the block already holds [`MAX_BLOCK_INSTRS`] instructions;
///   3. the following word lies in a different 4 KiB guest page than
///      `start_pc` (address arithmetic wraps at the top of the 32-bit
///      space).
fn build_block(start_pc: u32, mem: &dyn MemoryAccess, page_version: u64) -> DecodedBlock {
    let home_page = start_pc & GUEST_PAGE_MASK;
    let mut instrs: Vec<DecodedInstr> = Vec::with_capacity(8);
    let mut pc = start_pc;

    loop {
        let instr = decode(mem.read_u32(pc), pc);
        // Query the opcode before the instruction is moved into the Vec.
        let is_terminator = instr.opcode.terminates_block();
        instrs.push(instr);

        let block_full = instrs.len() >= MAX_BLOCK_INSTRS;
        if is_terminator || block_full {
            break;
        }

        // Only advance while the next word stays on the starting page,
        // so one page version covers the whole block.
        let following = pc.wrapping_add(4);
        let leaves_page = (following & GUEST_PAGE_MASK) != home_page;
        if leaves_page {
            break;
        }
        pc = following;
    }

    // The loop pushes before any exit test, so `last()` is always `Some`.
    let end_pc = instrs
        .last()
        .map(|tail| tail.addr.wrapping_add(4))
        .expect("build pushes at least one instruction");

    DecodedBlock {
        start_pc,
        end_pc,
        page_version,
        instrs,
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::opcode::PpcOpcode;

    use std::cell::Cell;

    /// Flat 64 KiB guest memory with big-endian multi-byte accesses.
    /// Mirrors `interpreter::tests::TestMem` but lives here so block_cache
    /// tests don't depend on interpreter internals.
    struct BlockTestMem {
        data: Box<[Cell<u8>]>,
        /// Version reported for every page except `bumped_page`.
        base_version: u64,
        /// Version reported for `bumped_page`.
        bumped_version: u64,
        /// Base address of the one page that reports `bumped_version`.
        /// Lets the invalidation test model an out-of-band version bump
        /// without going through `write_*`.
        bumped_page: Cell<Option<u32>>,
    }

    impl BlockTestMem {
        /// Zero-filled memory in which every page reports `base_version`.
        fn new() -> Self {
            let data: Box<[Cell<u8>]> = vec![0u8; 0x10000].into_iter().map(Cell::new).collect();
            Self {
                data,
                base_version: 1,
                bumped_version: 2,
                bumped_page: Cell::new(None),
            }
        }

        /// Store one big-endian instruction word at `addr`.
        fn put(&self, addr: u32, raw: u32) {
            self.store(addr, raw.to_be_bytes());
        }

        /// Read `N` consecutive bytes starting at `addr`, lowest address first.
        fn load<const N: usize>(&self, addr: u32) -> [u8; N] {
            let base = addr as usize;
            let mut out = [0u8; N];
            for (offset, dst) in out.iter_mut().enumerate() {
                *dst = self.data[base + offset].get();
            }
            out
        }

        /// Write `bytes` to consecutive addresses starting at `addr`.
        fn store<const N: usize>(&self, addr: u32, bytes: [u8; N]) {
            let base = addr as usize;
            for (offset, byte) in bytes.iter().enumerate() {
                self.data[base + offset].set(*byte);
            }
        }
    }

    impl MemoryAccess for BlockTestMem {
        fn read_u8(&self, a: u32) -> u8 {
            self.data[a as usize].get()
        }
        fn read_u16(&self, a: u32) -> u16 {
            u16::from_be_bytes(self.load(a))
        }
        fn read_u32(&self, a: u32) -> u32 {
            u32::from_be_bytes(self.load(a))
        }
        fn read_u64(&self, a: u32) -> u64 {
            u64::from_be_bytes(self.load(a))
        }
        fn write_u8(&self, a: u32, v: u8) {
            self.data[a as usize].set(v);
        }
        fn write_u16(&self, a: u32, v: u16) {
            self.store(a, v.to_be_bytes());
        }
        fn write_u32(&self, a: u32, v: u32) {
            self.store(a, v.to_be_bytes());
        }
        fn write_u64(&self, a: u32, v: u64) {
            self.store(a, v.to_be_bytes());
        }
        fn translate(&self, _: u32) -> Option<*const u8> {
            None
        }
        fn translate_mut(&self, _: u32) -> Option<*mut u8> {
            None
        }
        fn page_version(&self, addr: u32) -> u64 {
            match self.bumped_page.get() {
                Some(page) if page == (addr & GUEST_PAGE_MASK) => self.bumped_version,
                _ => self.base_version,
            }
        }
    }

    // PPC encodings — just the handful these tests need.

    /// `addi rd, ra, simm` (primary opcode 14).
    fn enc_addi(rd: u32, ra: u32, simm: i16) -> u32 {
        let imm = u32::from(simm as u16);
        (14 << 26) | (rd << 21) | (ra << 16) | imm
    }

    /// `b 0` — branch to self: primary opcode 18 with LI=0, AA=0, LK=0.
    fn enc_b_self() -> u32 {
        18 << 26
    }

    /// All-zero word; the tests below expect the decoder to report it
    /// as `Invalid`.
    fn enc_unimplemented() -> u32 {
        0
    }

    /// Fresh memory with `words` laid out back to back starting at `base`.
    fn program_at(base: u32, words: &[u32]) -> BlockTestMem {
        let mem = BlockTestMem::new();
        for (i, word) in words.iter().enumerate() {
            mem.put(base + (i as u32) * 4, *word);
        }
        mem
    }

    #[test]
    fn block_built_to_terminator() {
        let addi = enc_addi(3, 3, 1);
        // Three plain instructions followed by the terminating branch.
        let mem = program_at(0x100, &[addi, addi, addi, enc_b_self()]);
        let mut cache = BlockCache::new();
        let block = cache.lookup_or_build(0x100, &mem);
        assert_eq!(block.start_pc, 0x100);
        assert_eq!(block.instrs.len(), 4);
        assert_eq!(block.instrs.last().unwrap().opcode, PpcOpcode::bx);
        assert_eq!(block.end_pc, 0x110);
    }

    #[test]
    fn block_stops_at_page_boundary() {
        // 0x1FFC is the last word of its 4 KiB page; the word at 0x2000
        // belongs to the next page, so the block holds one instruction.
        let addi = enc_addi(3, 3, 1);
        let mem = program_at(0x1FFC, &[addi, addi]);
        let mut cache = BlockCache::new();
        let block = cache.lookup_or_build(0x1FFC, &mem);
        assert_eq!(block.instrs.len(), 1);
        assert_eq!(block.end_pc, 0x2000);
    }

    #[test]
    fn block_stops_at_max_len() {
        // 64 back-to-back non-terminators on a single page: the block
        // must be clamped to MAX_BLOCK_INSTRS.
        let mem = program_at(0x100, &[enc_addi(3, 3, 1); 64]);
        let mut cache = BlockCache::new();
        let block = cache.lookup_or_build(0x100, &mem);
        assert_eq!(block.instrs.len(), MAX_BLOCK_INSTRS);
        assert_eq!(block.end_pc, 0x100 + (MAX_BLOCK_INSTRS as u32) * 4);
    }

    #[test]
    fn block_stops_at_invalid_opcode() {
        // An `Invalid` decode ends the block, which keeps the
        // per-instruction Unimplemented path intact.
        let addi = enc_addi(3, 3, 1);
        let mem = program_at(0x100, &[addi, enc_unimplemented(), addi]);
        let mut cache = BlockCache::new();
        let block = cache.lookup_or_build(0x100, &mem);
        assert_eq!(block.instrs.len(), 2);
        assert_eq!(block.instrs.last().unwrap().opcode, PpcOpcode::Invalid);
    }

    #[test]
    fn block_invalidates_on_page_version_bump() {
        let mem = program_at(0x100, &[enc_addi(3, 3, 1), enc_b_self()]);
        let mut cache = BlockCache::new();

        // Cold lookup: the slot is vacant, so this is a miss.
        let _ = cache.lookup_or_build(0x100, &mem);
        assert_eq!(cache.misses(), 1);
        assert_eq!(cache.hits(), 0);

        // Identical lookup again: served from the cache.
        let _ = cache.lookup_or_build(0x100, &mem);
        assert_eq!(cache.hits(), 1);
        assert_eq!(cache.invalidations(), 0);

        // Change the version of the page that holds 0x100; the next
        // lookup has to invalidate and rebuild.
        mem.bumped_page.set(Some(0x100 & GUEST_PAGE_MASK));
        let _ = cache.lookup_or_build(0x100, &mem);
        assert_eq!(cache.invalidations(), 1);
        assert_eq!(cache.misses(), 2);
    }

    #[test]
    fn block_hit_returns_same_contents() {
        // Sanity check that a hit serves the cached decode rather than
        // re-reading memory: overwrite the bytes without changing the
        // page version and expect the ORIGINAL immediate back. (No
        // real workload would do this.)
        let mem = program_at(0x100, &[enc_addi(3, 3, 7), enc_b_self()]);
        let mut cache = BlockCache::new();
        let simm_before = cache.lookup_or_build(0x100, &mem).instrs[0].simm16();
        // Test-only rewrite: `put` never touches the page version.
        mem.put(0x100, enc_addi(3, 3, 99));
        let simm_after = cache.lookup_or_build(0x100, &mem).instrs[0].simm16();
        assert_eq!(simm_before, 7);
        assert_eq!(simm_after, 7, "cache must serve original decoded form");
    }
}