xenia-cpu: VMX128, FPSCR, decoder split, scheduler, decode/block caches
Split the monolithic interpreter into cohesive modules: dedicated decoder (decoder.rs) producing 8-byte DecodedInstr; opcode tables (opcode.rs); explicit traps (trap.rs); FPSCR helpers (fpscr.rs); overflow/carry helpers (overflow.rs); a 4 KiB-page-versioned decode cache and basic-block cache (block_cache.rs); and a full VMX/VMX128 implementation (vmx.rs) covering AltiVec + Xenon's 128-bit extensions. Add the parallel-execution substrate behind --parallel: a 7-party phaser (phaser.rs) for round-based barrier sync, ReservationTable (reservation.rs) for guest LL/SC, and the per-HW-thread scheduler core (scheduler.rs) that owns ThreadRefs, runqueues, and pending IRQs. Disassembler is now the single source of truth: disasm.rs gains the full base + extended + VMX128 mnemonic set, with golden JSON fixtures and a disasm_goldens test suite. Add a criterion-style interpreter bench. context.rs grows the per-thread state the new modules need (reservation slot, FPSCR, vector regs). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -77,6 +77,9 @@ impl DecodedInstr {
|
||||
/// OE bit (bit 21) - overflow enable
|
||||
#[inline] pub fn oe(&self) -> bool { extract_bits(self.raw, 21, 21) != 0 }
|
||||
|
||||
/// TO field (bits 6-10) for tw/twi/td/tdi trap instructions.
|
||||
#[inline] pub fn to(&self) -> u32 { extract_bits(self.raw, 6, 10) }
|
||||
|
||||
/// MB, ME fields for rotate instructions
|
||||
#[inline] pub fn mb(&self) -> u32 { extract_bits(self.raw, 21, 25) }
|
||||
#[inline] pub fn me(&self) -> u32 { extract_bits(self.raw, 26, 30) }
|
||||
@@ -142,6 +145,24 @@ impl DecodedInstr {
|
||||
#[inline] pub fn nb(&self) -> u32 { extract_bits(self.raw, 16, 20) }
|
||||
}
|
||||
|
||||
/// Extract the 5-bit `UIMM` (`VX128_3`) / `IMM` (`VX128_4`) field. Canary
|
||||
/// packs both formats with LSB-bits 16-20 holding the field, which is
|
||||
/// MSB bits 11-15 in our `extract_bits` convention. For `vpkd3d128` /
|
||||
/// `vupkd3d128` the decoded selector is `type = UIMM >> 2` (3 bits; valid
|
||||
/// values 0-6 per [`crate::vmx::D3dPackType`], 7 is undocumented /
|
||||
/// undefined in canary) and `pack = UIMM & 0x3` (output-slot layout for
|
||||
/// `vpkd3d128` only, `vupkd3d128` ignores it).
|
||||
///
|
||||
/// First-Pixels M3: the interpreter previously used a hand-rolled
|
||||
/// `(instr.raw >> 6) & 0x7` that was **LSB-numbered** and extracted
|
||||
/// bits from a completely different part of the word (the
|
||||
/// secondary-opcode region). Centralizing the extractor here matches
|
||||
/// canary's `FormatVX128_{3,4}::{UIMM,IMM}` field semantics exactly.
|
||||
#[inline]
|
||||
pub fn extract_vx128_uimm5(raw: u32) -> u32 {
|
||||
extract_bits(raw, 11, 15)
|
||||
}
|
||||
|
||||
/// Decode a 32-bit PPC instruction into its opcode.
|
||||
/// Direct translation of the C++ LookupOpcode from ppc_opcode_lookup_gen.cc.
|
||||
pub fn decode(raw: u32, addr: u32) -> DecodedInstr {
|
||||
@@ -149,6 +170,123 @@ pub fn decode(raw: u32, addr: u32) -> DecodedInstr {
|
||||
DecodedInstr { opcode, raw, addr }
|
||||
}
|
||||
|
||||
// Perf tier-2 — direct-mapped PC-keyed decode cache.
|
||||
//
|
||||
// The interpreter hot path spends ~15-25% of its time in `decode()`
|
||||
// parsing the raw u32 and walking the primary+secondary opcode tables.
|
||||
// For non-self-modifying guest code — the common case past the XEX
|
||||
// loader — `decode(raw, pc)` is purely a function of `(raw, pc)` and
|
||||
// the output is `Copy + 16B`. A direct-mapped cache indexed by
|
||||
// `(pc >> 2) & MASK` gives the interpreter a 1-comparison fast path,
|
||||
// at the cost of one branch and a 1.5 MiB region of memory.
|
||||
//
|
||||
// Invalidation piggybacks on `xenia_memory::GuestMemory::page_version`
|
||||
// (P5 texture-cache invalidation): every cache entry carries the page
|
||||
// version that was active at decode time; on lookup we compare against
|
||||
// the current version of the containing 4 KiB page. Any write to the
|
||||
// page bumps the counter, so the next decode on that PC is a miss that
|
||||
// refills.
|
||||
|
||||
/// Number of direct-mapped entries. 2^16 = 65,536 slots, one PPC
|
||||
/// instruction address per slot — enough for every hot code path in a
|
||||
/// typical Xbox 360 title to stay resident without collision.
|
||||
const DECODE_CACHE_SIZE: usize = 1 << 16;
|
||||
const DECODE_CACHE_MASK: u32 = (DECODE_CACHE_SIZE - 1) as u32;
|
||||
|
||||
#[derive(Clone, Copy)]
|
||||
struct DecodeCacheEntry {
|
||||
/// Guest PC this entry was decoded at. Used as the tag on lookup; a
|
||||
/// mismatch means the slot was last populated by a different PC that
|
||||
/// shares the same low-16 index.
|
||||
pc: u32,
|
||||
/// Page version at decode time (from `GuestMemory::page_version(pc)`).
|
||||
/// Zero means "unused slot" since real page versions start at 1.
|
||||
page_version: u64,
|
||||
decoded: DecodedInstr,
|
||||
}
|
||||
|
||||
impl DecodeCacheEntry {
|
||||
const fn empty() -> Self {
|
||||
// `Invalid` is the decoder's "unrecognized opcode" sentinel; we
|
||||
// use it here as the empty-slot marker. Real misses compare `pc`,
|
||||
// not the opcode, so the sentinel choice is cosmetic.
|
||||
Self {
|
||||
pc: 0,
|
||||
page_version: 0,
|
||||
decoded: DecodedInstr {
|
||||
opcode: PpcOpcode::Invalid,
|
||||
raw: 0,
|
||||
addr: 0,
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Direct-mapped PC-keyed decode cache. One instance shared across all
|
||||
/// HW threads (PC is thread-independent; entries are read-only once
|
||||
/// filled). Not thread-safe — the single scheduler thread owns it.
|
||||
pub struct DecodeCache {
|
||||
slots: Box<[DecodeCacheEntry]>,
|
||||
hits: u64,
|
||||
misses: u64,
|
||||
invalidations: u64,
|
||||
}
|
||||
|
||||
impl Default for DecodeCache {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
impl DecodeCache {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
slots: vec![DecodeCacheEntry::empty(); DECODE_CACHE_SIZE].into_boxed_slice(),
|
||||
hits: 0,
|
||||
misses: 0,
|
||||
invalidations: 0,
|
||||
}
|
||||
}
|
||||
|
||||
/// Look up (or fill) the decoded form of the instruction at `pc`.
|
||||
/// `raw` is the fetched instruction word; `current_page_version` is
|
||||
/// `mem.page_version(pc)` — the caller has it cheaper than we do,
|
||||
/// since they're already touching `mem` to fetch `raw`.
|
||||
#[inline]
|
||||
pub fn lookup(&mut self, pc: u32, raw: u32, current_page_version: u64) -> DecodedInstr {
|
||||
let idx = ((pc >> 2) & DECODE_CACHE_MASK) as usize;
|
||||
// Safety: `idx` is masked into `[0, DECODE_CACHE_SIZE)` so the
|
||||
// slice access is always in-bounds. Opt-out of the bounds check
|
||||
// for the hot path.
|
||||
let entry = unsafe { self.slots.get_unchecked_mut(idx) };
|
||||
if entry.pc == pc && entry.page_version == current_page_version {
|
||||
self.hits += 1;
|
||||
return entry.decoded;
|
||||
}
|
||||
if entry.pc == pc && entry.page_version != current_page_version {
|
||||
self.invalidations += 1;
|
||||
}
|
||||
self.misses += 1;
|
||||
let decoded = decode(raw, pc);
|
||||
*entry = DecodeCacheEntry {
|
||||
pc,
|
||||
page_version: current_page_version,
|
||||
decoded,
|
||||
};
|
||||
decoded
|
||||
}
|
||||
|
||||
pub fn hits(&self) -> u64 {
|
||||
self.hits
|
||||
}
|
||||
pub fn misses(&self) -> u64 {
|
||||
self.misses
|
||||
}
|
||||
pub fn invalidations(&self) -> u64 {
|
||||
self.invalidations
|
||||
}
|
||||
}
|
||||
|
||||
fn lookup_opcode(code: u32) -> PpcOpcode {
|
||||
match extract_bits(code, 0, 5) {
|
||||
2 => PpcOpcode::tdi,
|
||||
@@ -781,6 +919,57 @@ mod tests {
|
||||
assert_eq!(instr.d(), 0x20);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn decode_cache_miss_fills_then_hit() {
|
||||
let mut cache = DecodeCache::new();
|
||||
let raw: u32 = (14 << 26) | (3 << 21) | (1 << 16) | 0x10;
|
||||
let pc = 0x8200_0000u32;
|
||||
let first = cache.lookup(pc, raw, 1);
|
||||
assert_eq!(first.opcode, PpcOpcode::addi);
|
||||
assert_eq!(cache.hits(), 0);
|
||||
assert_eq!(cache.misses(), 1);
|
||||
// Same pc, same version → cache hit, no new decode.
|
||||
let second = cache.lookup(pc, raw, 1);
|
||||
assert_eq!(second.opcode, PpcOpcode::addi);
|
||||
assert_eq!(cache.hits(), 1);
|
||||
assert_eq!(cache.misses(), 1);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn decode_cache_stale_version_refills() {
|
||||
let mut cache = DecodeCache::new();
|
||||
// First fill with an `addi`.
|
||||
let raw_addi: u32 = (14 << 26) | (3 << 21) | (1 << 16) | 0x10;
|
||||
let pc = 0x8200_0000u32;
|
||||
cache.lookup(pc, raw_addi, 1);
|
||||
// Guest rewrote the page: same pc, different raw + bumped version.
|
||||
// Cache must refill — not return the stale `addi`.
|
||||
let raw_lwz: u32 = (32 << 26) | (5 << 21) | (1 << 16) | 0x20;
|
||||
let refreshed = cache.lookup(pc, raw_lwz, 2);
|
||||
assert_eq!(refreshed.opcode, PpcOpcode::lwz);
|
||||
assert_eq!(cache.invalidations(), 1);
|
||||
assert_eq!(cache.misses(), 2);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn decode_cache_pc_collision_refills() {
|
||||
// Two PCs that hash to the same slot (pc >> 2 low 16 bits equal)
|
||||
// must not alias. Slot index = ((pc >> 2) & 0xFFFF) — pick two
|
||||
// PCs 4 * 2^16 bytes apart.
|
||||
let mut cache = DecodeCache::new();
|
||||
let pc_a = 0x8200_0000u32;
|
||||
let pc_b = pc_a.wrapping_add(0x0004_0000u32); // (>> 2) differs by 2^16
|
||||
let raw_addi: u32 = (14 << 26) | (3 << 21) | (1 << 16) | 0x10;
|
||||
let raw_lwz: u32 = (32 << 26) | (5 << 21) | (1 << 16) | 0x20;
|
||||
cache.lookup(pc_a, raw_addi, 1);
|
||||
// Different pc but same slot → miss + refill.
|
||||
cache.lookup(pc_b, raw_lwz, 1);
|
||||
// First pc comes back → miss + refill (slot was taken by pc_b).
|
||||
let back = cache.lookup(pc_a, raw_addi, 1);
|
||||
assert_eq!(back.opcode, PpcOpcode::addi);
|
||||
assert_eq!(cache.misses(), 3);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_decode_branch() {
|
||||
// b +0x100 => opcode 18, LI=0x40 (shifted left 2 = 0x100), AA=0, LK=0
|
||||
@@ -816,4 +1005,103 @@ mod tests {
|
||||
assert_eq!(extract_bits(0x8000_0000, 0, 0), 1);
|
||||
assert_eq!(extract_bits(0x0000_0001, 31, 31), 1);
|
||||
}
|
||||
|
||||
// VMX128 register-name extraction. Locks the canonical bit positions
|
||||
// (decoder.rs is the single source of truth — the analysis crate's
|
||||
// old `ppc.rs` had different positions, which produced wrong printed
|
||||
// register names; the bug was silent because the interpreter never
|
||||
// used those extractors). Each test poke-bits exactly the slots the
|
||||
// accessor reads and asserts the assembled register number.
|
||||
|
||||
fn vmx128_test_word(va21: u32, vd6_10: u32, vd21: u32, vd22: u32,
|
||||
vb16_20: u32, vb28: u32, vb30: u32) -> u32 {
|
||||
// PPC bit i -> LSB position 31-i.
|
||||
(vd6_10 << (31 - 10))
|
||||
| (va21 << (31 - 21)) // va128 high bit at PPC 29 in some forms — kept 0 here
|
||||
| (vd21 << (31 - 21))
|
||||
| (vd22 << (31 - 22))
|
||||
| (vb16_20 << (31 - 20))
|
||||
| (vb28 << (31 - 28))
|
||||
| (vb30 << (31 - 30))
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn vmx128_vd128_low_5_bits_only() {
|
||||
// vd_lo = 0..31, vd_b21 = 0, vd_b22 = 0 → vd128 = vd_lo
|
||||
for r in 0..32u32 {
|
||||
let raw = (r as u32) << (31 - 10);
|
||||
let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 };
|
||||
assert_eq!(d.vd128(), r as usize, "vd_lo={r}");
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn vmx128_vd128_bit21_adds_32() {
|
||||
// vd_lo = 0, vd_b21 = 1, vd_b22 = 0 → vd128 = 32
|
||||
let raw = (0u32 << (31 - 10)) | (1u32 << (31 - 21));
|
||||
let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 };
|
||||
assert_eq!(d.vd128(), 32);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn vmx128_vd128_bit22_adds_64() {
|
||||
// vd_lo = 0, vd_b21 = 0, vd_b22 = 1 → vd128 = 64
|
||||
let raw = (0u32 << (31 - 10)) | (1u32 << (31 - 22));
|
||||
let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 };
|
||||
assert_eq!(d.vd128(), 64);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn vmx128_vd128_full_127() {
|
||||
// vd_lo = 31, vd_b21 = 1, vd_b22 = 1 → vd128 = 127
|
||||
let raw = (31u32 << (31 - 10))
|
||||
| (1u32 << (31 - 21))
|
||||
| (1u32 << (31 - 22));
|
||||
let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 };
|
||||
assert_eq!(d.vd128(), 127);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn vmx128_va128_uses_bit29() {
|
||||
// va128 = bits 6-10 + bit 29. va_lo = 7, bit 29 = 1 → va128 = 7 | 32 = 39.
|
||||
let raw = (7u32 << (31 - 10)) | (1u32 << (31 - 29));
|
||||
let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 };
|
||||
assert_eq!(d.va128(), 39);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn vmx128_vb128_uses_bits28_and_30() {
|
||||
// vb128 = bits 16-20 + bit 28 + bit 30. Low 5 = 5, bit 28 = 1 → +32, bit 30 = 1 → +64.
|
||||
let raw = (5u32 << (31 - 20))
|
||||
| (1u32 << (31 - 28))
|
||||
| (1u32 << (31 - 30));
|
||||
let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 };
|
||||
assert_eq!(d.vb128(), 5 | 32 | 64);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn vmx128_vs128_aliases_vd128() {
|
||||
// vs128 must always equal vd128.
|
||||
for r in [0u32, 31, 32, 64, 96, 127] {
|
||||
let lo = r & 0x1F;
|
||||
let b21 = (r >> 5) & 1;
|
||||
let b22 = (r >> 6) & 1;
|
||||
let raw = (lo << (31 - 10))
|
||||
| (b21 << (31 - 21))
|
||||
| (b22 << (31 - 22));
|
||||
let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 };
|
||||
assert_eq!(d.vd128(), r as usize, "vd128 mismatch for r={r}");
|
||||
assert_eq!(d.vs128(), r as usize, "vs128 mismatch for r={r}");
|
||||
assert_eq!(d.vd128(), d.vs128());
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[allow(dead_code)]
|
||||
fn _vmx128_test_word_helper_compiles() {
|
||||
// Keep the helper validated against the real accessor.
|
||||
let raw = vmx128_test_word(0, 5, 1, 1, 0, 0, 0);
|
||||
let d = DecodedInstr { opcode: PpcOpcode::Invalid, raw, addr: 0 };
|
||||
assert_eq!(d.vd128(), 5 | 32 | 64);
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user