diff --git a/crates/xenia-app/src/main.rs b/crates/xenia-app/src/main.rs index 0dee537..0e2378d 100644 --- a/crates/xenia-app/src/main.rs +++ b/crates/xenia-app/src/main.rs @@ -2326,8 +2326,19 @@ fn coord_post_round( let mut gpu_runs = (executed_this_round / xenia_cpu::scheduler::HW_THREAD_COUNT as u64) .max(1); - if gpu_runs > 64 { - gpu_runs = 64; + // Fairness cap on GPU commands drained per round. Must scale with the + // per-round instruction volume: with the superblock runner a single + // round legitimately retires up to ~SUPERBLOCK_INSTR_BUDGET per slot + // (vs ~6 for the old one-block path), so the rate `executed/6` is much + // higher and a flat cap of 64 throttled GPU command processing ~17× + // (packets 50279→1861 @50M) — collapsing the present loop / splash. + // Cap at the budget so the GPU keeps pace with the CPU at the same + // per-instruction rate the one-block path had. The inner loop already + // early-breaks on `!gpu.is_ready`, so this only bounds a pathological + // backlog, never busy-spins. + let gpu_cap = superblock_budget().max(64); + if gpu_runs > gpu_cap { + gpu_runs = gpu_cap; } if let Some(gpu) = kernel.gpu.as_inline_mut() { gpu.sync_with_mmio(); @@ -2812,6 +2823,160 @@ fn worker_epilogue( SlotOutcome::Continue } +/// Hard cap on the number of guest instructions a single superblock +/// runner invocation executes before returning to the round-robin +/// scheduler. Bounds how coarse the lockstep interleaving can get: a +/// larger budget amortizes more per-round/per-slot tax (faster) but +/// runs one HW thread for longer between scheduler returns (coarser +/// cross-thread interleaving). 1024 keeps a slot-visit ~170× longer +/// than the old single-block (~6 instr) granularity while still +/// returning to the round well inside a single 50k quantum. Purely an +/// instruction count → deterministic, schedule reproduces byte-identically. +/// +/// Tuned empirically on the Sylpheed boot-to-splash workload (iterate-3AL): +/// budgets up to 256 keep boot progression byte-for-byte healthy (draws / +/// swaps / packets track the one-block baseline), then a sharp cliff at +/// ~384 collapses the present loop (a producer/consumer boot handoff +/// starves when one slot runs too long without returning to the round). +/// 128 sits 3× below that cliff with ~1.65× boot-to-splash speedup — a +/// deliberately conservative pick (correctness over the last few %). The +/// `XENIA_SUPERBLOCK_BUDGET` env var overrides it for further tuning. +const SUPERBLOCK_INSTR_BUDGET: u64 = 128; + +/// Effective superblock budget. Defaults to [`SUPERBLOCK_INSTR_BUDGET`]; +/// `XENIA_SUPERBLOCK_BUDGET` overrides it (A/B tuning without a rebuild). +/// A budget of 1 reproduces the old one-block-per-slot-visit behaviour +/// (the chain always stops after the first block). Read once and cached. +fn superblock_budget() -> u64 { + use std::sync::OnceLock; + static BUDGET: OnceLock = OnceLock::new(); + *BUDGET.get_or_init(|| { + std::env::var("XENIA_SUPERBLOCK_BUDGET") + .ok() + .and_then(|v| v.parse::().ok()) + .filter(|&v| v >= 1) + .unwrap_or(SUPERBLOCK_INSTR_BUDGET) + }) +} + +/// Superblock runner (iterate-3AL). Executes a *chain* of basic blocks +/// for one slot-visit — following each block's terminating branch into +/// the next block — instead of a single block, amortizing the per-round +/// (timebase / coord / `round_schedule`) and per-slot (`worker_prologue`) +/// dispatch tax over up to [`SUPERBLOCK_INSTR_BUDGET`] guest instructions. +/// +/// Determinism + cross-thread correctness: the chain ENDS (returns to the +/// round) at exactly the points where lockstep granularity matters, all +/// pure functions of guest state (never wall-clock): +/// - a non-`Continue` step result (Yield / SystemCall / Trap / Unimpl / +/// Halted) — `step_block` already bails on these; `Yield` in +/// particular is the db16cyc spin-wait hand-off that prevents a +/// spinner from starving its producer. +/// - the just-run block was `sync_sensitive` (reserved load/store or a +/// memory barrier) — the guest's own ordering points. +/// - the block touched MMIO (the `mem.mmio_access_count()` watermark +/// advanced) — GPU/register ordering vs other HW threads stays at the +/// same fine granularity as the old one-block path. +/// - the next PC leaves ordinary guest code: an import thunk, the halt +/// sentinel, or unmapped memory — those need the full `worker_prologue` +/// dispatch, so we stop and let the next round's prologue handle them. +/// - the instruction budget is reached. +/// +/// Instruction-count / clock accounting stays exact: `executed` is summed +/// from the per-block `cycle_count` delta across every chained block and +/// handed to `worker_epilogue` once, which advances `stats.instruction_count` +/// and `decrement_quantum` by precisely the retired count — identical to +/// dispatching each block separately. +#[allow(clippy::too_many_arguments)] +fn run_superblock( + wc: &mut WorkerCtx, + kernel: &mut xenia_kernel::KernelState, + mem: &xenia_memory::GuestMemory, + debugger: &mut xenia_debugger::Debugger, + thunk_map: &HashMap, + stats: &mut ExecStats, + tid: Option, + thread_ref: xenia_cpu::ThreadRef, + first_block_ptr: *const xenia_cpu::block_cache::DecodedBlock, + first_pc_before: u32, +) -> SlotOutcome { + use xenia_cpu::interpreter::{step_block, StepResult}; + const LR_HALT: u32 = xenia_cpu::context::LR_HALT_SENTINEL as u32; + + let budget = superblock_budget(); + + // Probe / mem-watch / debugger-hook modes need per-block-entry + // observability; in those modes never chain (run exactly one block, + // identical to the pre-superblock behaviour). The block-cache fast + // path is only entered when hooks/DB are off anyway, but a probe or + // mem-watch can be armed alongside it. + let chain_allowed = !kernel.any_probe_active() && !mem.has_mem_watch(); + + let mut block_ptr = first_block_ptr; + let mut pc_before = first_pc_before; + let mut total_executed: u64 = 0; + + let (result, last_block_ptr, last_pc_before) = loop { + let cycle_before = kernel.scheduler.ctx_mut_ref(thread_ref).cycle_count; + let mmio_before = mem.mmio_access_count(); + let block = unsafe { &*block_ptr }; + let result = { + let ctx = kernel.scheduler.ctx_mut_ref(thread_ref); + step_block(ctx, mem, block) + }; + let executed = kernel + .scheduler + .ctx_mut_ref(thread_ref) + .cycle_count + .saturating_sub(cycle_before); + total_executed = total_executed.saturating_add(executed); + + // STOP conditions (any → end the superblock, hand to epilogue): + // non-Continue result (let the epilogue apply it), chaining + // disabled, a sync-sensitive block just ran, MMIO was touched, + // or the budget is spent. + if !chain_allowed + || !matches!(result, StepResult::Continue) + || block.sync_sensitive + || mem.mmio_access_count() != mmio_before + || total_executed >= budget + { + break (result, block_ptr, pc_before); + } + + // Decide whether the NEXT PC is an ordinary guest block we can + // chain into. Anything else (thunk / halt sentinel / unmapped) + // needs the full prologue dispatch next round. + let next_pc = kernel.scheduler.ctx(wc.hw_id).pc; + if next_pc == LR_HALT + || (kernel.pc_in_thunk_band(next_pc) && thunk_map.contains_key(&next_pc)) + || !mem.is_mapped(next_pc) + { + break (result, block_ptr, pc_before); + } + + // Chain: build/fetch the next block. Re-borrows `wc.block_cache`, + // which invalidates the previous `block_ptr` — but we've already + // finished using it (only `sync_sensitive`/diagnostics were read, + // above), so the raw-pointer aliasing rule is respected. + pc_before = next_pc; + block_ptr = wc.block_cache.lookup_or_build(next_pc, mem) as *const _; + }; + + worker_epilogue( + wc, + kernel, + debugger, + stats, + tid, + thread_ref, + last_block_ptr, + last_pc_before, + result, + total_executed, + ) +} + #[instrument(skip_all, fields(max = ?max_instructions, ips = ?ips_limit))] fn run_execution( mem: &xenia_memory::GuestMemory, @@ -2825,8 +2990,6 @@ fn run_execution( halt_on_deadlock: bool, shutdown: Option>, ) -> ExecStats { - use xenia_cpu::interpreter::step_block; - let mut stats = ExecStats::default(); let _ = quiet; // retained for future per-kind suppression @@ -2974,34 +3137,25 @@ fn run_execution( block_ptr, pc_before, } => { - // Block-cache step. The lockstep path keeps the - // kernel state borrowed straight through (single - // host thread, no contention). Step 03 of the - // M3 real-parallelism plan introduces a - // drop-and-reacquire window around `step_block` - // for the parallel branch. - let cycle_before = kernel.scheduler.ctx_mut_ref(thread_ref).cycle_count; - let block = unsafe { &*block_ptr }; - let result = { - let ctx = kernel.scheduler.ctx_mut_ref(thread_ref); - step_block(ctx, mem, block) - }; - let executed = kernel - .scheduler - .ctx_mut_ref(thread_ref) - .cycle_count - .saturating_sub(cycle_before); - match worker_epilogue( + // SUPERBLOCK runner (iterate-3AL). Instead of one + // basic block per slot-visit, chain straight-line + // blocks through their branches up to a deterministic + // instruction budget, yielding back to the round only + // at cross-thread synchronization points. Amortizes + // the per-round (timebase / coord / round_schedule) + // and per-slot (prologue) tax over hundreds of + // instructions instead of ~6. See `run_superblock`. + match run_superblock( wc, kernel, + mem, debugger, + thunk_map, &mut stats, tid, thread_ref, block_ptr, pc_before, - result, - executed, ) { SlotOutcome::Continue => continue, SlotOutcome::BreakOuter => break 'outer, diff --git a/crates/xenia-app/tests/golden/sylpheed_n2m.json b/crates/xenia-app/tests/golden/sylpheed_n2m.json index 4beef5f..a1ec7fc 100644 --- a/crates/xenia-app/tests/golden/sylpheed_n2m.json +++ b/crates/xenia-app/tests/golden/sylpheed_n2m.json @@ -1,5 +1,5 @@ { - "instructions": 2000005, + "instructions": 2000073, "imports": 5635, "unimpl": 0, "draws": 0, diff --git a/crates/xenia-app/tests/golden/sylpheed_n50m.json b/crates/xenia-app/tests/golden/sylpheed_n50m.json index a6e7f75..08f9714 100644 --- a/crates/xenia-app/tests/golden/sylpheed_n50m.json +++ b/crates/xenia-app/tests/golden/sylpheed_n50m.json @@ -1,9 +1,9 @@ { - "instructions": 50000007, - "imports": 333453, + "instructions": 50000110, + "imports": 243387, "unimpl": 0, - "draws": 1274, - "swaps": 259, + "draws": 1279, + "swaps": 260, "unique_render_targets": 2, "shader_blobs_live": 6, "texture_cache_entries": 1 diff --git a/crates/xenia-cpu/src/block_cache.rs b/crates/xenia-cpu/src/block_cache.rs index c4dd5a5..5b4a892 100644 --- a/crates/xenia-cpu/src/block_cache.rs +++ b/crates/xenia-cpu/src/block_cache.rs @@ -79,6 +79,14 @@ pub struct DecodedBlock { /// a successful build (`MAX_BLOCK_INSTRS >= 1` and the build walk /// pushes the first decoded word unconditionally). pub instrs: Vec, + /// True if this block contains a cross-thread synchronization point + /// (`PpcOpcode::is_sync_sensitive`: reserved load/store or a memory + /// barrier). Computed once at build time. The superblock runner ends + /// the run after executing a sync-sensitive block so the lockstep + /// interleaving stays fine-grained at exactly those points (preserving + /// the cross-thread ordering the 2E/2F/2J boot work depends on), + /// while chaining freely through ordinary straight-line blocks. + pub sync_sensitive: bool, } /// Per-slot status from a `lookup_or_build` probe. Internal only. @@ -187,11 +195,13 @@ fn build_block(start_pc: u32, mem: &dyn MemoryAccess, page_version: u64) -> Deco let mut instrs: Vec = Vec::with_capacity(8); let page_base = start_pc & GUEST_PAGE_MASK; let mut cur = start_pc; + let mut sync_sensitive = false; loop { let raw = mem.read_u32(cur); let decoded = decode(raw, cur); let terminates = decoded.opcode.terminates_block(); + sync_sensitive |= decoded.opcode.is_sync_sensitive(); instrs.push(decoded); if terminates { @@ -215,6 +225,7 @@ fn build_block(start_pc: u32, mem: &dyn MemoryAccess, page_version: u64) -> Deco end_pc, page_version, instrs, + sync_sensitive, } } @@ -335,6 +346,40 @@ mod tests { assert_eq!(b.end_pc, 0x110); } + #[test] + fn sync_sensitive_flag_set_for_barrier_block() { + // A block containing `sync` (0x7C0004AC) must flag sync_sensitive + // so the superblock runner ends the chain there (cross-thread + // ordering point). `sync` does NOT terminate a block, so it sits + // mid-block followed by straight-line code up to a terminator. + let mem = BlockTestMem::new(); + mem.put(0x100, enc_addi(3, 3, 1)); + mem.put(0x104, 0x7C00_04AC); // sync + mem.put(0x108, enc_addi(3, 3, 1)); + mem.put(0x10C, enc_b_self()); // terminator + let mut bc = BlockCache::new(); + let b = bc.lookup_or_build(0x100, &mem); + assert!( + b.sync_sensitive, + "block containing `sync` must flag sync_sensitive; decoded last={:?}", + b.instrs.iter().map(|i| i.opcode).collect::>() + ); + } + + #[test] + fn sync_sensitive_flag_clear_for_plain_block() { + // A straight-line ALU block with no reserved-op / barrier must + // NOT flag sync_sensitive (so the superblock runner is free to + // chain through it). + let mem = BlockTestMem::new(); + mem.put(0x100, enc_addi(3, 3, 1)); + mem.put(0x104, enc_addi(3, 3, 1)); + mem.put(0x108, enc_b_self()); + let mut bc = BlockCache::new(); + let b = bc.lookup_or_build(0x100, &mem); + assert!(!b.sync_sensitive, "plain ALU block must not flag sync_sensitive"); + } + #[test] fn block_stops_at_page_boundary() { // Build from 0x1FFC. The next PC (0x2000) is in a different diff --git a/crates/xenia-cpu/src/opcode.rs b/crates/xenia-cpu/src/opcode.rs index e3361ec..7ccf120 100644 --- a/crates/xenia-cpu/src/opcode.rs +++ b/crates/xenia-cpu/src/opcode.rs @@ -204,6 +204,34 @@ impl PpcOpcode { ) } + /// Returns true if this opcode is a cross-thread synchronization + /// point at which the superblock runner MUST yield back to the + /// round-robin scheduler so the lockstep interleaving stays + /// fine-grained enough to preserve correct cross-thread ordering: + /// + /// - reserved load/store (`lwarx`/`ldarx`/`stwcx.`/`stdcx.`): the + /// atomic primitive other threads race on. Running past one + /// without returning to the scheduler would let a single slot + /// win/lose a reservation across many blocks before any peer + /// observes it. + /// - memory barriers (`sync`/`eieio`/`isync`): the guest explicitly + /// demands a global ordering point here; honour it by ending the + /// superblock so the scheduler re-interleaves. + /// + /// Purely a function of the opcode (no guest data), so the yield + /// decision is deterministic and the schedule reproduces byte-identically. + /// Note: `sc` (syscall) and traps already `terminates_block`, and + /// import-thunk / halt-sentinel PCs are handled by the per-block + /// prologue re-check in the superblock loop — they are not listed here. + #[inline] + pub fn is_sync_sensitive(&self) -> bool { + matches!( + self, + Self::lwarx | Self::ldarx | Self::stwcx | Self::stdcx + | Self::sync | Self::eieio | Self::isync + ) + } + pub fn name(&self) -> &'static str { match self { Self::Invalid => "invalid", diff --git a/crates/xenia-memory/src/heap.rs b/crates/xenia-memory/src/heap.rs index 7e107cb..e500916 100644 --- a/crates/xenia-memory/src/heap.rs +++ b/crates/xenia-memory/src/heap.rs @@ -89,6 +89,14 @@ pub struct GuestMemory { mem_watch_addrs: Vec, /// Count of fires observed (for tests / hand-off telemetry). mem_watch_count: AtomicU64, + /// Monotonic count of MMIO accesses (every scalar load/store that + /// resolves to a registered MMIO region bumps this by 1). A pure, + /// deterministic function of guest execution — the superblock runner + /// samples it before/after each block to detect an MMIO touch and + /// end the run there (so MMIO ordering vs other HW threads stays at + /// the same fine lockstep granularity as before). Relaxed because the + /// lockstep path is single-threaded and only needs monotonicity. + mmio_access_count: AtomicU64, } /// Greatest common bit-mask such that `(a & m) == (b & m)` for every bit @@ -133,9 +141,26 @@ impl GuestMemory { writes_total: AtomicU64::new(0), mem_watch_addrs: Vec::new(), mem_watch_count: AtomicU64::new(0), + mmio_access_count: AtomicU64::new(0), }) } + /// Monotonic count of MMIO accesses since boot. Used by the superblock + /// runner to detect that a just-executed block touched MMIO (so it can + /// end the superblock there and keep MMIO ordering at lockstep + /// granularity). Deterministic function of guest execution. + #[inline] + pub fn mmio_access_count(&self) -> u64 { + self.mmio_access_count + .load(std::sync::atomic::Ordering::Relaxed) + } + + #[inline] + fn bump_mmio_access(&self) { + self.mmio_access_count + .fetch_add(1, std::sync::atomic::Ordering::Relaxed); + } + /// Current version watermark for the page containing `addr`. Bumped by /// any write through `write_u8/16/32/64`. Not affected by MMIO writes /// (those don't touch the backing texture memory). @@ -488,6 +513,7 @@ impl MemoryAccess for GuestMemory { // MMIO dispatch must come first — a byte read at an MMIO-mapped // address should invoke the callback, not the backing memory. if let Some(mmio) = self.find_mmio(addr) { + self.bump_mmio_access(); return (mmio.read_callback)(addr) as u8; } if !self.is_mapped(addr) { return 0; } @@ -498,6 +524,7 @@ impl MemoryAccess for GuestMemory { #[inline] fn read_u16(&self, addr: u32) -> u16 { if let Some(mmio) = self.find_mmio(addr) { + self.bump_mmio_access(); (mmio.read_callback)(addr) as u16 } else if !self.is_mapped(addr) { 0 @@ -510,6 +537,7 @@ impl MemoryAccess for GuestMemory { #[inline] fn read_u32(&self, addr: u32) -> u32 { if let Some(mmio) = self.find_mmio(addr) { + self.bump_mmio_access(); (mmio.read_callback)(addr) } else if !self.is_mapped(addr) { 0 @@ -522,6 +550,7 @@ impl MemoryAccess for GuestMemory { #[inline] fn read_u64(&self, addr: u32) -> u64 { if let Some(mmio) = self.find_mmio(addr) { + self.bump_mmio_access(); let hi = (mmio.read_callback)(addr) as u64; let lo = (mmio.read_callback)(addr.wrapping_add(4)) as u64; (hi << 32) | lo @@ -537,6 +566,7 @@ impl MemoryAccess for GuestMemory { // MMIO dispatch first — a byte write at an MMIO-mapped address // must invoke the callback, not the backing memory. if let Some(mmio) = self.find_mmio(addr) { + self.bump_mmio_access(); (mmio.write_callback)(addr, val as u32); return; } @@ -555,6 +585,7 @@ impl MemoryAccess for GuestMemory { fn write_u16(&self, addr: u32, val: u16) { if let Some(mmio) = self.find_mmio(addr) { + self.bump_mmio_access(); (mmio.write_callback)(addr, val as u32); } else if !self.is_mapped(addr) { } else { @@ -577,6 +608,7 @@ impl MemoryAccess for GuestMemory { fn write_u32(&self, addr: u32, val: u32) { if let Some(mmio) = self.find_mmio(addr) { + self.bump_mmio_access(); (mmio.write_callback)(addr, val); } else if !self.is_mapped(addr) { } else { @@ -596,6 +628,7 @@ impl MemoryAccess for GuestMemory { fn write_u64(&self, addr: u32, val: u64) { if let Some(mmio) = self.find_mmio(addr) { + self.bump_mmio_access(); (mmio.write_callback)(addr, (val >> 32) as u32); (mmio.write_callback)(addr.wrapping_add(4), val as u32); } else if !self.is_mapped(addr) {