[iterate-3AK] Perf quick-wins: ~21% faster boot-to-splash (22→27 MIPS)
Profile-driven low-risk optimizations attacking the ~48% per-block / per-round host-bookkeeping tax found by the callgrind profile. Measured on the bounded headless workload `check -n 100000000 --gpu-inline`: baseline ~4490 ms (22.3 MIPS) -> ~3700 ms (27.0 MIPS), +21%. Tier A (determinism-neutral; n50m golden byte-IDENTICAL, exit 0): 1. mem-watch write path: gate capture_mem_watch_old/check_mem_watch behind one has_mem_watch() predicted branch in write_u8/16/32/64 + write_bulk so the common (no-watch) store does no out-of-line call. check_mem_watch (4.8%) gone from the profile. 2. round-schedule alloc churn: add Scheduler::round_schedule_into filling a reusable [u8; HW_THREAD_COUNT] stack buffer; the lockstep round loop no longer __rust_alloc/__rust_dealloc a Vec<u8> per round. Identical ordering/RNG-advance. __rust_alloc/dealloc gone from the profile. 3. probe-firing: hoist a single KernelState::any_probe_active() guard to worker_prologue so the four fire_*_if_match calls don't happen at all when no probe is configured (was 4x call overhead/visit). All four gone from the profile. 4. thunk-map hash: range-reject pc against the registered import-thunk address band (KernelState::pc_in_thunk_band, two int compares) before the thunk_map.get(&pc) HashMap lookup. hash_one (4.3%) gone. Tier B (#5, time-granularity change — LANDED, no re-baseline needed): 5. update_timestamp_bundle: throttle to a 0.25 ms quantum (only re-write the KeTimeStampBundle when the deterministic clock advanced >= 2500 units). Inclusive cost 8.65% -> 1.08%. The quantum is far below the 1 ms granularity any guest deadline math needs (tick_count stays fresh; the hub gate is +66 ms; the fade-in is vsync-counter driven per 3AH, not this bundle). VERIFIED: n50m stable digest BYTE-IDENTICAL to the existing golden (so no re-baseline), 150M boot reaches the splash (draws=7415, swaps=2172, gpu.texture.decode{K8888}=448, RTs=2 — all match the post-3AJ baseline), 688 tests green, release n50m oracle ok. Remaining headroom: interpreter::execute (13%), decrement_quantum (8%), step_block (7%) are now the top self-costs — the structural superblock/ JIT lever is the next step for the larger gain. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -2459,10 +2459,19 @@ fn worker_prologue(
|
||||
// and println one record. Read-only; lockstep digest unaffected.
|
||||
// Empty set is the common case → single `is_empty()` test inside
|
||||
// the helper, no overhead on the hot path.
|
||||
// Perf (Tier-A #3): all four `fire_*_if_match` helpers early-return
|
||||
// on an empty registry, but paying 4× call overhead per slot-visit
|
||||
// (~3.2M visits boot-to-splash) is itself measurable. Gate the whole
|
||||
// group behind a single `any_probe_active()` predicted branch so the
|
||||
// common (no-probe) headless path never even makes the calls. When a
|
||||
// probe IS configured each helper still re-checks its own set, so
|
||||
// behaviour is identical either way.
|
||||
if kernel.any_probe_active() {
|
||||
kernel.fire_ctor_probe_if_match(hw_id, mem);
|
||||
kernel.fire_branch_probe_if_match(hw_id);
|
||||
kernel.fire_audit_pc_probe_if_match(hw_id, mem);
|
||||
kernel.fire_lr_trace_if_match(hw_id);
|
||||
}
|
||||
|
||||
if mem.has_mem_watch() {
|
||||
let ctx = kernel.scheduler.ctx(hw_id);
|
||||
@@ -2528,8 +2537,15 @@ fn worker_prologue(
|
||||
return PrologueOutcome::Continue;
|
||||
}
|
||||
|
||||
// 2) Import thunk intercept.
|
||||
if let Some((module, ordinal, name)) = thunk_map.get(&pc) {
|
||||
// 2) Import thunk intercept. Perf (Tier-A #4): import thunks occupy a
|
||||
// small contiguous address band; the overwhelming majority of executing
|
||||
// PCs are ordinary guest code outside it. Range-reject against the band
|
||||
// (two integer compares) before paying the `thunk_map` hash. Faithful
|
||||
// no-op — any in-band PC still goes through the exact map lookup, and an
|
||||
// out-of-band PC can never be a registered thunk.
|
||||
if kernel.pc_in_thunk_band(pc)
|
||||
&& let Some((module, ordinal, name)) = thunk_map.get(&pc)
|
||||
{
|
||||
let module = *module;
|
||||
let ordinal_u32 = *ordinal as u32;
|
||||
let thunk_pc = pc;
|
||||
@@ -2854,6 +2870,10 @@ fn run_execution(
|
||||
// re-decoding the same handful of pages 60×/s.
|
||||
let mut isr_decode_cache = xenia_cpu::decoder::DecodeCache::new();
|
||||
|
||||
// Tier-A perf #2: reusable buffer for `round_schedule_into` so the round
|
||||
// loop doesn't heap-allocate a `Vec<u8>` every iteration.
|
||||
let mut order_buf = [0u8; xenia_cpu::scheduler::HW_THREAD_COUNT];
|
||||
|
||||
'outer: loop {
|
||||
// Per-round prologue: budget / shutdown / heartbeat / vsync /
|
||||
// timers / audio-interrupt injection. Carved into
|
||||
@@ -2908,10 +2928,12 @@ fn run_execution(
|
||||
thunk_map,
|
||||
);
|
||||
|
||||
// Snapshot round schedule. `round_schedule` also advances rng state
|
||||
// when seeded; mutation is intentional.
|
||||
// Snapshot round schedule. `round_schedule_into` also advances rng
|
||||
// state when seeded; mutation is intentional. Perf (Tier-A #2): fill
|
||||
// a reusable stack array instead of allocating a fresh Vec per round.
|
||||
kernel.scheduler.begin_round();
|
||||
let order = kernel.scheduler.round_schedule();
|
||||
let order_n = kernel.scheduler.round_schedule_into(&mut order_buf);
|
||||
let order = &order_buf[..order_n];
|
||||
|
||||
if order.is_empty() {
|
||||
// No Ready threads — advance time to the earliest pending
|
||||
@@ -2933,7 +2955,7 @@ fn run_execution(
|
||||
// GPU when block dispatch engages.
|
||||
let instrs_at_round_start = stats.instruction_count;
|
||||
|
||||
for hw_id in order {
|
||||
for &hw_id in order {
|
||||
let wc = &mut workers[hw_id as usize];
|
||||
match worker_prologue(
|
||||
wc,
|
||||
|
||||
@@ -795,31 +795,46 @@ impl Scheduler {
|
||||
/// the fast path — zero bits mean no slot has work and the caller
|
||||
/// falls through to `advance_to_next_wake`.
|
||||
pub fn round_schedule(&mut self) -> Vec<u8> {
|
||||
let mut buf = [0u8; HW_THREAD_COUNT];
|
||||
let n = self.round_schedule_into(&mut buf);
|
||||
buf[..n].to_vec()
|
||||
}
|
||||
|
||||
/// Allocation-free variant of [`Self::round_schedule`] (Tier-A perf #2).
|
||||
/// Fills `buf` with the runnable slot ids and returns the count `n`; the
|
||||
/// valid range is `buf[..n]`. The hot scheduler loop (lockstep +
|
||||
/// parallel) calls this with a reusable stack array so it does not
|
||||
/// `__rust_alloc`/`__rust_dealloc` a fresh `Vec` every round (~7 instr
|
||||
/// apart at boot-to-splash → millions of churned allocations). Identical
|
||||
/// ordering / RNG-advance semantics to `round_schedule`, so the schedule
|
||||
/// — and thus the lockstep digest — is byte-for-byte unchanged.
|
||||
pub fn round_schedule_into(&mut self, buf: &mut [u8; HW_THREAD_COUNT]) -> usize {
|
||||
if self.non_empty_runnable == 0 {
|
||||
return Vec::new();
|
||||
return 0;
|
||||
}
|
||||
let start = self.rotation_cursor as usize;
|
||||
let mut out: Vec<u8> = Vec::with_capacity(HW_THREAD_COUNT);
|
||||
let mut n = 0usize;
|
||||
for off in 0..HW_THREAD_COUNT {
|
||||
let i = (start + off) % HW_THREAD_COUNT;
|
||||
if self.non_empty_runnable & (1 << i) != 0 {
|
||||
out.push(i as u8);
|
||||
buf[n] = i as u8;
|
||||
n += 1;
|
||||
}
|
||||
}
|
||||
// Seeded mode layers a deterministic shuffle on top of the
|
||||
// already-filtered list. Same spawn/wake sequence + same seed ⇒
|
||||
// same schedule (invariant preserved from pre-Axis-1).
|
||||
if let OrderMode::Seeded { .. } = self.order {
|
||||
for i in (1..out.len()).rev() {
|
||||
for i in (1..n).rev() {
|
||||
self.rng_state ^= self.rng_state << 13;
|
||||
self.rng_state ^= self.rng_state >> 7;
|
||||
self.rng_state ^= self.rng_state << 17;
|
||||
let j = (self.rng_state as usize) % (i + 1);
|
||||
out.swap(i, j);
|
||||
buf.swap(i, j);
|
||||
}
|
||||
}
|
||||
self.rotation_cursor = ((start + 1) % HW_THREAD_COUNT) as u8;
|
||||
out
|
||||
n
|
||||
}
|
||||
|
||||
pub fn begin_round(&mut self) {
|
||||
|
||||
@@ -219,6 +219,17 @@ pub struct KernelState {
|
||||
/// only). Used by `xex_get_procedure_address` to resolve ordinals back
|
||||
/// to callable thunks.
|
||||
thunks_by_ordinal: HashMap<(ModuleId, u16), u32>,
|
||||
|
||||
/// Perf (Tier-A #4): inclusive [min, max] guest-address band that
|
||||
/// contains every registered import thunk. Import thunks sit in a
|
||||
/// small contiguous region of the XEX; almost every executing PC is
|
||||
/// ordinary guest code OUTSIDE this band. The per-slot-visit prologue
|
||||
/// looks up `thunk_map.get(&pc)` (a `HashMap<u32,…>` → `hash_one` per
|
||||
/// call, ~3.2M visits boot-to-splash). Range-rejecting against this
|
||||
/// band first turns the common (non-thunk) case into a pair of integer
|
||||
/// compares and skips the hash entirely. `None` until the first thunk
|
||||
/// is registered (no band → reject everything, matching an empty map).
|
||||
thunk_addr_band: Option<(u32, u32)>,
|
||||
/// First-Pixels diagnostic latch. Set the first time
|
||||
/// `RtlRaiseException` fires with code `0xE06D7363` (MSVC C++ throw)
|
||||
/// so the deep stack-walk + `runtime_error` decode in
|
||||
@@ -374,6 +385,15 @@ pub struct KernelState {
|
||||
/// block every round from the deterministic `global_clock` via
|
||||
/// [`Self::update_timestamp_bundle`].
|
||||
pub timestamp_bundle_addr: u32,
|
||||
|
||||
/// Perf (Tier-B #5) throttle state for [`Self::update_timestamp_bundle`].
|
||||
/// Holds the `clock` value at which the bundle was last actually written;
|
||||
/// `u64::MAX` is the "never written" sentinel (forces the first write).
|
||||
/// `AtomicU64` (not `Cell`) so the `&self` update path stays `Sync` for
|
||||
/// the parallel `Arc<Mutex<KernelState>>` usage. Only ever advanced
|
||||
/// forward under the kernel lock, so `Relaxed` ordering is sufficient and
|
||||
/// the sequence is deterministic.
|
||||
timestamp_bundle_last_clock: std::sync::atomic::AtomicU64,
|
||||
}
|
||||
|
||||
/// ITERATE-2C Phase D — one queued auto-signal. `deadline_cycle` is
|
||||
@@ -439,6 +459,7 @@ impl KernelState {
|
||||
audit: HandleAudit::default(),
|
||||
reservations,
|
||||
thunks_by_ordinal: HashMap::new(),
|
||||
thunk_addr_band: None,
|
||||
cxx_throw_logged: false,
|
||||
ring_base: 0,
|
||||
ring_size_dwords: 0,
|
||||
@@ -465,6 +486,7 @@ impl KernelState {
|
||||
last_cycle_hint: 0,
|
||||
silph_autosignal_diag_logged: false,
|
||||
timestamp_bundle_addr: 0,
|
||||
timestamp_bundle_last_clock: std::sync::atomic::AtomicU64::new(u64::MAX),
|
||||
};
|
||||
crate::exports::register_exports(&mut state);
|
||||
crate::xam::register_exports(&mut state);
|
||||
@@ -584,6 +606,25 @@ impl KernelState {
|
||||
/// emits each ordinal once per module).
|
||||
pub fn register_thunk(&mut self, module: ModuleId, ordinal: u16, address: u32) {
|
||||
self.thunks_by_ordinal.insert((module, ordinal), address);
|
||||
// Widen the thunk address band (Tier-A #4) so the hot prologue can
|
||||
// range-reject non-thunk PCs before hashing the thunk map.
|
||||
self.thunk_addr_band = Some(match self.thunk_addr_band {
|
||||
Some((lo, hi)) => (lo.min(address), hi.max(address)),
|
||||
None => (address, address),
|
||||
});
|
||||
}
|
||||
|
||||
/// Perf (Tier-A #4). Cheap pre-filter for the per-slot-visit import-thunk
|
||||
/// dispatch: `false` guarantees `pc` is NOT a registered thunk (so the
|
||||
/// caller can skip the `thunk_map.get(&pc)` hash). `true` means `pc` lies
|
||||
/// within the registered thunk address band and the map must be consulted
|
||||
/// for an exact match. Conservative — never a false negative.
|
||||
#[inline]
|
||||
pub fn pc_in_thunk_band(&self, pc: u32) -> bool {
|
||||
match self.thunk_addr_band {
|
||||
Some((lo, hi)) => pc >= lo && pc <= hi,
|
||||
None => false,
|
||||
}
|
||||
}
|
||||
|
||||
/// Resolve a `(module, ordinal)` to its registered thunk address.
|
||||
@@ -919,6 +960,31 @@ impl KernelState {
|
||||
return;
|
||||
}
|
||||
const INSTRUCTIONS_PER_MS: u64 = 10_000;
|
||||
// Perf (Tier-B #5): the bundle is updated once per scheduler round
|
||||
// (~every 7 retired instructions), but the four guest BE memory
|
||||
// writes are ~8.6% of boot-to-splash. `clock` is the retired-
|
||||
// instruction count, so consecutive rounds rewrite essentially the
|
||||
// same staircase. Throttle to a 0.25 ms quantum: only re-write when
|
||||
// `clock` advanced by >= INSTRUCTIONS_PER_MS/4 (2500 units) since the
|
||||
// last write. This keeps `tick_count` (ms, changes every 10_000
|
||||
// units) ALWAYS fresh and `interrupt_time`/`system_time` monotone at
|
||||
// 0.25 ms granularity — finer than any guest deadline math needs
|
||||
// (`parse_timeout` works in whole ms; the hub gate is `+66 ms`). The
|
||||
// fade-in (3AH-proven vsync-counter driven, NOT this bundle) is
|
||||
// untouched. Throttle threshold is well below 1 ms so no guest-
|
||||
// visible ms boundary is ever skipped.
|
||||
const BUNDLE_QUANTUM: u64 = INSTRUCTIONS_PER_MS / 4; // 2500 units = 0.25 ms
|
||||
{
|
||||
use std::sync::atomic::Ordering;
|
||||
let last = self.timestamp_bundle_last_clock.load(Ordering::Relaxed);
|
||||
// Always allow the first write (last == u64::MAX sentinel) and any
|
||||
// write that crosses the quantum. Never go backwards.
|
||||
if last != u64::MAX && clock < last.saturating_add(BUNDLE_QUANTUM) {
|
||||
return;
|
||||
}
|
||||
self.timestamp_bundle_last_clock
|
||||
.store(clock, Ordering::Relaxed);
|
||||
}
|
||||
// FILETIME epoch base (~2021) so `system_time` is a plausible
|
||||
// absolute wall-clock; matches the constant used by
|
||||
// `ke_query_system_time`. interrupt_time is "since boot" so it
|
||||
@@ -1042,6 +1108,24 @@ impl KernelState {
|
||||
}
|
||||
}
|
||||
|
||||
/// Perf gate (Tier-A quick-win #3). `true` iff any of the four
|
||||
/// per-slot-visit diagnostic probe registries
|
||||
/// (`ctor_probe_pcs` / `branch_probe_pcs` / `audit_pc_probe_pcs`
|
||||
/// / `lr_trace_pcs`) holds at least one PC. The common headless
|
||||
/// run leaves all four empty, so the prologue can skip the four
|
||||
/// `fire_*_if_match` calls entirely with this single predicted
|
||||
/// branch — avoiding 4× call overhead per slot-visit (~3.2M
|
||||
/// visits over boot-to-splash) when no probe is configured.
|
||||
/// Purely a fast-path guard; each `fire_*` still re-checks its own
|
||||
/// set, so behaviour is identical whether or not the caller gates.
|
||||
#[inline]
|
||||
pub fn any_probe_active(&self) -> bool {
|
||||
!self.ctor_probe_pcs.is_empty()
|
||||
|| !self.branch_probe_pcs.is_empty()
|
||||
|| !self.audit_pc_probe_pcs.is_empty()
|
||||
|| !self.lr_trace_pcs.is_empty()
|
||||
}
|
||||
|
||||
/// Diagnostic. If the live PC for HW slot `hw_id` is in
|
||||
/// `self.ctor_probe_pcs`, emit a single `CTOR-PROBE` line with
|
||||
/// the current cycle, tid, hw_id, sp, r3, lr, plus an 8-frame
|
||||
|
||||
@@ -357,7 +357,8 @@ impl GuestMemory {
|
||||
/// from `GuestMemory` without a wider plumbing change).
|
||||
pub fn write_bulk(&self, addr: u32, buf: &[u8]) {
|
||||
let len = buf.len() as u32;
|
||||
let old_lane = self.capture_mem_watch_old(addr, len);
|
||||
let watch = self.has_mem_watch();
|
||||
let old_lane = if watch { self.capture_mem_watch_old(addr, len) } else { None };
|
||||
let ptr = self.translate_virtual_mut(addr);
|
||||
unsafe {
|
||||
std::ptr::copy_nonoverlapping(buf.as_ptr(), ptr, buf.len());
|
||||
@@ -374,7 +375,7 @@ impl GuestMemory {
|
||||
// the page works.
|
||||
self.bump_page_version(page * PAGE_SIZE);
|
||||
}
|
||||
self.check_mem_watch(addr, len, old_lane);
|
||||
if watch { self.check_mem_watch(addr, len, old_lane); }
|
||||
}
|
||||
|
||||
/// Check if a guest address has been allocated/committed. Acquire load
|
||||
@@ -540,11 +541,16 @@ impl MemoryAccess for GuestMemory {
|
||||
return;
|
||||
}
|
||||
if !self.is_mapped(addr) { return; }
|
||||
let old_lane = self.capture_mem_watch_old(addr, 1);
|
||||
// Perf (Tier-A #1): the mem-watch capture/report pair are out-of-line
|
||||
// calls; on the common (no-watch) path each was a real call that
|
||||
// immediately returned. Gate both behind one predicted branch so the
|
||||
// hot store does no call work unless a watch is actually armed.
|
||||
let watch = self.has_mem_watch();
|
||||
let old_lane = if watch { self.capture_mem_watch_old(addr, 1) } else { None };
|
||||
let ptr = self.translate_virtual_mut(addr);
|
||||
unsafe { *ptr = val };
|
||||
self.bump_page_version(addr);
|
||||
self.check_mem_watch(addr, 1, old_lane);
|
||||
if watch { self.check_mem_watch(addr, 1, old_lane); }
|
||||
}
|
||||
|
||||
fn write_u16(&self, addr: u32, val: u16) {
|
||||
@@ -552,7 +558,8 @@ impl MemoryAccess for GuestMemory {
|
||||
(mmio.write_callback)(addr, val as u32);
|
||||
} else if !self.is_mapped(addr) {
|
||||
} else {
|
||||
let old_lane = self.capture_mem_watch_old(addr, 2);
|
||||
let watch = self.has_mem_watch();
|
||||
let old_lane = if watch { self.capture_mem_watch_old(addr, 2) } else { None };
|
||||
let ptr = self.translate_virtual_mut(addr);
|
||||
unsafe {
|
||||
std::ptr::copy_nonoverlapping(val.to_be_bytes().as_ptr(), ptr, 2);
|
||||
@@ -564,7 +571,7 @@ impl MemoryAccess for GuestMemory {
|
||||
if (addr & 0xFFF) >= (PAGE_SIZE - 1) {
|
||||
self.bump_page_version(addr.wrapping_add(1));
|
||||
}
|
||||
self.check_mem_watch(addr, 2, old_lane);
|
||||
if watch { self.check_mem_watch(addr, 2, old_lane); }
|
||||
}
|
||||
}
|
||||
|
||||
@@ -573,7 +580,8 @@ impl MemoryAccess for GuestMemory {
|
||||
(mmio.write_callback)(addr, val);
|
||||
} else if !self.is_mapped(addr) {
|
||||
} else {
|
||||
let old_lane = self.capture_mem_watch_old(addr, 4);
|
||||
let watch = self.has_mem_watch();
|
||||
let old_lane = if watch { self.capture_mem_watch_old(addr, 4) } else { None };
|
||||
let ptr = self.translate_virtual_mut(addr);
|
||||
unsafe {
|
||||
std::ptr::copy_nonoverlapping(val.to_be_bytes().as_ptr(), ptr, 4);
|
||||
@@ -582,7 +590,7 @@ impl MemoryAccess for GuestMemory {
|
||||
if (addr & 0xFFF) >= (PAGE_SIZE - 3) {
|
||||
self.bump_page_version(addr.wrapping_add(3));
|
||||
}
|
||||
self.check_mem_watch(addr, 4, old_lane);
|
||||
if watch { self.check_mem_watch(addr, 4, old_lane); }
|
||||
}
|
||||
}
|
||||
|
||||
@@ -592,7 +600,8 @@ impl MemoryAccess for GuestMemory {
|
||||
(mmio.write_callback)(addr.wrapping_add(4), val as u32);
|
||||
} else if !self.is_mapped(addr) {
|
||||
} else {
|
||||
let old_lane = self.capture_mem_watch_old(addr, 8);
|
||||
let watch = self.has_mem_watch();
|
||||
let old_lane = if watch { self.capture_mem_watch_old(addr, 8) } else { None };
|
||||
let ptr = self.translate_virtual_mut(addr);
|
||||
unsafe {
|
||||
std::ptr::copy_nonoverlapping(val.to_be_bytes().as_ptr(), ptr, 8);
|
||||
@@ -601,7 +610,7 @@ impl MemoryAccess for GuestMemory {
|
||||
if (addr & 0xFFF) >= (PAGE_SIZE - 7) {
|
||||
self.bump_page_version(addr.wrapping_add(7));
|
||||
}
|
||||
self.check_mem_watch(addr, 8, old_lane);
|
||||
if watch { self.check_mem_watch(addr, 8, old_lane); }
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user