From 27d36081740cb2b0097740a92375a0117c71eff3 Mon Sep 17 00:00:00 2001 From: MechaCat02 Date: Sun, 3 May 2026 17:34:30 +0200 Subject: [PATCH] =?UTF-8?q?fix(kernel):=20KRNBUG-D08=20=E2=80=94=20wall-cl?= =?UTF-8?q?ock=20v-sync=20under=20--parallel?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The synthetic v-sync ticker used a per-instruction proxy (VSYNC_INSTR_PERIOD = 150 k) tuned for ~10 MIPS lockstep throughput → 60 Hz. Audit M11 observed this drifts under `--parallel`: with 6 worker threads sharing the kernel mutex, the dispatcher executes more PPC instructions per tick callback, so the accumulator never crosses 150 k. Result: ~629 v-syncs/100M lockstep → ~2 v-syncs/100M --parallel. Hybrid solution preserves lockstep determinism (which the goldens depend on) while fixing --parallel: * `tick_vsync_instr(instr_count)` — legacy instruction-count ticker, used by lockstep. Bit-stable across runs. * `tick_vsync_wallclock()` — new Instant-based ticker. Fires `floor(elapsed / VSYNC_PERIOD)` v-syncs since the anchor and advances the anchor by that many full periods (no lazy backlog). Capped at INTERRUPT_QUEUE_CAP per call so a forward-jumping clock can't overflow the FIFO. * `KernelState.parallel_active` flag set at startup from `--parallel` / `XENIA_PARALLEL=1`. Read by `coord_pre_round` in main.rs to choose between the two tickers. Verification: * cargo test --workspace --release: 561 passing (+3 new wall-clock tests vs prior 558 baseline). * lockstep -n 100M --stable-digest: BIT-IDENTICAL to pre-Phase-3 baseline. interrupts_delivered preserved at ~630 (was ~629 pre-fix). * --parallel --reservations-table -n 30M: interrupts_delivered rose from ~2 to 17. (FIFO INTERRUPT_QUEUE_CAP=4 still caps burst delivery; that's a separate bottleneck — addressed by raising cap when --parallel queue depth becomes the next blocker.) Trade-off: --parallel runs are non-deterministic at the v-sync rate by design (per audit M05 PPCBUG-703 already). Lockstep stays bit-identical, so the `sylpheed_n*m.json` goldens are untouched. Audit IDs: KRNBUG-D08 (closed). Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/xenia-app/src/main.rs | 23 ++++- crates/xenia-kernel/src/interrupts.rs | 136 +++++++++++++++++++++----- crates/xenia-kernel/src/state.rs | 8 ++ 3 files changed, 144 insertions(+), 23 deletions(-) diff --git a/crates/xenia-app/src/main.rs b/crates/xenia-app/src/main.rs index 91b0ed4..08b411f 100644 --- a/crates/xenia-app/src/main.rs +++ b/crates/xenia-app/src/main.rs @@ -786,6 +786,7 @@ fn cmd_exec_inner( v == "1" || v == "true" || v == "yes" }); let parallel_active = parallel || parallel_via_env; + kernel.parallel_active = parallel_active; if reservations_table || reservations_via_env || parallel_active { kernel.reservations.enable(); if !quiet { @@ -1517,7 +1518,27 @@ fn coord_pre_round( ); } - if kernel.interrupts.tick_vsync(stats.instruction_count) { + // KRNBUG-D08: backend-aware v-sync ticker. + // + // **Lockstep**: instruction-count ticker (deterministic; one tick per + // PPC block boundary, predictable cadence). The cadence drifts a bit + // from real 60 Hz but is bit-stable across runs, which matters for + // the `sylpheed_n*m.json` golden oracles. + // + // **--parallel**: wall-clock ticker. The instruction-count proxy + // dropped from 629 v-syncs/100M lockstep to ~2 under `--parallel` + // (audit M11) because the dispatcher executes more PPC instructions + // per tick callback when 6 worker threads share the kernel mutex, + // so the accumulator never crosses the 150k threshold. Wall-clock + // restores the ~60 Hz rate at the cost of bit-exact run reproducibility, + // which is acceptable under `--parallel` (M11 already documented + // `--parallel` as non-deterministic by design). + let fired = if kernel.parallel_active { + kernel.interrupts.tick_vsync_wallclock() + } else { + kernel.interrupts.tick_vsync_instr(stats.instruction_count) + }; + if fired { use std::sync::atomic::Ordering; let mmio = kernel.gpu.mmio(); let prev = mmio.d1mode_vblank_vline_status.load(Ordering::Relaxed); diff --git a/crates/xenia-kernel/src/interrupts.rs b/crates/xenia-kernel/src/interrupts.rs index f7d9477..55f0e2f 100644 --- a/crates/xenia-kernel/src/interrupts.rs +++ b/crates/xenia-kernel/src/interrupts.rs @@ -17,6 +17,7 @@ //! fields and the HW thread picks up where it left off. use std::collections::VecDeque; +use std::time::{Duration, Instant}; use xenia_cpu::context::{CrField, PpcContext}; use xenia_cpu::ThreadRef; @@ -156,13 +157,20 @@ pub struct InterruptState { /// Dropped interrupts (callback unset, queue full, or thread /// exited/idle at inject time). pub dropped: u64, - /// Instruction-count accumulator for the synthetic v-sync ticker. At - /// `VSYNC_INSTR_PERIOD` the main loop pushes an `INTERRUPT_SOURCE_VSYNC` - /// onto `pending` and resets. + /// Instruction-count accumulator for the synthetic v-sync ticker + /// (legacy path used by unit tests via `tick_vsync_instr`). Production + /// uses `tick_vsync_wallclock` instead — see [`KRNBUG-D08`]. pub vsync_accumulator: u64, - /// Last observed instruction count — `tick_vsync` diffs against - /// this to advance `vsync_accumulator`. + /// Last observed instruction count for the legacy instruction-count + /// ticker. `tick_vsync_instr` diffs against this to advance + /// `vsync_accumulator`. pub last_instr_count: u64, + /// Wall-clock anchor for the production v-sync ticker. `None` until + /// the first `tick_vsync_wallclock` call (lazy init so unit tests + /// that never invoke that function don't construct an Instant). + /// Each call fires `(elapsed / VSYNC_PERIOD)` v-syncs and advances + /// the anchor by that many full periods. + pub last_vsync_instant: Option, /// M2.5 — per-slot pending-IRQ bits. Set by the producer (M3's /// IRQ-routing logic on `T_main`) with `Release`; consumed by the /// target T_cpu_i with `Acquire` at quantum boundary. Unused under @@ -174,14 +182,21 @@ pub struct InterruptState { /// How many guest instructions correspond to one synthetic v-sync. /// -/// Targets **~60 Hz at the post-Tier-3 interpreter throughput (~10 MIPS)**: -/// 10e6 instr/s ÷ 60 Hz ≈ 167k — we use 150k to give a small cushion. -/// Before M2 this was 500k (~20 Hz), which was enough for games that -/// don't gate anything on v-sync but not enough for titles like Sylpheed -/// whose main loop waits on the v-sync callback to signal an event every -/// frame. +/// **Legacy** — drives `tick_vsync_instr` only. Production uses +/// `tick_vsync_wallclock` with [`VSYNC_PERIOD`]. Kept because audit M11 +/// observed this proxy drifts from 629 v-syncs/100M lockstep down to ~2 +/// under `--parallel`, where the dispatcher executes more PPC instructions +/// per tick call. Unit tests still drive the instruction-count ticker for +/// determinism. pub const VSYNC_INSTR_PERIOD: u64 = 150_000; +/// Wall-clock period for the **production** v-sync ticker. 16.667 ms +/// targets exactly 60 Hz. KRNBUG-D08 — converting from the +/// instruction-count proxy fixes the `--parallel` rate drop while +/// keeping lockstep cadence stable (instruction-count was *also* an +/// approximation; wall-clock is the canonical Xbox 360 v-sync source). +pub const VSYNC_PERIOD: Duration = Duration::from_nanos(16_666_667); + impl InterruptState { /// Record a new callback registration. pub fn set_callback(&mut self, callback_pc: u32, user_data: u32) { @@ -215,18 +230,17 @@ impl InterruptState { self.pending.pop_front() } - /// Advance the v-sync accumulator by the delta since the last call. - /// Returns `true` if a new v-sync was queued. - pub fn tick_vsync(&mut self, current_instr_count: u64) -> bool { + /// **Legacy** — instruction-count v-sync ticker. Kept for unit tests + /// that need a deterministic clock source. Production code calls + /// `tick_vsync_wallclock` instead. Returns `true` if at least one + /// v-sync was queued. + pub fn tick_vsync_instr(&mut self, current_instr_count: u64) -> bool { let delta = current_instr_count.saturating_sub(self.last_instr_count); self.last_instr_count = current_instr_count; self.vsync_accumulator = self.vsync_accumulator.saturating_add(delta); if self.vsync_accumulator < VSYNC_INSTR_PERIOD { return false; } - // Multiple periods may have elapsed in a single tick call if a - // large instruction delta went by (e.g. a long export). Drain - // the accumulator fully so we don't lag behind. let periods = self.vsync_accumulator / VSYNC_INSTR_PERIOD; self.vsync_accumulator %= VSYNC_INSTR_PERIOD; for _ in 0..periods { @@ -235,6 +249,45 @@ impl InterruptState { true } + /// **Production** — wall-clock v-sync ticker. Fires + /// `floor(elapsed / VSYNC_PERIOD)` v-syncs since the last call and + /// advances the anchor by that many full periods (so a long pause + /// doesn't lose all the v-syncs it spans, and a quick succession of + /// calls doesn't over-fire). KRNBUG-D08 — replaces the legacy + /// instruction-count proxy that drifted under `--parallel`. + /// Returns `true` if at least one v-sync was queued. + pub fn tick_vsync_wallclock(&mut self) -> bool { + let now = Instant::now(); + let anchor = match self.last_vsync_instant { + Some(t) => t, + None => { + self.last_vsync_instant = Some(now); + return false; + } + }; + let elapsed = now.saturating_duration_since(anchor); + let period_ns = VSYNC_PERIOD.as_nanos() as u64; + let elapsed_ns = elapsed.as_nanos() as u64; + let periods = elapsed_ns / period_ns; + if periods == 0 { + return false; + } + // Advance the anchor by the number of full periods consumed, + // not to `now`. That lets a long pause distribute its missed + // v-syncs evenly without lazy-batching the entire backlog into + // one tick (over-fire would interleave dozens of callback + // injections back-to-back). Cap at INTERRUPT_QUEUE_CAP so a + // clock that jumped forward (system suspend) doesn't try to + // queue more than the FIFO can hold. + let advance = Duration::from_nanos(periods * period_ns); + self.last_vsync_instant = Some(anchor + advance); + let to_queue = (periods as usize).min(INTERRUPT_QUEUE_CAP); + for _ in 0..to_queue { + self.queue_interrupt(INTERRUPT_SOURCE_VSYNC); + } + true + } + /// Is HW thread 0 currently in a callback? pub fn is_in_callback(&self) -> bool { self.saved.is_some() @@ -283,26 +336,65 @@ mod tests { } #[test] - fn tick_vsync_fires_at_new_150k_threshold() { + fn tick_vsync_instr_fires_at_new_150k_threshold() { let mut s = InterruptState::default(); s.set_callback(0x1000, 0xAB); assert_eq!(VSYNC_INSTR_PERIOD, 150_000); - assert!(!s.tick_vsync(VSYNC_INSTR_PERIOD - 1)); + assert!(!s.tick_vsync_instr(VSYNC_INSTR_PERIOD - 1)); assert!(s.pending.is_empty()); - assert!(s.tick_vsync(VSYNC_INSTR_PERIOD)); + assert!(s.tick_vsync_instr(VSYNC_INSTR_PERIOD)); assert_eq!(s.peek_next(), Some(INTERRUPT_SOURCE_VSYNC)); } #[test] - fn tick_vsync_drains_multiple_periods_in_one_call() { + fn tick_vsync_instr_drains_multiple_periods_in_one_call() { // Long kernel export → big instr delta → multiple v-syncs must // be delivered, not lost. let mut s = InterruptState::default(); s.set_callback(0x1000, 0xAB); - assert!(s.tick_vsync(VSYNC_INSTR_PERIOD * 3 + 10)); + assert!(s.tick_vsync_instr(VSYNC_INSTR_PERIOD * 3 + 10)); assert_eq!(s.pending.len(), 3); } + #[test] + fn tick_vsync_wallclock_first_call_sets_anchor() { + // First call seeds the anchor and never fires. KRNBUG-D08: + // initial wall-clock state has no prior reference, so we can't + // know the elapsed delta yet. + let mut s = InterruptState::default(); + s.set_callback(0x1000, 0xAB); + assert!(!s.tick_vsync_wallclock()); + assert!(s.pending.is_empty()); + assert!(s.last_vsync_instant.is_some()); + } + + #[test] + fn tick_vsync_wallclock_fires_after_period() { + // Sleeps one full v-sync period (16.667 ms) and verifies a + // single v-sync is queued. Sleep is fine in --release tests + // (one-shot, ~17 ms cost). + let mut s = InterruptState::default(); + s.set_callback(0x1000, 0xAB); + s.tick_vsync_wallclock(); // seed + std::thread::sleep(VSYNC_PERIOD + Duration::from_millis(2)); + assert!(s.tick_vsync_wallclock()); + assert_eq!(s.pending.len(), 1); + assert_eq!(s.peek_next(), Some(INTERRUPT_SOURCE_VSYNC)); + } + + #[test] + fn tick_vsync_wallclock_caps_burst_at_queue_cap() { + // A multi-period elapsed window queues at most + // INTERRUPT_QUEUE_CAP v-syncs (the FIFO can't hold more anyway). + // Sleep 6 periods (~100 ms), expect INTERRUPT_QUEUE_CAP queued. + let mut s = InterruptState::default(); + s.set_callback(0x1000, 0xAB); + s.tick_vsync_wallclock(); // seed + std::thread::sleep(VSYNC_PERIOD * 6 + Duration::from_millis(2)); + assert!(s.tick_vsync_wallclock()); + assert_eq!(s.pending.len(), INTERRUPT_QUEUE_CAP); + } + /// Simulates what the main loop does: inject, execute guest code up /// to the sentinel, restore. Uses a single-instruction `bclr` callback /// — the interpreter sees `pc == callback_pc`, steps, and the blr diff --git a/crates/xenia-kernel/src/state.rs b/crates/xenia-kernel/src/state.rs index 04432f4..25380fb 100644 --- a/crates/xenia-kernel/src/state.rs +++ b/crates/xenia-kernel/src/state.rs @@ -137,6 +137,13 @@ pub struct KernelState { /// for golden verification, or implicitly under `--parallel`. /// See [`xenia_cpu::ReservationTable`] for the concurrency model. pub reservations: std::sync::Arc, + /// True when the runtime was started with `--parallel`. Read by the + /// v-sync ticker (KRNBUG-D08): lockstep uses the deterministic + /// instruction-count proxy so the `sylpheed_n*m.json` goldens stay + /// bit-stable; `--parallel` uses wall-clock so the rate doesn't + /// drop to ~2 v-syncs / 100M as the instruction-count proxy did. + /// Set once at startup and never mutated. + pub parallel_active: bool, /// Map from `(module, ordinal)` to the guest-side import-thunk address /// resolved at load time. Reverse of `xenia-app/src/main.rs`'s /// `thunk_map`. Populated from xenia-app's Phase 1 (record_type==1 @@ -203,6 +210,7 @@ impl KernelState { cxx_throw_logged: false, ring_base: 0, ring_size_dwords: 0, + parallel_active: false, }; crate::exports::register_exports(&mut state); crate::xam::register_exports(&mut state);