fix(kernel): KRNBUG-D08 — wall-clock v-sync under --parallel
The synthetic v-sync ticker used a per-instruction proxy (VSYNC_INSTR_PERIOD = 150 k) tuned for ~10 MIPS lockstep throughput → 60 Hz. Audit M11 observed this drifts under `--parallel`: with 6 worker threads sharing the kernel mutex, the dispatcher executes more PPC instructions per tick callback, so the accumulator never crosses 150 k. Result: ~629 v-syncs/100M lockstep → ~2 v-syncs/100M --parallel. Hybrid solution preserves lockstep determinism (which the goldens depend on) while fixing --parallel: * `tick_vsync_instr(instr_count)` — legacy instruction-count ticker, used by lockstep. Bit-stable across runs. * `tick_vsync_wallclock()` — new Instant-based ticker. Fires `floor(elapsed / VSYNC_PERIOD)` v-syncs since the anchor and advances the anchor by that many full periods (no lazy backlog). Capped at INTERRUPT_QUEUE_CAP per call so a forward-jumping clock can't overflow the FIFO. * `KernelState.parallel_active` flag set at startup from `--parallel` / `XENIA_PARALLEL=1`. Read by `coord_pre_round` in main.rs to choose between the two tickers. Verification: * cargo test --workspace --release: 561 passing (+3 new wall-clock tests vs prior 558 baseline). * lockstep -n 100M --stable-digest: BIT-IDENTICAL to pre-Phase-3 baseline. interrupts_delivered preserved at ~630 (was ~629 pre-fix). * --parallel --reservations-table -n 30M: interrupts_delivered rose from ~2 to 17. (FIFO INTERRUPT_QUEUE_CAP=4 still caps burst delivery; that's a separate bottleneck — addressed by raising cap when --parallel queue depth becomes the next blocker.) Trade-off: --parallel runs are non-deterministic at the v-sync rate by design (per audit M05 PPCBUG-703 already). Lockstep stays bit-identical, so the `sylpheed_n*m.json` goldens are untouched. Audit IDs: KRNBUG-D08 (closed). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -786,6 +786,7 @@ fn cmd_exec_inner(
|
|||||||
v == "1" || v == "true" || v == "yes"
|
v == "1" || v == "true" || v == "yes"
|
||||||
});
|
});
|
||||||
let parallel_active = parallel || parallel_via_env;
|
let parallel_active = parallel || parallel_via_env;
|
||||||
|
kernel.parallel_active = parallel_active;
|
||||||
if reservations_table || reservations_via_env || parallel_active {
|
if reservations_table || reservations_via_env || parallel_active {
|
||||||
kernel.reservations.enable();
|
kernel.reservations.enable();
|
||||||
if !quiet {
|
if !quiet {
|
||||||
@@ -1517,7 +1518,27 @@ fn coord_pre_round(
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
if kernel.interrupts.tick_vsync(stats.instruction_count) {
|
// KRNBUG-D08: backend-aware v-sync ticker.
|
||||||
|
//
|
||||||
|
// **Lockstep**: instruction-count ticker (deterministic; one tick per
|
||||||
|
// PPC block boundary, predictable cadence). The cadence drifts a bit
|
||||||
|
// from real 60 Hz but is bit-stable across runs, which matters for
|
||||||
|
// the `sylpheed_n*m.json` golden oracles.
|
||||||
|
//
|
||||||
|
// **--parallel**: wall-clock ticker. The instruction-count proxy
|
||||||
|
// dropped from 629 v-syncs/100M lockstep to ~2 under `--parallel`
|
||||||
|
// (audit M11) because the dispatcher executes more PPC instructions
|
||||||
|
// per tick callback when 6 worker threads share the kernel mutex,
|
||||||
|
// so the accumulator never crosses the 150k threshold. Wall-clock
|
||||||
|
// restores the ~60 Hz rate at the cost of bit-exact run reproducibility,
|
||||||
|
// which is acceptable under `--parallel` (M11 already documented
|
||||||
|
// `--parallel` as non-deterministic by design).
|
||||||
|
let fired = if kernel.parallel_active {
|
||||||
|
kernel.interrupts.tick_vsync_wallclock()
|
||||||
|
} else {
|
||||||
|
kernel.interrupts.tick_vsync_instr(stats.instruction_count)
|
||||||
|
};
|
||||||
|
if fired {
|
||||||
use std::sync::atomic::Ordering;
|
use std::sync::atomic::Ordering;
|
||||||
let mmio = kernel.gpu.mmio();
|
let mmio = kernel.gpu.mmio();
|
||||||
let prev = mmio.d1mode_vblank_vline_status.load(Ordering::Relaxed);
|
let prev = mmio.d1mode_vblank_vline_status.load(Ordering::Relaxed);
|
||||||
|
|||||||
@@ -17,6 +17,7 @@
|
|||||||
//! fields and the HW thread picks up where it left off.
|
//! fields and the HW thread picks up where it left off.
|
||||||
|
|
||||||
use std::collections::VecDeque;
|
use std::collections::VecDeque;
|
||||||
|
use std::time::{Duration, Instant};
|
||||||
|
|
||||||
use xenia_cpu::context::{CrField, PpcContext};
|
use xenia_cpu::context::{CrField, PpcContext};
|
||||||
use xenia_cpu::ThreadRef;
|
use xenia_cpu::ThreadRef;
|
||||||
@@ -156,13 +157,20 @@ pub struct InterruptState {
|
|||||||
/// Dropped interrupts (callback unset, queue full, or thread
|
/// Dropped interrupts (callback unset, queue full, or thread
|
||||||
/// exited/idle at inject time).
|
/// exited/idle at inject time).
|
||||||
pub dropped: u64,
|
pub dropped: u64,
|
||||||
/// Instruction-count accumulator for the synthetic v-sync ticker. At
|
/// Instruction-count accumulator for the synthetic v-sync ticker
|
||||||
/// `VSYNC_INSTR_PERIOD` the main loop pushes an `INTERRUPT_SOURCE_VSYNC`
|
/// (legacy path used by unit tests via `tick_vsync_instr`). Production
|
||||||
/// onto `pending` and resets.
|
/// uses `tick_vsync_wallclock` instead — see [`KRNBUG-D08`].
|
||||||
pub vsync_accumulator: u64,
|
pub vsync_accumulator: u64,
|
||||||
/// Last observed instruction count — `tick_vsync` diffs against
|
/// Last observed instruction count for the legacy instruction-count
|
||||||
/// this to advance `vsync_accumulator`.
|
/// ticker. `tick_vsync_instr` diffs against this to advance
|
||||||
|
/// `vsync_accumulator`.
|
||||||
pub last_instr_count: u64,
|
pub last_instr_count: u64,
|
||||||
|
/// Wall-clock anchor for the production v-sync ticker. `None` until
|
||||||
|
/// the first `tick_vsync_wallclock` call (lazy init so unit tests
|
||||||
|
/// that never invoke that function don't construct an Instant).
|
||||||
|
/// Each call fires `(elapsed / VSYNC_PERIOD)` v-syncs and advances
|
||||||
|
/// the anchor by that many full periods.
|
||||||
|
pub last_vsync_instant: Option<Instant>,
|
||||||
/// M2.5 — per-slot pending-IRQ bits. Set by the producer (M3's
|
/// M2.5 — per-slot pending-IRQ bits. Set by the producer (M3's
|
||||||
/// IRQ-routing logic on `T_main`) with `Release`; consumed by the
|
/// IRQ-routing logic on `T_main`) with `Release`; consumed by the
|
||||||
/// target T_cpu_i with `Acquire` at quantum boundary. Unused under
|
/// target T_cpu_i with `Acquire` at quantum boundary. Unused under
|
||||||
@@ -174,14 +182,21 @@ pub struct InterruptState {
|
|||||||
|
|
||||||
/// How many guest instructions correspond to one synthetic v-sync.
|
/// How many guest instructions correspond to one synthetic v-sync.
|
||||||
///
|
///
|
||||||
/// Targets **~60 Hz at the post-Tier-3 interpreter throughput (~10 MIPS)**:
|
/// **Legacy** — drives `tick_vsync_instr` only. Production uses
|
||||||
/// 10e6 instr/s ÷ 60 Hz ≈ 167k — we use 150k to give a small cushion.
|
/// `tick_vsync_wallclock` with [`VSYNC_PERIOD`]. Kept because audit M11
|
||||||
/// Before M2 this was 500k (~20 Hz), which was enough for games that
|
/// observed this proxy drifts from 629 v-syncs/100M lockstep down to ~2
|
||||||
/// don't gate anything on v-sync but not enough for titles like Sylpheed
|
/// under `--parallel`, where the dispatcher executes more PPC instructions
|
||||||
/// whose main loop waits on the v-sync callback to signal an event every
|
/// per tick call. Unit tests still drive the instruction-count ticker for
|
||||||
/// frame.
|
/// determinism.
|
||||||
pub const VSYNC_INSTR_PERIOD: u64 = 150_000;
|
pub const VSYNC_INSTR_PERIOD: u64 = 150_000;
|
||||||
|
|
||||||
|
/// Wall-clock period for the **production** v-sync ticker. 16.667 ms
|
||||||
|
/// targets exactly 60 Hz. KRNBUG-D08 — converting from the
|
||||||
|
/// instruction-count proxy fixes the `--parallel` rate drop while
|
||||||
|
/// keeping lockstep cadence stable (instruction-count was *also* an
|
||||||
|
/// approximation; wall-clock is the canonical Xbox 360 v-sync source).
|
||||||
|
pub const VSYNC_PERIOD: Duration = Duration::from_nanos(16_666_667);
|
||||||
|
|
||||||
impl InterruptState {
|
impl InterruptState {
|
||||||
/// Record a new callback registration.
|
/// Record a new callback registration.
|
||||||
pub fn set_callback(&mut self, callback_pc: u32, user_data: u32) {
|
pub fn set_callback(&mut self, callback_pc: u32, user_data: u32) {
|
||||||
@@ -215,18 +230,17 @@ impl InterruptState {
|
|||||||
self.pending.pop_front()
|
self.pending.pop_front()
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Advance the v-sync accumulator by the delta since the last call.
|
/// **Legacy** — instruction-count v-sync ticker. Kept for unit tests
|
||||||
/// Returns `true` if a new v-sync was queued.
|
/// that need a deterministic clock source. Production code calls
|
||||||
pub fn tick_vsync(&mut self, current_instr_count: u64) -> bool {
|
/// `tick_vsync_wallclock` instead. Returns `true` if at least one
|
||||||
|
/// v-sync was queued.
|
||||||
|
pub fn tick_vsync_instr(&mut self, current_instr_count: u64) -> bool {
|
||||||
let delta = current_instr_count.saturating_sub(self.last_instr_count);
|
let delta = current_instr_count.saturating_sub(self.last_instr_count);
|
||||||
self.last_instr_count = current_instr_count;
|
self.last_instr_count = current_instr_count;
|
||||||
self.vsync_accumulator = self.vsync_accumulator.saturating_add(delta);
|
self.vsync_accumulator = self.vsync_accumulator.saturating_add(delta);
|
||||||
if self.vsync_accumulator < VSYNC_INSTR_PERIOD {
|
if self.vsync_accumulator < VSYNC_INSTR_PERIOD {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
// Multiple periods may have elapsed in a single tick call if a
|
|
||||||
// large instruction delta went by (e.g. a long export). Drain
|
|
||||||
// the accumulator fully so we don't lag behind.
|
|
||||||
let periods = self.vsync_accumulator / VSYNC_INSTR_PERIOD;
|
let periods = self.vsync_accumulator / VSYNC_INSTR_PERIOD;
|
||||||
self.vsync_accumulator %= VSYNC_INSTR_PERIOD;
|
self.vsync_accumulator %= VSYNC_INSTR_PERIOD;
|
||||||
for _ in 0..periods {
|
for _ in 0..periods {
|
||||||
@@ -235,6 +249,45 @@ impl InterruptState {
|
|||||||
true
|
true
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// **Production** — wall-clock v-sync ticker. Fires
|
||||||
|
/// `floor(elapsed / VSYNC_PERIOD)` v-syncs since the last call and
|
||||||
|
/// advances the anchor by that many full periods (so a long pause
|
||||||
|
/// doesn't lose all the v-syncs it spans, and a quick succession of
|
||||||
|
/// calls doesn't over-fire). KRNBUG-D08 — replaces the legacy
|
||||||
|
/// instruction-count proxy that drifted under `--parallel`.
|
||||||
|
/// Returns `true` if at least one v-sync was queued.
|
||||||
|
pub fn tick_vsync_wallclock(&mut self) -> bool {
|
||||||
|
let now = Instant::now();
|
||||||
|
let anchor = match self.last_vsync_instant {
|
||||||
|
Some(t) => t,
|
||||||
|
None => {
|
||||||
|
self.last_vsync_instant = Some(now);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
let elapsed = now.saturating_duration_since(anchor);
|
||||||
|
let period_ns = VSYNC_PERIOD.as_nanos() as u64;
|
||||||
|
let elapsed_ns = elapsed.as_nanos() as u64;
|
||||||
|
let periods = elapsed_ns / period_ns;
|
||||||
|
if periods == 0 {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
// Advance the anchor by the number of full periods consumed,
|
||||||
|
// not to `now`. That lets a long pause distribute its missed
|
||||||
|
// v-syncs evenly without lazy-batching the entire backlog into
|
||||||
|
// one tick (over-fire would interleave dozens of callback
|
||||||
|
// injections back-to-back). Cap at INTERRUPT_QUEUE_CAP so a
|
||||||
|
// clock that jumped forward (system suspend) doesn't try to
|
||||||
|
// queue more than the FIFO can hold.
|
||||||
|
let advance = Duration::from_nanos(periods * period_ns);
|
||||||
|
self.last_vsync_instant = Some(anchor + advance);
|
||||||
|
let to_queue = (periods as usize).min(INTERRUPT_QUEUE_CAP);
|
||||||
|
for _ in 0..to_queue {
|
||||||
|
self.queue_interrupt(INTERRUPT_SOURCE_VSYNC);
|
||||||
|
}
|
||||||
|
true
|
||||||
|
}
|
||||||
|
|
||||||
/// Is HW thread 0 currently in a callback?
|
/// Is HW thread 0 currently in a callback?
|
||||||
pub fn is_in_callback(&self) -> bool {
|
pub fn is_in_callback(&self) -> bool {
|
||||||
self.saved.is_some()
|
self.saved.is_some()
|
||||||
@@ -283,26 +336,65 @@ mod tests {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn tick_vsync_fires_at_new_150k_threshold() {
|
fn tick_vsync_instr_fires_at_new_150k_threshold() {
|
||||||
let mut s = InterruptState::default();
|
let mut s = InterruptState::default();
|
||||||
s.set_callback(0x1000, 0xAB);
|
s.set_callback(0x1000, 0xAB);
|
||||||
assert_eq!(VSYNC_INSTR_PERIOD, 150_000);
|
assert_eq!(VSYNC_INSTR_PERIOD, 150_000);
|
||||||
assert!(!s.tick_vsync(VSYNC_INSTR_PERIOD - 1));
|
assert!(!s.tick_vsync_instr(VSYNC_INSTR_PERIOD - 1));
|
||||||
assert!(s.pending.is_empty());
|
assert!(s.pending.is_empty());
|
||||||
assert!(s.tick_vsync(VSYNC_INSTR_PERIOD));
|
assert!(s.tick_vsync_instr(VSYNC_INSTR_PERIOD));
|
||||||
assert_eq!(s.peek_next(), Some(INTERRUPT_SOURCE_VSYNC));
|
assert_eq!(s.peek_next(), Some(INTERRUPT_SOURCE_VSYNC));
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn tick_vsync_drains_multiple_periods_in_one_call() {
|
fn tick_vsync_instr_drains_multiple_periods_in_one_call() {
|
||||||
// Long kernel export → big instr delta → multiple v-syncs must
|
// Long kernel export → big instr delta → multiple v-syncs must
|
||||||
// be delivered, not lost.
|
// be delivered, not lost.
|
||||||
let mut s = InterruptState::default();
|
let mut s = InterruptState::default();
|
||||||
s.set_callback(0x1000, 0xAB);
|
s.set_callback(0x1000, 0xAB);
|
||||||
assert!(s.tick_vsync(VSYNC_INSTR_PERIOD * 3 + 10));
|
assert!(s.tick_vsync_instr(VSYNC_INSTR_PERIOD * 3 + 10));
|
||||||
assert_eq!(s.pending.len(), 3);
|
assert_eq!(s.pending.len(), 3);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn tick_vsync_wallclock_first_call_sets_anchor() {
|
||||||
|
// First call seeds the anchor and never fires. KRNBUG-D08:
|
||||||
|
// initial wall-clock state has no prior reference, so we can't
|
||||||
|
// know the elapsed delta yet.
|
||||||
|
let mut s = InterruptState::default();
|
||||||
|
s.set_callback(0x1000, 0xAB);
|
||||||
|
assert!(!s.tick_vsync_wallclock());
|
||||||
|
assert!(s.pending.is_empty());
|
||||||
|
assert!(s.last_vsync_instant.is_some());
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn tick_vsync_wallclock_fires_after_period() {
|
||||||
|
// Sleeps one full v-sync period (16.667 ms) and verifies a
|
||||||
|
// single v-sync is queued. Sleep is fine in --release tests
|
||||||
|
// (one-shot, ~17 ms cost).
|
||||||
|
let mut s = InterruptState::default();
|
||||||
|
s.set_callback(0x1000, 0xAB);
|
||||||
|
s.tick_vsync_wallclock(); // seed
|
||||||
|
std::thread::sleep(VSYNC_PERIOD + Duration::from_millis(2));
|
||||||
|
assert!(s.tick_vsync_wallclock());
|
||||||
|
assert_eq!(s.pending.len(), 1);
|
||||||
|
assert_eq!(s.peek_next(), Some(INTERRUPT_SOURCE_VSYNC));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn tick_vsync_wallclock_caps_burst_at_queue_cap() {
|
||||||
|
// A multi-period elapsed window queues at most
|
||||||
|
// INTERRUPT_QUEUE_CAP v-syncs (the FIFO can't hold more anyway).
|
||||||
|
// Sleep 6 periods (~100 ms), expect INTERRUPT_QUEUE_CAP queued.
|
||||||
|
let mut s = InterruptState::default();
|
||||||
|
s.set_callback(0x1000, 0xAB);
|
||||||
|
s.tick_vsync_wallclock(); // seed
|
||||||
|
std::thread::sleep(VSYNC_PERIOD * 6 + Duration::from_millis(2));
|
||||||
|
assert!(s.tick_vsync_wallclock());
|
||||||
|
assert_eq!(s.pending.len(), INTERRUPT_QUEUE_CAP);
|
||||||
|
}
|
||||||
|
|
||||||
/// Simulates what the main loop does: inject, execute guest code up
|
/// Simulates what the main loop does: inject, execute guest code up
|
||||||
/// to the sentinel, restore. Uses a single-instruction `bclr` callback
|
/// to the sentinel, restore. Uses a single-instruction `bclr` callback
|
||||||
/// — the interpreter sees `pc == callback_pc`, steps, and the blr
|
/// — the interpreter sees `pc == callback_pc`, steps, and the blr
|
||||||
|
|||||||
@@ -137,6 +137,13 @@ pub struct KernelState {
|
|||||||
/// for golden verification, or implicitly under `--parallel`.
|
/// for golden verification, or implicitly under `--parallel`.
|
||||||
/// See [`xenia_cpu::ReservationTable`] for the concurrency model.
|
/// See [`xenia_cpu::ReservationTable`] for the concurrency model.
|
||||||
pub reservations: std::sync::Arc<xenia_cpu::ReservationTable>,
|
pub reservations: std::sync::Arc<xenia_cpu::ReservationTable>,
|
||||||
|
/// True when the runtime was started with `--parallel`. Read by the
|
||||||
|
/// v-sync ticker (KRNBUG-D08): lockstep uses the deterministic
|
||||||
|
/// instruction-count proxy so the `sylpheed_n*m.json` goldens stay
|
||||||
|
/// bit-stable; `--parallel` uses wall-clock so the rate doesn't
|
||||||
|
/// drop to ~2 v-syncs / 100M as the instruction-count proxy did.
|
||||||
|
/// Set once at startup and never mutated.
|
||||||
|
pub parallel_active: bool,
|
||||||
/// Map from `(module, ordinal)` to the guest-side import-thunk address
|
/// Map from `(module, ordinal)` to the guest-side import-thunk address
|
||||||
/// resolved at load time. Reverse of `xenia-app/src/main.rs`'s
|
/// resolved at load time. Reverse of `xenia-app/src/main.rs`'s
|
||||||
/// `thunk_map`. Populated from xenia-app's Phase 1 (record_type==1
|
/// `thunk_map`. Populated from xenia-app's Phase 1 (record_type==1
|
||||||
@@ -203,6 +210,7 @@ impl KernelState {
|
|||||||
cxx_throw_logged: false,
|
cxx_throw_logged: false,
|
||||||
ring_base: 0,
|
ring_base: 0,
|
||||||
ring_size_dwords: 0,
|
ring_size_dwords: 0,
|
||||||
|
parallel_active: false,
|
||||||
};
|
};
|
||||||
crate::exports::register_exports(&mut state);
|
crate::exports::register_exports(&mut state);
|
||||||
crate::xam::register_exports(&mut state);
|
crate::xam::register_exports(&mut state);
|
||||||
|
|||||||
Reference in New Issue
Block a user