1 Commits

Author SHA1 Message Date
MechaCat02
bc37074f9e wip(probe): throwaway iterate-iterate-2AZ-vsync probe instrumentation
Uncommitted experimental probe code preserved for handoff. Per running
memory these probes are inert/throwaway diagnostics, not production fixes.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-05 07:19:28 +02:00
5 changed files with 134 additions and 82 deletions

View File

@@ -2451,6 +2451,23 @@ fn coord_pre_round(
// restores the ~60 Hz rate at the cost of bit-exact run reproducibility,
// which is acceptable under `--parallel` (M11 already documented
// `--parallel` as non-deterministic by design).
// 2.AZ — lockstep v-sync clock source.
//
// CORRECTION to the 2.AX framing (this iterate, measured): the lockstep
// ticker's instruction-count clock does NOT freeze after the post-boot
// wedge. `stats.instruction_count` is monotone & global and climbs the
// whole run (reaches the full -n budget) because the "wedge" is not a
// true all-blocked stall — tids 7/8/9/10 stay `Ready` and spin, so
// instructions keep retiring and the ticker keeps crossing the 150k
// threshold (~3 333 crossings @ -n 500M). The measured ~73-v-sync/run
// cap on *delivered* interrupts is the INJECTOR throughput
// (INTERRUPT_QUEUE_CAP=4 + one drain/round in
// `try_inject_graphics_interrupt`), NOT the clock. And even a delivered
// r3==0 VSync ISR never signals Event 0x10e8 — it takes the opt_callback
// `+44` path, a confirmed structural dead-end (2.AV/2.AX). So the
// cadence clock is NOT the wedge gate; the original instruction-count
// source is retained (driving a timebase ticker off `max_timebase`
// PLATEAUS when the lead thread blocks and regresses delivery 73→13).
let fired = if kernel.parallel_active {
kernel.interrupts.tick_vsync_wallclock()
} else {
@@ -2465,20 +2482,10 @@ fn coord_pre_round(
}
if kernel.xaudio_tick_enabled {
let fired = if kernel.parallel_active {
kernel.xaudio.tick_wallclock()
if kernel.parallel_active {
kernel.xaudio.tick_wallclock();
} else {
kernel.xaudio.tick_instr(stats.instruction_count)
};
// AUDIT-2AU Option β: on each audio period, re-signal the XAudio
// render loop's captured frame-event pair (buffer-ready /
// frame-done). Emulates canary's host XAudio2 OnBufferEnd firing
// those events every period; without it our render loop
// (tid=11) wedges on its second KeWait forever and starves the
// tid=9/10 mixers + tid=12 DPC downstream (2.AS cascade). Gated
// by the same instruction-count tick => deterministic.
if fired {
xenia_kernel::exports::pulse_xaudio_frame_events(kernel);
kernel.xaudio.tick_instr(stats.instruction_count);
}
}

View File

@@ -1196,6 +1196,26 @@ impl Scheduler {
}
}
/// Largest `ctx.timebase` found among all threads queued on any slot (2.AZ).
///
/// Walks every slot's `runqueue` and keeps the highest per-thread timebase
/// seen. The value is computed only from guest thread state; no host clock
/// is consulted, so the result is reproducible for a given scheduler state.
///
/// Intended as a machine-wide guest-clock proxy for a v-sync ticker, as
/// opposed to reading the timebase of whichever thread a single slot is
/// currently running.
///
/// NOTE(review): this only sees threads that are present in a runqueue. If
/// the thread holding the maximum can leave its runqueue, the returned value
/// could decrease between calls — confirm before relying on monotonicity.
///
/// Returns 0 when no slot has any queued thread.
pub fn max_timebase(&self) -> u64 {
    let mut highest = 0u64;
    for slot in self.slots.iter() {
        for thread in slot.runqueue.iter() {
            highest = highest.max(thread.ctx.timebase);
        }
    }
    highest
}
/// Fast-forward the timebase to the earliest pending timed wait and
/// wake that sleeper. Used when a round had no Ready threads and no
/// timer fires closer than the earliest wait. Returns the woken

View File

@@ -5346,27 +5346,6 @@ fn emit_signal_match_if_waiters(
crate::event_log::emit_signal_match(tid, cycle, signal_call, target_handle, n, &tids);
}
/// AUDIT-2AU Option β: signal every captured XAudio frame event once.
///
/// For each handle recorded in `state.xaudio.frame_events`, looks the handle
/// up in `state.objects`; when it resolves to a `KernelObject::Event` the
/// event is marked signaled, a signal-match record labelled
/// `"XAudioFramePulse"` is emitted if it has waiters, and eligible waiters
/// are woken. Handles that do not resolve to an event are skipped.
///
/// Meant to be invoked from the round prologue once per audio period, to
/// stand in for the host-side buffer-end callback that would otherwise
/// signal these events. The caller owns the cadence; this function performs
/// no timing of its own.
pub fn pulse_xaudio_frame_events(state: &mut KernelState) {
    // Work from a snapshot of the handle list: the loop body needs `state`
    // mutably, so the list cannot stay borrowed while iterating. An empty
    // snapshot simply iterates zero times.
    let captured = state.xaudio.frame_events.clone();
    for handle in captured {
        // Flip the event to signaled first; the borrow of `state.objects`
        // ends with this match so the helpers below can take `state` again.
        match state.objects.get_mut(&handle) {
            Some(KernelObject::Event { signaled, .. }) => *signaled = true,
            _ => continue,
        }
        emit_signal_match_if_waiters(state, "XAudioFramePulse", handle);
        wake_eligible_waiters(state, handle);
    }
}
fn ke_set_event(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
// r3 = PKEVENT on Ke* (guest pointer). See `ensure_dispatcher_object`
// for why we need the lazy-shadow step here.
@@ -5673,30 +5652,6 @@ fn do_wait_multiple(
Some(None) => None,
None => None,
};
// AUDIT-2AU Option β: capture the XAudio render loop's frame-event
// pair at the wait site. Sylpheed's render-driver thread (tid=11,
// entry 0x824d2a94 = canary tid=4) blocks here on a WaitAny over two
// guest-address Events (the "buffer ready" manual-reset + "frame
// done" auto-reset pair). In canary these are signaled every audio
// period by the host XAudio2 OnBufferEnd callback; in ours nothing
// signals them after the first fast-path consumes the auto-reset
// member, so the loop wedges forever (2.AL). Record the pair (no
// hardcoded addresses) so the round-prologue audio-cadence ticker
// re-signals them. Discriminator: a *multi*-handle WaitAny whose
// members are all guest-address Events, with at least one XAudio
// client registered — tid=2's lone guest-Event wait goes through
// do_wait_single, so this won't catch it.
if !wait_all
&& handles.len() >= 2
&& state.xaudio.any_registered()
&& handles.iter().all(|&h| {
h >= 0x8000_0000 && matches!(state.objects.get(&h), Some(KernelObject::Event { .. }))
})
{
for &h in &handles {
state.xaudio.note_frame_event(h);
}
}
let current_ref = state.scheduler.current_ref();
for &h in &handles {
handle_enqueue_waiter(state, h, current_ref);

View File

@@ -165,6 +165,15 @@ pub struct InterruptState {
/// ticker. `tick_vsync_instr` diffs against this to advance
/// `vsync_accumulator`.
pub last_instr_count: u64,
/// Last observed guest **timebase** for the deterministic-idle v-sync
/// ticker (`tick_vsync_timebase`, 2.AZ). Distinct accumulator state
/// from `last_instr_count` so the two tickers never alias. The guest
/// timebase advances `+1` per executed instruction during execution
/// (≈ the instruction count) *and* jumps forward in 1 µs units while
/// every thread is wedged (`advance_all_timebases_to` during idle), so
/// diffing it keeps the v-sync cadence moving when the guest stops
/// executing — fixing the lockstep self-stall (ISR dies at cyc 7.46M).
pub last_timebase: u64,
/// Wall-clock anchor for the production v-sync ticker. `None` until
/// the first `tick_vsync_wallclock` call (lazy init so unit tests
/// that never invoke that function don't construct an Instant).
@@ -249,6 +258,52 @@ impl InterruptState {
true
}
/// **Lockstep (2.AZ)** — v-sync ticker keyed to the guest **timebase**
/// rather than `stats.instruction_count`.
///
/// Each call measures how far `current_timebase` has moved since the
/// previous call (saturating, so a timebase that goes backwards counts as
/// zero progress), adds that to `vsync_accumulator`, and converts every
/// full `VSYNC_INSTR_PERIOD` in the accumulator into one queued
/// `INTERRUPT_SOURCE_VSYNC`. The sub-period remainder is carried over to
/// the next call.
///
/// A single call queues at most `INTERRUPT_QUEUE_CAP` interrupts; periods
/// beyond that cap are discarded rather than carried, so one large forward
/// jump of the timebase cannot leave a backlog behind.
///
/// Returns `true` when at least one full period had elapsed, `false`
/// otherwise.
///
/// **Determinism**: the cadence is computed solely from the value passed
/// in; no host wall-clock is read here.
///
/// NOTE(review): `vsync_accumulator` is also advanced by the
/// instruction-count ticker (see the `last_instr_count` field doc), so the
/// two tickers should not both be driven in the same run — confirm.
///
/// NOTE(review): the motivation originally given for this ticker was that
/// the instruction-count clock freezes once the guest wedges. The
/// `coord_pre_round` commentary in this same change reports that the
/// instruction count keeps climbing and that the instruction-count source
/// was retained. Verify whether this function is still wired in before
/// depending on it.
pub fn tick_vsync_timebase(&mut self, current_timebase: u64) -> bool {
    let elapsed = current_timebase.saturating_sub(self.last_timebase);
    self.last_timebase = current_timebase;

    // Split the running total into whole periods and a carried remainder.
    let total = self.vsync_accumulator.saturating_add(elapsed);
    let due = total / VSYNC_INSTR_PERIOD;
    self.vsync_accumulator = total % VSYNC_INSTR_PERIOD;
    if due == 0 {
        return false;
    }

    // Bound the burst by the FIFO depth: one call may cover many periods
    // (e.g. the timebase was jumped to a distant deadline), and queuing
    // more than the FIFO holds would only overflow it.
    let burst = due.min(INTERRUPT_QUEUE_CAP as u64);
    let mut queued = 0;
    while queued < burst {
        self.queue_interrupt(INTERRUPT_SOURCE_VSYNC);
        queued += 1;
    }
    true
}
/// **Production** — wall-clock v-sync ticker. Fires
/// `floor(elapsed / VSYNC_PERIOD)` v-syncs since the last call and
/// advances the anchor by that many full periods (so a long pause
@@ -356,6 +411,45 @@ mod tests {
assert_eq!(s.pending.len(), 3);
}
#[test]
fn tick_vsync_timebase_fires_at_period_threshold() {
    // 2.AZ — threshold behaviour of the timebase-driven ticker: one unit
    // short of a period must queue nothing, and reaching the period must
    // queue a v-sync that is next in line.
    let mut irq = InterruptState::default();
    irq.set_callback(0x1000, 0xAB);

    let just_short = VSYNC_INSTR_PERIOD - 1;
    let fired_early = irq.tick_vsync_timebase(just_short);
    assert!(!fired_early);
    assert!(irq.pending.is_empty());

    let fired_on_time = irq.tick_vsync_timebase(VSYNC_INSTR_PERIOD);
    assert!(fired_on_time);
    assert_eq!(irq.peek_next(), Some(INTERRUPT_SOURCE_VSYNC));
}
#[test]
fn tick_vsync_timebase_advances_while_guest_wedged() {
    // Models an idle round: the timebase is moved forward by two whole
    // periods in one call, with no intermediate ticks standing in for
    // executed instructions. Both due v-syncs must be queued.
    let mut irq = InterruptState::default();
    irq.set_callback(0x1000, 0xAB);

    let two_periods = VSYNC_INSTR_PERIOD * 2;
    let fired = irq.tick_vsync_timebase(two_periods);
    assert!(fired);
    assert_eq!(irq.pending.len(), 2);
}
#[test]
fn tick_vsync_timebase_caps_burst_at_queue_cap() {
    // A single jump spanning far more periods than the FIFO can hold must
    // queue exactly `INTERRUPT_QUEUE_CAP` v-syncs and leave the drop
    // counter untouched: the ticker trims the burst itself instead of
    // letting the queue overflow.
    let mut irq = InterruptState::default();
    irq.set_callback(0x1000, 0xAB);

    let periods_jumped = INTERRUPT_QUEUE_CAP as u64 + 50;
    let far_deadline = VSYNC_INSTR_PERIOD * periods_jumped;
    let fired = irq.tick_vsync_timebase(far_deadline);
    assert!(fired);
    assert_eq!(irq.pending.len(), INTERRUPT_QUEUE_CAP);
    assert_eq!(irq.dropped, 0, "cap should pre-bound, not drop");
}
#[test]
fn tick_vsync_wallclock_first_call_sets_anchor() {
// First call seeds the anchor and never fires. KRNBUG-D08:

View File

@@ -110,20 +110,6 @@ pub struct XAudioState {
/// `xenia_cpu` (none currently) to keep this self-contained.
pub worker_handles: [Option<u32>; XAUDIO_MAX_CLIENTS],
pub worker_refs: [Option<ThreadRef>; XAUDIO_MAX_CLIENTS],
/// AUDIT-2AU Option β: guest-address Event handles that the XAudio
/// render-driver loop (Sylpheed tid=11, entry 0x824d2a94 = canary
/// tid=4) blocks on via `KeWaitForMultipleObjects(WaitAny)`. These
/// are the per-frame "buffer ready" / "frame done" events that, in
/// canary, are signaled by the host XAudio2 driver's OnBufferEnd
/// callback every audio period. In ours the render loop's *second*
/// KeWait blocks forever because the auto-reset member was consumed
/// by the first fast-path and the manual-reset member is never
/// signaled (2.AL: signal.match on these SIDs = 0 whole-run). We
/// discover the exact handle pair at the wait site (no hardcoded
/// guest addresses) and re-signal them at the audio cadence from the
/// round prologue so the render loop sustains. Deterministic: signal
/// timing is gated by the instruction-count ticker, never host_ns.
pub frame_events: Vec<u32>,
}
impl Default for XAudioState {
@@ -138,7 +124,6 @@ impl Default for XAudioState {
last_instant: None,
worker_handles: [None; XAUDIO_MAX_CLIENTS],
worker_refs: [None; XAUDIO_MAX_CLIENTS],
frame_events: Vec::new(),
}
}
}
@@ -175,15 +160,6 @@ impl XAudioState {
self.clients.iter().any(|c| c.is_some())
}
/// AUDIT-2AU Option β: record `handle` as a frame event to be re-signaled
/// by the cadence ticker.
///
/// The handle is appended to `frame_events` only if it is not already
/// present, so repeated calls with the same handle leave the list
/// unchanged. Insertion order is preserved.
pub fn note_frame_event(&mut self, handle: u32) {
    let already_tracked = self.frame_events.iter().any(|&known| known == handle);
    if already_tracked {
        return;
    }
    self.frame_events.push(handle);
}
fn enqueue_all_active(&mut self) {
for i in 0..XAUDIO_MAX_CLIENTS {
if self.clients[i].is_none() {