diag(audit): KRNBUG-AUDIT-001 — focused parked-waiter ghost-trail diagnostic
Adds a one-run diagnostic that distinguishes "guest never called
Nt/KeSetEvent on this handle" from "signal landed but waiter wasn't
woken", for any handle named via `--trace-handles-focus`.
Parked-waiter context (project_xenia_rs_sylpheed_stage3_2026_04_29):
four worker threads block Sylpheed past `draws=0` on handles
0x1004 / 0x100c / 0x15e4 / 0x42450b5c (mr=true, sig=false). The
pre-existing audit dropped signal-attempts that targeted handles
without a primary trail, so we couldn't tell whether the producer
was unreachable in the guest or whether the signal landed but missed
its waiter.
Three changes:
* audit.rs: `HandleAudit` gains `focus: HashSet<u32>` and
`ghost_trails: HashMap<u32, GhostTrail>`. `record_signal`
auto-falls-through to a new `record_signal_attempt_ghost` when no
primary trail exists AND the handle is in `focus`. Bounded by
AUDIT_RING_CAPACITY per handle. Two new tests cover the focus
ghost-trail and no-double-record invariants.
* main.rs: new `--trace-handles-focus=<LIST>` flag (hex 0x or decimal,
comma-separated) populates `kernel.audit.focus`. Implies
`--trace-handles`. New "=== Handle audit (focus) ===" section in
`dump_thread_diagnostic` emits per-handle:
- signal_attempts (primary + ghost), waits, wakes
- merged cycle-sorted timeline (last 16)
- GuestExport / KernelInternal classification
- <AUDIT_BLIND> marker when waiter_count > 0 but the audit
saw no waits (i.e. waiter parked via a non-audit path —
CS / spinlock / DPC).
- DIAGNOSIS conclusion that selects between five branches.
* `cmd_check` passes None for focus → goldens unaffected.
Empirical run output at -n 500M lockstep with
`--trace-handles-focus=0x1004,0x100c,0x15e4,0x42450b5c`:
handle=0x00001004 kind=Event/Manual waiters=1 signaled=false
signal_attempts=0 (primary=0, ghost=0)
waits=1 wakes=0
created cycle=0 tid=1 lr=0x824a9f6c src=NtCreateEvent
=> producer is a missing kernel signal source
(or BST-paradox upstream)
... (same shape for 0x100c, 0x15e4)
handle=0x42450b5c kind=<UNCREATED> waiters=1 signal_attempts=0
waits=0 wakes=0 <AUDIT_BLIND>
=> waiter parked via non-audited path
Conclusion: hypothesis (A) confirmed for all 4 handles. Producer is
NOT a wake/eligibility bug — it is a genuinely missing kernel signal
source. The 3 Event/Manual handles share a creator
(lr=0x824a9f6c, tid=1) and the same wait-call wrapper at
lr=0x824ac578 — these are 3 worker threads all parked on
"work-available" notifications that never come.
Verification:
* cargo test --workspace --release: 558 passing (+2 new ghost-trail
tests vs prior 556 baseline)
* lockstep -n 100M --stable-digest: bit-identical to master HEAD
Audit IDs: KRNBUG-AUDIT-001 (closed — diagnostic instrumentation).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -124,6 +124,15 @@ enum Commands {
|
||||
/// `XENIA_TRACE_HANDLES=1`.
|
||||
#[arg(long)]
|
||||
trace_handles: bool,
|
||||
/// Comma-separated list of handles to capture **signal attempts**
|
||||
/// for, even when no `record_create` exists. Distinguishes "guest
|
||||
/// never called Nt/KeSetEvent on this handle" (signal_attempts=0)
|
||||
/// from "signal landed but waiter wasn't woken" (>0). Implies
|
||||
/// `--trace-handles`. Examples:
|
||||
/// `--trace-handles-focus=0x1004,0x100c,0x15e4,0x42450b5c`
|
||||
/// `--trace-handles-focus=4100` (decimal; 0x1004)
|
||||
#[arg(long)]
|
||||
trace_handles_focus: Option<String>,
|
||||
/// Force the threaded GPU backend (default at M1.9). Kept for
|
||||
/// explicit scripted runs; equivalent to omitting both
|
||||
/// `--gpu-inline` and `--ui`. Settable via `XENIA_GPU_THREAD=1`.
|
||||
@@ -301,6 +310,7 @@ fn main() -> Result<()> {
|
||||
ui,
|
||||
halt_on_deadlock,
|
||||
trace_handles,
|
||||
trace_handles_focus,
|
||||
gpu_thread,
|
||||
gpu_inline,
|
||||
reservations_table,
|
||||
@@ -317,6 +327,7 @@ fn main() -> Result<()> {
|
||||
ui,
|
||||
halt_on_deadlock,
|
||||
trace_handles,
|
||||
trace_handles_focus.as_deref(),
|
||||
gpu_thread,
|
||||
gpu_inline,
|
||||
reservations_table,
|
||||
@@ -514,6 +525,7 @@ fn cmd_exec(
|
||||
ui: bool,
|
||||
halt_on_deadlock: bool,
|
||||
trace_handles: bool,
|
||||
trace_handles_focus: Option<&str>,
|
||||
gpu_thread: bool,
|
||||
gpu_inline: bool,
|
||||
reservations_table: bool,
|
||||
@@ -531,6 +543,7 @@ fn cmd_exec(
|
||||
ui,
|
||||
halt_on_deadlock,
|
||||
trace_handles,
|
||||
trace_handles_focus,
|
||||
gpu_thread,
|
||||
gpu_inline,
|
||||
reservations_table,
|
||||
@@ -566,6 +579,7 @@ fn cmd_check(
|
||||
false, // ui — headless
|
||||
false, // halt_on_deadlock — keep the force-wake default for probe/check runs
|
||||
false, // trace_handles — diagnostic only, never wanted on golden runs
|
||||
None, // trace_handles_focus — same
|
||||
gpu_thread,
|
||||
gpu_inline,
|
||||
reservations_table,
|
||||
@@ -588,6 +602,7 @@ fn cmd_exec_inner(
|
||||
ui: bool,
|
||||
halt_on_deadlock: bool,
|
||||
trace_handles: bool,
|
||||
trace_handles_focus: Option<&str>,
|
||||
gpu_thread: bool,
|
||||
gpu_inline: bool,
|
||||
reservations_table: bool,
|
||||
@@ -782,12 +797,50 @@ fn cmd_exec_inner(
|
||||
let audit_via_env = std::env::var("XENIA_TRACE_HANDLES")
|
||||
.map(|v| matches!(v.as_str(), "1" | "true" | "TRUE" | "True"))
|
||||
.unwrap_or(false);
|
||||
if trace_handles || audit_via_env {
|
||||
let trace_handles_active = trace_handles || audit_via_env || trace_handles_focus.is_some();
|
||||
if trace_handles_active {
|
||||
kernel.audit.enabled = true;
|
||||
if !quiet {
|
||||
tracing::info!("handle audit enabled (--trace-handles)");
|
||||
}
|
||||
}
|
||||
// KRNBUG-AUDIT-001: parse `--trace-handles-focus=0x1004,0x100c,...`
|
||||
// into the audit focus set. When non-empty, signal attempts on these
|
||||
// handles are captured into ghost trails even if no `record_create`
|
||||
// exists for them — the data-driven hypothesis fail-fast for
|
||||
// parked-waiter producer-trace.
|
||||
if let Some(list) = trace_handles_focus {
|
||||
for token in list.split(',').map(str::trim).filter(|s| !s.is_empty()) {
|
||||
let parsed = if let Some(hex) = token.strip_prefix("0x").or_else(|| token.strip_prefix("0X")) {
|
||||
u32::from_str_radix(hex, 16)
|
||||
} else {
|
||||
token.parse::<u32>()
|
||||
};
|
||||
match parsed {
|
||||
Ok(h) => {
|
||||
kernel.audit.focus.insert(h);
|
||||
}
|
||||
Err(_) => {
|
||||
return Err(anyhow::anyhow!(
|
||||
"invalid handle in --trace-handles-focus: {token:?}"
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
if !quiet && !kernel.audit.focus.is_empty() {
|
||||
let focused: Vec<String> = kernel
|
||||
.audit
|
||||
.focus
|
||||
.iter()
|
||||
.map(|h| format!("{h:#010x}"))
|
||||
.collect();
|
||||
tracing::info!(
|
||||
"handle audit focus: {} ({})",
|
||||
kernel.audit.focus.len(),
|
||||
focused.join(", ")
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// Install the GPU register aperture MMIO region on the guest memory so
|
||||
// any `0x7FC8xxxx` access routes to our atomic mailbox. Matches canary's
|
||||
@@ -2936,6 +2989,153 @@ fn dump_thread_diagnostic(kernel: &xenia_kernel::KernelState, quiet: bool) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// KRNBUG-AUDIT-001: per-handle FOCUS report. For each handle in
|
||||
// `--trace-handles-focus`, emit a unified diagnosis: how many
|
||||
// signal attempts (primary + ghost) we saw, classified by
|
||||
// producer-class (GuestExport vs KernelInternal vs
|
||||
// InterruptCallback), so we can deterministically distinguish:
|
||||
// - "guest never tried" → producer is a missing kernel signal source
|
||||
// - "signal landed but waiter wasn't woken" → wake-eligibility bug
|
||||
if !kernel.audit.focus.is_empty() {
|
||||
println!("\n=== Handle audit (focus) ===");
|
||||
let mut focus: Vec<u32> = kernel.audit.focus.iter().copied().collect();
|
||||
focus.sort();
|
||||
for h in focus {
|
||||
let waiter_count = kernel
|
||||
.objects
|
||||
.get(&h)
|
||||
.and_then(|o| match o {
|
||||
KernelObject::Event { waiters, .. }
|
||||
| KernelObject::Semaphore { waiters, .. }
|
||||
| KernelObject::Thread { waiters, .. }
|
||||
| KernelObject::Timer { waiters, .. }
|
||||
| KernelObject::Mutex { waiters, .. } => Some(waiters.len()),
|
||||
KernelObject::File { .. } => None,
|
||||
})
|
||||
.unwrap_or(0);
|
||||
let signaled = kernel
|
||||
.objects
|
||||
.get(&h)
|
||||
.and_then(|o| match o {
|
||||
KernelObject::Event { signaled, .. } => Some(*signaled),
|
||||
_ => None,
|
||||
});
|
||||
let primary = kernel.audit.trails.get(&h);
|
||||
let ghost = kernel.audit.ghost_trails.get(&h);
|
||||
let kind_str = primary.map(|t| t.kind).unwrap_or("<UNCREATED>");
|
||||
let primary_signals = primary.map(|t| t.signals.len()).unwrap_or(0);
|
||||
let ghost_signals = ghost.map(|g| g.signals.len()).unwrap_or(0);
|
||||
let total_signals = primary_signals + ghost_signals;
|
||||
let primary_waits = primary.map(|t| t.waits.len()).unwrap_or(0);
|
||||
let primary_wakes = primary.map(|t| t.wakes.len()).unwrap_or(0);
|
||||
let audit_blind =
|
||||
waiter_count > 0 && primary_waits == 0 && primary_wakes == 0;
|
||||
println!(
|
||||
" handle={:#010x} kind={} waiters={} signaled={} signal_attempts={} (primary={}, ghost={}) waits={} wakes={}{}",
|
||||
h,
|
||||
kind_str,
|
||||
waiter_count,
|
||||
signaled
|
||||
.map(|s| if s { "true" } else { "false" })
|
||||
.unwrap_or("?"),
|
||||
total_signals,
|
||||
primary_signals,
|
||||
ghost_signals,
|
||||
primary_waits,
|
||||
primary_wakes,
|
||||
if audit_blind { " <AUDIT_BLIND>" } else { "" },
|
||||
);
|
||||
if let Some(t) = primary {
|
||||
println!(
|
||||
" created cycle={} tid={} lr={:#010x} src={}",
|
||||
t.created.cycle, t.created.tid, t.created.lr, t.created.source,
|
||||
);
|
||||
}
|
||||
// Producer-class classification. Source strings are stable
|
||||
// labels passed to `record_signal` at the export sites.
|
||||
// Anything not in the GuestExport set is treated as
|
||||
// KernelInternal (e.g. signal_io_completion_event).
|
||||
let guest_export_sources = [
|
||||
"KeSetEvent",
|
||||
"NtSetEvent",
|
||||
"KePulseEvent",
|
||||
"NtPulseEvent",
|
||||
"KeReleaseSemaphore",
|
||||
"NtReleaseSemaphore",
|
||||
"NtSignalAndWaitForSingleObjectEx",
|
||||
];
|
||||
let mut guest_export = 0usize;
|
||||
let mut kernel_internal = 0usize;
|
||||
let count_signals = |signals: &std::collections::VecDeque<
|
||||
xenia_kernel::audit::HandleAuditEntry,
|
||||
>,
|
||||
guest_export: &mut usize,
|
||||
kernel_internal: &mut usize| {
|
||||
for e in signals {
|
||||
if guest_export_sources.contains(&e.source) {
|
||||
*guest_export += 1;
|
||||
} else {
|
||||
*kernel_internal += 1;
|
||||
}
|
||||
}
|
||||
};
|
||||
if let Some(p) = primary {
|
||||
count_signals(&p.signals, &mut guest_export, &mut kernel_internal);
|
||||
}
|
||||
if let Some(g) = ghost {
|
||||
count_signals(&g.signals, &mut guest_export, &mut kernel_internal);
|
||||
}
|
||||
|
||||
// Compact merged timeline (cycle-sorted) of waits + signals.
|
||||
let mut tl: Vec<(u64, &str, &xenia_kernel::audit::HandleAuditEntry)> = Vec::new();
|
||||
if let Some(p) = primary {
|
||||
for e in &p.waits {
|
||||
tl.push((e.cycle, "wait", e));
|
||||
}
|
||||
for e in &p.signals {
|
||||
tl.push((e.cycle, "signal", e));
|
||||
}
|
||||
}
|
||||
if let Some(g) = ghost {
|
||||
for e in &g.signals {
|
||||
tl.push((e.cycle, "signal[ghost]", e));
|
||||
}
|
||||
}
|
||||
tl.sort_by_key(|(c, _, _)| *c);
|
||||
if !tl.is_empty() {
|
||||
println!(" timeline (last 16 of {}):", tl.len());
|
||||
let start = tl.len().saturating_sub(16);
|
||||
for (cycle, label, e) in &tl[start..] {
|
||||
println!(
|
||||
" cycle={} tid={} lr={:#010x} src={}[{}] aux={:#x}",
|
||||
cycle, e.tid, e.lr, e.source, label, e.aux,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// -- DIAGNOSIS --
|
||||
println!(" -- DIAGNOSIS --");
|
||||
println!(
|
||||
" GuestExport={} KernelInternal={} waits={}",
|
||||
guest_export, kernel_internal, primary_waits,
|
||||
);
|
||||
let conclusion = if total_signals == 0 && audit_blind {
|
||||
"waiter parked via non-audited path (CS / spinlock / DPC / sync-primitive helper) and no signals — instrument that wait path or check ObReferenceObjectByHandle plumbing"
|
||||
} else if total_signals == 0 && primary_waits > 0 {
|
||||
"producer is a missing kernel signal source (or BST-paradox upstream)"
|
||||
} else if total_signals == 0 && primary_waits == 0 {
|
||||
"no waits and no signals — handle may not be reached during this run"
|
||||
} else if guest_export > 0 && waiter_count > 0 {
|
||||
"signals landed but waiter still parked — wake-eligibility / mr-bit bug"
|
||||
} else if kernel_internal > 0 && waiter_count > 0 {
|
||||
"kernel-internal signal landed but waiter still parked — wake or routing bug"
|
||||
} else {
|
||||
"not stuck — signals consumed correctly"
|
||||
};
|
||||
println!(" => {}", conclusion);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -16,7 +16,7 @@
|
||||
//! to identify which kernel API should signal handles
|
||||
//! `0x10FC / 0x1014 / 0x1104 / 0x10DC / 0x10F0` but doesn't.
|
||||
|
||||
use std::collections::{HashMap, VecDeque};
|
||||
use std::collections::{HashMap, HashSet, VecDeque};
|
||||
|
||||
/// Maximum events per category per handle. Bounded so a long-running session
|
||||
/// doesn't OOM if a handle is signaled millions of times.
|
||||
@@ -72,10 +72,34 @@ impl HandleAuditTrail {
|
||||
}
|
||||
|
||||
/// The audit table itself. Lives on `KernelState`; opt-in via `enabled`.
|
||||
///
|
||||
/// `focus` + `ghost_trails` form the **parked-waiter diagnostic** added for
|
||||
/// audit-2026-05-fix Phase 2 (KRNBUG-AUDIT-001). When `focus` is non-empty,
|
||||
/// `record_signal_attempt` keeps a "ghost trail" for handles in the focus
|
||||
/// set even if no `record_create` ever observed them — i.e. the guest hand-
|
||||
/// initialized a `KEVENT` (via `KeInitializeEvent` or a raw write) and the
|
||||
/// existing `record_signal` would silently drop the attempt. Ghost trails
|
||||
/// are the only way to distinguish "guest never called Nt/KeSetEvent on
|
||||
/// this handle" from "signal landed but waiter wasn't woken".
|
||||
#[derive(Debug, Default)]
|
||||
pub struct HandleAudit {
|
||||
pub trails: HashMap<u32, HandleAuditTrail>,
|
||||
pub enabled: bool,
|
||||
/// Focus set: when non-empty, signals targeting handles in this set are
|
||||
/// captured even when no `record_create` exists. Populated from
|
||||
/// `--trace-handles=0x1004,0x100c,...`. Empty = whole-table audit.
|
||||
pub focus: HashSet<u32>,
|
||||
/// Ghost trails for never-created handles whose signals we still want
|
||||
/// to see. Keyed by handle. Only populated for handles in `focus`.
|
||||
pub ghost_trails: HashMap<u32, GhostTrail>,
|
||||
}
|
||||
|
||||
/// A ghost trail is a signal-only timeline for a handle that was never
|
||||
/// `record_create`d. We don't have a `kind` because we never saw a creation;
|
||||
/// callers rendering the report should label these as `<UNCREATED>`.
|
||||
#[derive(Debug, Default)]
|
||||
pub struct GhostTrail {
|
||||
pub signals: VecDeque<HandleAuditEntry>,
|
||||
}
|
||||
|
||||
impl HandleAudit {
|
||||
@@ -104,7 +128,30 @@ impl HandleAudit {
|
||||
}
|
||||
if let Some(trail) = self.trails.get_mut(&handle) {
|
||||
Self::push_bounded(&mut trail.signals, entry);
|
||||
return;
|
||||
}
|
||||
// No primary trail. Fall through to ghost-trail logic so signals
|
||||
// targeting focus-set handles are not silently dropped.
|
||||
self.record_signal_attempt_ghost(handle, entry);
|
||||
}
|
||||
|
||||
/// Record a signal attempt that targeted a focus-set handle but had no
|
||||
/// primary trail (i.e. the handle was never `record_create`d via one
|
||||
/// of our audit hook sites). Inserts into `ghost_trails`. Bounded by
|
||||
/// `AUDIT_RING_CAPACITY` per handle. No-op when `enabled = false` or
|
||||
/// `handle` is not in `focus`.
|
||||
///
|
||||
/// Public for direct invocation from internal kernel signal sites that
|
||||
/// don't go through `record_signal` (e.g. `signal_io_completion_event`,
|
||||
/// IRQ-callback paths) — those callers should both `record_signal`
|
||||
/// (for the primary-trail case) AND fall through here.
|
||||
#[inline]
|
||||
pub fn record_signal_attempt_ghost(&mut self, handle: u32, entry: HandleAuditEntry) {
|
||||
if !self.enabled || !self.focus.contains(&handle) {
|
||||
return;
|
||||
}
|
||||
let ghost = self.ghost_trails.entry(handle).or_default();
|
||||
Self::push_bounded(&mut ghost.signals, entry);
|
||||
}
|
||||
|
||||
#[inline]
|
||||
@@ -167,9 +214,38 @@ mod tests {
|
||||
#[test]
|
||||
fn signal_for_unknown_handle_is_dropped() {
|
||||
let mut a = HandleAudit { enabled: true, ..HandleAudit::default() };
|
||||
// No `record_create` first → handle has no trail.
|
||||
// No `record_create` first → handle has no trail. Without focus,
|
||||
// the signal is silently dropped (legacy behavior).
|
||||
a.record_signal(0x9999, entry(1, "NtSetEvent"));
|
||||
assert!(a.trails.is_empty());
|
||||
assert!(a.ghost_trails.is_empty());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn signal_for_focus_handle_lands_in_ghost_trail() {
|
||||
let mut a = HandleAudit { enabled: true, ..HandleAudit::default() };
|
||||
a.focus.insert(0x1004);
|
||||
// No `record_create` for 0x1004 — but it's in the focus set.
|
||||
a.record_signal(0x1004, entry(1, "NtSetEvent"));
|
||||
a.record_signal(0x1004, entry(2, "KeSetEvent"));
|
||||
// 0x9999 NOT in focus → still dropped.
|
||||
a.record_signal(0x9999, entry(3, "NtSetEvent"));
|
||||
|
||||
assert!(a.trails.is_empty());
|
||||
let ghost = a.ghost_trails.get(&0x1004).expect("ghost trail expected");
|
||||
assert_eq!(ghost.signals.len(), 2);
|
||||
assert!(!a.ghost_trails.contains_key(&0x9999));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn ghost_trail_does_not_double_record_when_primary_exists() {
|
||||
let mut a = HandleAudit { enabled: true, ..HandleAudit::default() };
|
||||
a.focus.insert(0x1004);
|
||||
a.record_create(0x1004, "Event/Manual", entry(0, "NtCreateEvent"));
|
||||
a.record_signal(0x1004, entry(1, "NtSetEvent"));
|
||||
|
||||
assert_eq!(a.trails[&0x1004].signals.len(), 1);
|
||||
assert!(a.ghost_trails.is_empty());
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
Reference in New Issue
Block a user