diag(audit): KRNBUG-AUDIT-001 — focused parked-waiter ghost-trail diagnostic
Adds a one-run diagnostic that distinguishes "guest never called
Nt/KeSetEvent on this handle" from "signal landed but waiter wasn't
woken", for any handle named via `--trace-handles-focus`.
Parked-waiter context (project_xenia_rs_sylpheed_stage3_2026_04_29):
four worker threads block Sylpheed past `draws=0` on handles
0x1004 / 0x100c / 0x15e4 / 0x42450b5c (mr=true, sig=false). The
pre-existing audit dropped signal-attempts that targeted handles
without a primary trail, so we couldn't tell whether the producer
was unreachable in the guest or whether the signal landed but missed
its waiter.
Three changes:
* audit.rs: `HandleAudit` gains `focus: HashSet<u32>` and
`ghost_trails: HashMap<u32, GhostTrail>`. `record_signal`
auto-falls-through to a new `record_signal_attempt_ghost` when no
primary trail exists AND the handle is in `focus`. Bounded by
AUDIT_RING_CAPACITY per handle. Two new tests cover the focus
ghost-trail and no-double-record invariants.
* main.rs: new `--trace-handles-focus=<LIST>` flag (hex 0x or decimal,
comma-separated) populates `kernel.audit.focus`. Implies
`--trace-handles`. New "=== Handle audit (focus) ===" section in
`dump_thread_diagnostic` emits per-handle:
- signal_attempts (primary + ghost), waits, wakes
- merged cycle-sorted timeline (last 16)
- GuestExport / KernelInternal classification
- <AUDIT_BLIND> marker when waiter_count > 0 but the audit
saw no waits (i.e. waiter parked via a non-audit path —
CS / spinlock / DPC).
- DIAGNOSIS conclusion that selects between five branches.
* `cmd_check` passes None for focus → goldens unaffected.
Empirical run output at -n 500M lockstep with
`--trace-handles-focus=0x1004,0x100c,0x15e4,0x42450b5c`:
handle=0x00001004 kind=Event/Manual waiters=1 signaled=false
signal_attempts=0 (primary=0, ghost=0)
waits=1 wakes=0
created cycle=0 tid=1 lr=0x824a9f6c src=NtCreateEvent
=> producer is a missing kernel signal source
(or BST-paradox upstream)
... (same shape for 0x100c, 0x15e4)
handle=0x42450b5c kind=<UNCREATED> waiters=1 signal_attempts=0
waits=0 wakes=0 <AUDIT_BLIND>
=> waiter parked via non-audited path
Conclusion: hypothesis (A) confirmed for all 4 handles. The stall is
NOT a wake/eligibility bug — the producer is a genuinely missing kernel signal
source. The 3 Event/Manual handles share a creator
(lr=0x824a9f6c, tid=1) and the same wait-call wrapper at
lr=0x824ac578 — these are 3 worker threads all parked on
"work-available" notifications that never come.
Verification:
* cargo test --workspace --release: 558 passing (+2 new ghost-trail
tests vs prior 556 baseline)
* lockstep -n 100M --stable-digest: bit-identical to master HEAD
Audit IDs: KRNBUG-AUDIT-001 (closed — diagnostic instrumentation).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -16,7 +16,7 @@
|
||||
//! to identify which kernel API should signal handles
|
||||
//! `0x10FC / 0x1014 / 0x1104 / 0x10DC / 0x10F0` but doesn't.
|
||||
|
||||
use std::collections::{HashMap, VecDeque};
|
||||
use std::collections::{HashMap, HashSet, VecDeque};
|
||||
|
||||
/// Maximum events per category per handle. Bounded so a long-running session
|
||||
/// doesn't OOM if a handle is signaled millions of times.
|
||||
@@ -72,10 +72,34 @@ impl HandleAuditTrail {
|
||||
}
|
||||
|
||||
/// The audit table itself. Lives on `KernelState`; opt-in via `enabled`.
|
||||
///
|
||||
/// `focus` + `ghost_trails` form the **parked-waiter diagnostic** added for
|
||||
/// audit-2026-05-fix Phase 2 (KRNBUG-AUDIT-001). When `focus` is non-empty,
|
||||
/// `record_signal_attempt_ghost` keeps a "ghost trail" for handles in the focus
|
||||
/// set even if no `record_create` ever observed them — i.e. the guest hand-
|
||||
/// initialized a `KEVENT` (via `KeInitializeEvent` or a raw write) and the
|
||||
/// existing `record_signal` would silently drop the attempt. Ghost trails
|
||||
/// are the only way to distinguish "guest never called Nt/KeSetEvent on
|
||||
/// this handle" from "signal landed but waiter wasn't woken".
|
||||
#[derive(Debug, Default)]
|
||||
pub struct HandleAudit {
|
||||
pub trails: HashMap<u32, HandleAuditTrail>,
|
||||
pub enabled: bool,
|
||||
/// Focus set: when non-empty, signals targeting handles in this set are
|
||||
/// captured even when no `record_create` exists. Populated from
|
||||
/// `--trace-handles-focus=0x1004,0x100c,...`. Empty = whole-table audit.
|
||||
pub focus: HashSet<u32>,
|
||||
/// Ghost trails for never-created handles whose signals we still want
|
||||
/// to see. Keyed by handle. Only populated for handles in `focus`.
|
||||
pub ghost_trails: HashMap<u32, GhostTrail>,
|
||||
}
|
||||
|
||||
/// A ghost trail is a signal-only timeline for a handle that was never
|
||||
/// `record_create`d. We don't have a `kind` because we never saw a creation;
|
||||
/// callers rendering the report should label these as `<UNCREATED>`.
|
||||
#[derive(Debug, Default)]
|
||||
pub struct GhostTrail {
|
||||
pub signals: VecDeque<HandleAuditEntry>,
|
||||
}
|
||||
|
||||
impl HandleAudit {
|
||||
@@ -104,7 +128,30 @@ impl HandleAudit {
|
||||
}
|
||||
if let Some(trail) = self.trails.get_mut(&handle) {
|
||||
Self::push_bounded(&mut trail.signals, entry);
|
||||
return;
|
||||
}
|
||||
// No primary trail. Fall through to ghost-trail logic so signals
|
||||
// targeting focus-set handles are not silently dropped.
|
||||
self.record_signal_attempt_ghost(handle, entry);
|
||||
}
|
||||
|
||||
/// Record a signal attempt that targeted a focus-set handle but had no
|
||||
/// primary trail (i.e. the handle was never `record_create`d via one
|
||||
/// of our audit hook sites). Inserts into `ghost_trails`. Bounded by
|
||||
/// `AUDIT_RING_CAPACITY` per handle. No-op when `enabled = false` or
|
||||
/// `handle` is not in `focus`.
|
||||
///
|
||||
/// Public for direct invocation from internal kernel signal sites that
|
||||
/// don't go through `record_signal` (e.g. `signal_io_completion_event`,
|
||||
/// IRQ-callback paths). NOTE: `record_signal` already falls through here
|
||||
/// when no primary trail exists, so call only one of the two per signal.
|
||||
#[inline]
|
||||
pub fn record_signal_attempt_ghost(&mut self, handle: u32, entry: HandleAuditEntry) {
|
||||
if !self.enabled || !self.focus.contains(&handle) {
|
||||
return;
|
||||
}
|
||||
let ghost = self.ghost_trails.entry(handle).or_default();
|
||||
Self::push_bounded(&mut ghost.signals, entry);
|
||||
}
|
||||
|
||||
#[inline]
|
||||
@@ -167,9 +214,38 @@ mod tests {
|
||||
#[test]
|
||||
fn signal_for_unknown_handle_is_dropped() {
|
||||
let mut a = HandleAudit { enabled: true, ..HandleAudit::default() };
|
||||
// No `record_create` first → handle has no trail.
|
||||
// No `record_create` first → handle has no trail. Without focus,
|
||||
// the signal is silently dropped (legacy behavior).
|
||||
a.record_signal(0x9999, entry(1, "NtSetEvent"));
|
||||
assert!(a.trails.is_empty());
|
||||
assert!(a.ghost_trails.is_empty());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn signal_for_focus_handle_lands_in_ghost_trail() {
|
||||
let mut a = HandleAudit { enabled: true, ..HandleAudit::default() };
|
||||
a.focus.insert(0x1004);
|
||||
// No `record_create` for 0x1004 — but it's in the focus set.
|
||||
a.record_signal(0x1004, entry(1, "NtSetEvent"));
|
||||
a.record_signal(0x1004, entry(2, "KeSetEvent"));
|
||||
// 0x9999 NOT in focus → still dropped.
|
||||
a.record_signal(0x9999, entry(3, "NtSetEvent"));
|
||||
|
||||
assert!(a.trails.is_empty());
|
||||
let ghost = a.ghost_trails.get(&0x1004).expect("ghost trail expected");
|
||||
assert_eq!(ghost.signals.len(), 2);
|
||||
assert!(!a.ghost_trails.contains_key(&0x9999));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn ghost_trail_does_not_double_record_when_primary_exists() {
|
||||
let mut a = HandleAudit { enabled: true, ..HandleAudit::default() };
|
||||
a.focus.insert(0x1004);
|
||||
a.record_create(0x1004, "Event/Manual", entry(0, "NtCreateEvent"));
|
||||
a.record_signal(0x1004, entry(1, "NtSetEvent"));
|
||||
|
||||
assert_eq!(a.trails[&0x1004].signals.len(), 1);
|
||||
assert!(a.ghost_trails.is_empty());
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
Reference in New Issue
Block a user