diff --git a/crates/xenia-kernel/Cargo.toml b/crates/xenia-kernel/Cargo.toml
index 50de041..f9dd134 100644
--- a/crates/xenia-kernel/Cargo.toml
+++ b/crates/xenia-kernel/Cargo.toml
@@ -8,6 +8,10 @@ license.workspace = true
 xenia-types = { workspace = true }
 xenia-memory = { workspace = true }
 xenia-cpu = { workspace = true }
+xenia-vfs = { workspace = true }
+xenia-hid = { workspace = true }
+xenia-gpu = { workspace = true }
 tracing = { workspace = true }
+metrics = { workspace = true }
 thiserror = { workspace = true }
 anyhow = { workspace = true }
diff --git a/crates/xenia-kernel/src/audit.rs b/crates/xenia-kernel/src/audit.rs
new file mode 100644
index 0000000..204dbe8
--- /dev/null
+++ b/crates/xenia-kernel/src/audit.rs
@@ -0,0 +1,195 @@
+//! Per-handle audit trail for diagnosing HLE sync gaps.
+//!
+//! When enabled (via `--trace-handles` / `XENIA_TRACE_HANDLES=1`), the kernel
+//! records every handle's create/signal/wait/wake events into a bounded
+//! ring per handle. `dump_thread_diagnostic` (in `xenia-app`) prints the
+//! trail at end-of-run, which lets a session see *who* signaled (or failed
+//! to signal) a given handle and *who* parked on it.
+//!
+//! The harness is behavior-neutral: when `enabled = false` (the default),
+//! every record method is an `#[inline]` no-op. When enabled, each record
+//! costs an O(1) HashMap probe + a `VecDeque::push_back` with a bounded
+//! `pop_front` to keep memory at ~4 KiB per handle worst case (3 rings × 32 × 40 B).
+//!
+//! See the `project_xenia_rs_scheduler.md` note on the latent
+//! `scheduler.deadlock_recoveries` event during boot — this harness exists
+//! to identify which kernel API should signal handles
+//! `0x10FC / 0x1014 / 0x1104 / 0x10DC / 0x10F0` but doesn't.
+
+use std::collections::{HashMap, VecDeque};
+
+/// Maximum events per category per handle. Bounded so a long-running session
+/// doesn't OOM if a handle is signaled millions of times.
+pub const AUDIT_RING_CAPACITY: usize = 32;
+
+/// One audit record. Captured at the export's call site so `lr` points at
+/// the guest caller (one instruction past the `bl` to the kernel thunk).
+#[derive(Debug, Clone, Copy)]
+pub struct HandleAuditEntry {
+    /// Per-thread timebase tick at the time of the event. Useful for
+    /// ordering events across threads — same units as
+    /// `Scheduler::ctx(0).timebase`.
+    pub cycle: u64,
+    /// Guest thread id (NOT hw_id — `tid` survives migration).
+    pub tid: u32,
+    /// Caller's LR (the guest pc one past the `bl` to the export).
+    pub lr: u32,
+    /// Stable, kernel-internal label naming the source export. e.g.
+    /// "KeSetEvent", "NtSetEvent", "wake_eligible_waiters".
+    pub source: &'static str,
+    /// Free-form auxiliary data. For signals: previous_state. For waits:
+    /// `(alertable, timeout_ns_or_max)` packed. For wakes: `gpr[3]` set.
+    /// Read by callers as needed.
+    pub aux: u64,
+}
+
+/// Per-handle audit trail. Lives in `KernelState::audit.trails`.
+#[derive(Debug)]
+pub struct HandleAuditTrail {
+    /// Stable label: "Event/Manual", "Event/Auto", "Semaphore", "Timer/Manual",
+    /// "Timer/Auto", "Mutant", "Thread". Used for filtering in the dump.
+    pub kind: &'static str,
+    /// When/who/where the handle was minted.
+    pub created: HandleAuditEntry,
+    /// Bounded ring of signal events.
+    pub signals: VecDeque<HandleAuditEntry>,
+    /// Bounded ring of wait-entry events (one per `Wait*` call).
+    pub waits: VecDeque<HandleAuditEntry>,
+    /// Bounded ring of wake events (one per scheduler-side wake).
+    pub wakes: VecDeque<HandleAuditEntry>,
+}
+
+impl HandleAuditTrail {
+    /// Builds the trail for a newly minted handle: stores its `kind` label
+    /// and creation entry, and starts the signal/wait/wake rings empty with
+    /// room for `AUDIT_RING_CAPACITY` entries each.
+    fn new(kind: &'static str, created: HandleAuditEntry) -> Self {
+        let signals = VecDeque::with_capacity(AUDIT_RING_CAPACITY);
+        let waits = VecDeque::with_capacity(AUDIT_RING_CAPACITY);
+        let wakes = VecDeque::with_capacity(AUDIT_RING_CAPACITY);
+        Self { kind, created, signals, waits, wakes }
+    }
+}
+
+/// The audit table itself. Lives on `KernelState`; opt-in via `enabled`.
+#[derive(Debug, Default)]
+pub struct HandleAudit {
+    pub trails: HashMap<u32, HandleAuditTrail>,
+    pub enabled: bool,
+}
+
+impl HandleAudit {
+    /// Push an entry, first evicting oldest entries until the ring is below capacity.
+    #[inline]
+    fn push_bounded(ring: &mut VecDeque<HandleAuditEntry>, entry: HandleAuditEntry) {
+        while ring.len() >= AUDIT_RING_CAPACITY {
+            ring.pop_front();
+        }
+        ring.push_back(entry);
+    }
+
+    /// Trail for `handle`; `None` when auditing is disabled or the handle is untracked.
+    #[inline]
+    fn trail_mut(&mut self, handle: u32) -> Option<&mut HandleAuditTrail> {
+        if !self.enabled {
+            return None;
+        }
+        self.trails.get_mut(&handle)
+    }
+
+    /// Starts a fresh trail for `handle`, replacing any previous one. No-op when disabled.
+    #[inline]
+    pub fn record_create(&mut self, handle: u32, kind: &'static str, entry: HandleAuditEntry) {
+        if self.enabled {
+            self.trails.insert(handle, HandleAuditTrail::new(kind, entry));
+        }
+    }
+
+    /// Records a signal event; dropped when disabled or `handle` has no trail.
+    #[inline]
+    pub fn record_signal(&mut self, handle: u32, entry: HandleAuditEntry) {
+        if let Some(trail) = self.trail_mut(handle) {
+            Self::push_bounded(&mut trail.signals, entry);
+        }
+    }
+
+    /// Records a wait-entry event; dropped when disabled or `handle` has no trail.
+    #[inline]
+    pub fn record_wait(&mut self, handle: u32, entry: HandleAuditEntry) {
+        if let Some(trail) = self.trail_mut(handle) {
+            Self::push_bounded(&mut trail.waits, entry);
+        }
+    }
+
+    /// Records a wake event; dropped when disabled or `handle` has no trail.
+    #[inline]
+    pub fn record_wake(&mut self, handle: u32, entry: HandleAuditEntry) {
+        if let Some(trail) = self.trail_mut(handle) {
+            Self::push_bounded(&mut trail.wakes, entry);
+        }
+    }
+
+    /// `(signal_count, wait_count, wake_count)` for `handle`; `None` if it has no trail.
+    pub fn counts(&self, handle: u32) -> Option<(usize, usize, usize)> {
+        let trail = self.trails.get(&handle)?;
+        Some((trail.signals.len(), trail.waits.len(), trail.wakes.len()))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn entry(cycle: u64, source: &'static str) -> HandleAuditEntry {
+        HandleAuditEntry { cycle, tid: 1, lr: 0x8200_0000, source, aux: 0 }
+    }
+
+    #[test]
+    fn disabled_audit_is_a_noop() {
+        let mut a = HandleAudit::default();
+        a.record_create(0x1000, "Event/Auto", entry(0, "NtCreateEvent"));
+        a.record_signal(0x1000, entry(1, "NtSetEvent"));
+        assert!(a.trails.is_empty());
+    }
+
+    #[test]
+    fn enabled_records_create_and_events() {
+        let mut a = HandleAudit { enabled: true, ..HandleAudit::default() };
+        a.record_create(0x1014, "Event/Auto", entry(0, "NtCreateEvent"));
+        a.record_signal(0x1014, entry(10, "NtSetEvent"));
+        a.record_wait(0x1014, entry(5, "NtWaitForSingleObjectEx"));
+        a.record_wake(0x1014, entry(11, "wake_eligible_waiters"));
+
+        let counts = a.counts(0x1014).unwrap();
+        assert_eq!(counts, (1, 1, 1));
+    }
+
+    #[test]
+    fn signal_for_unknown_handle_is_dropped() {
+        let mut a = HandleAudit { enabled: true, ..HandleAudit::default() };
+        // No `record_create` first → handle has no trail.
+        a.record_signal(0x9999, entry(1, "NtSetEvent"));
+        assert!(a.trails.is_empty());
+    }
+
+    #[test]
+    fn ring_is_bounded_to_capacity() {
+        let mut a = HandleAudit { enabled: true, ..HandleAudit::default() };
+        a.record_create(0x10FC, "Event/Auto", entry(0, "NtCreateEvent"));
+        for i in 0..(AUDIT_RING_CAPACITY * 3) as u64 {
+            a.record_signal(0x10FC, entry(i, "NtSetEvent"));
+        }
+        let trail = &a.trails[&0x10FC];
+        assert_eq!(trail.signals.len(), AUDIT_RING_CAPACITY);
+        // Oldest should have been dropped — the first remaining entry is at
+        // cycle = 2 * AUDIT_RING_CAPACITY (i.e. 64 if capacity = 32).
+        let first = trail.signals.front().unwrap();
+        assert_eq!(first.cycle, (AUDIT_RING_CAPACITY * 2) as u64);
+    }
+
+    #[test]
+    fn unknown_handle_counts_returns_none() {
+        let a = HandleAudit::default();
+        assert!(a.counts(0x10FC).is_none());
+    }
+}
diff --git a/crates/xenia-kernel/src/exports.rs b/crates/xenia-kernel/src/exports.rs
index 15a8b5e..1e570a3 100644
--- a/crates/xenia-kernel/src/exports.rs
+++ b/crates/xenia-kernel/src/exports.rs
@@ -2,10 +2,15 @@
 //! Each export mirrors a function from xboxkrnl_table.inc.
 
 use crate::objects::KernelObject;
-use crate::state::{KernelState, ModuleId};
-use xenia_cpu::PpcContext;
+use crate::state::{GuestMemoryPcr, KernelState, ModuleId};
+use crate::thread::allocate_thread_image;
+use xenia_cpu::scheduler::{BlockReason, SpawnParams};
+use xenia_cpu::{PpcContext, ThreadRef};
 use xenia_memory::{GuestMemory, MemoryAccess};
 
+// NTSTATUS constants used by wait/sync paths.
+const STATUS_TIMEOUT: u64 = 0x0000_0102;
+
 pub fn register_exports(state: &mut KernelState) {
     use ModuleId::Xboxkrnl;
 
@@ -29,7 +34,7 @@ pub fn register_exports(state: &mut KernelState) {
     state.register_export(Xboxkrnl, 0x4D, "KeAcquireSpinLockAtRaisedIrql", stub_return_zero);
     state.register_export(Xboxkrnl, 0x52, "KeBugCheck", ke_bug_check);
     state.register_export(Xboxkrnl, 0x53, "KeBugCheckEx", ke_bug_check_ex);
-    state.register_export(Xboxkrnl, 0x5A, "KeDelayExecutionThread", stub_success);
+    state.register_export(Xboxkrnl, 0x5A, "KeDelayExecutionThread", ke_delay_execution_thread);
     state.register_export(Xboxkrnl, 0x5D, "KeEnableFpuExceptions", stub_success);
     state.register_export(Xboxkrnl, 0x5F, "KeEnterCriticalRegion", stub_success);
     state.register_export(Xboxkrnl, 0x66, "KeGetCurrentProcessType", ke_get_current_process_type);
@@ -37,21 +42,24 @@ pub fn register_exports(state: &mut KernelState) {
     state.register_export(Xboxkrnl, 0x6C, "KeUnlockL2", stub_success);
     state.register_export(Xboxkrnl, 0x74, "KeInitializeSemaphore", ke_initialize_semaphore);
     state.register_export(Xboxkrnl, 0x7D, "KeLeaveCriticalRegion", stub_success);
-    state.register_export(Xboxkrnl, 0x81, "KeQueryBasePriorityThread", stub_return_zero);
+    state.register_export(Xboxkrnl, 0x7F, "KePulseEvent", ke_pulse_event);
+    state.register_export(Xboxkrnl, 0x81, "KeQueryBasePriorityThread", ke_query_base_priority_thread);
+    state.register_export(Xboxkrnl, 0x82, "KeQueryIdealProcessor", ke_query_ideal_processor);
     state.register_export(Xboxkrnl, 0x83, "KeQueryPerformanceFrequency", ke_query_performance_frequency);
     state.register_export(Xboxkrnl, 0x84, "KeQuerySystemTime", ke_query_system_time);
     state.register_export(Xboxkrnl, 0x85, "KeRaiseIrqlToDpcLevel", stub_return_zero);
-    state.register_export(Xboxkrnl, 0x88, "KeReleaseSemaphore", stub_return_zero);
+    state.register_export(Xboxkrnl, 0x88, "KeReleaseSemaphore", ke_release_semaphore);
     state.register_export(Xboxkrnl, 0x89, "KeReleaseSpinLockFromRaisedIrql", stub_success);
-    state.register_export(Xboxkrnl, 0x8F, "KeResetEvent", stub_return_zero);
-    state.register_export(Xboxkrnl, 0x92, "KeResumeThread", stub_return_zero);
-    state.register_export(Xboxkrnl, 0x97, "KeSetAffinityThread", stub_return_zero);
-    state.register_export(Xboxkrnl, 0x99, "KeSetBasePriorityThread", stub_return_zero);
+    state.register_export(Xboxkrnl, 0x8F, "KeResetEvent", ke_reset_event);
+    state.register_export(Xboxkrnl, 0x92, "KeResumeThread", ke_resume_thread);
+    state.register_export(Xboxkrnl, 0x97, "KeSetAffinityThread", ke_set_affinity_thread);
+    state.register_export(Xboxkrnl, 0x98, "KeSetIdealProcessor", ke_set_ideal_processor);
+    state.register_export(Xboxkrnl, 0x99, "KeSetBasePriorityThread", ke_set_base_priority_thread);
     state.register_export(Xboxkrnl, 0x9B, "KeSetCurrentStackPointers", stub_success);
-    state.register_export(Xboxkrnl, 0x9D, "KeSetEvent", stub_return_zero);
+    state.register_export(Xboxkrnl, 0x9D, "KeSetEvent", ke_set_event);
     state.register_export(Xboxkrnl, 0xAE, "KeTryToAcquireSpinLockAtRaisedIrql", ke_try_acquire_spinlock);
-    state.register_export(Xboxkrnl, 0xAF, "KeWaitForMultipleObjects", stub_success);
-    state.register_export(Xboxkrnl, 0xB0, "KeWaitForSingleObject", stub_success);
+    state.register_export(Xboxkrnl, 0xAF, "KeWaitForMultipleObjects", ke_wait_for_multiple_objects);
+    state.register_export(Xboxkrnl, 0xB0, "KeWaitForSingleObject", ke_wait_for_single_object);
     state.register_export(Xboxkrnl, 0xB1, "KfAcquireSpinLock", stub_return_zero);
     state.register_export(Xboxkrnl, 0xB3, "KfLowerIrql", stub_success);
     state.register_export(Xboxkrnl, 0xB4, "KfReleaseSpinLock", stub_success);
@@ -72,34 +80,42 @@ pub fn register_exports(state: &mut KernelState) {
 
     // Nt*
     state.register_export(Xboxkrnl, 0xCC, "NtAllocateVirtualMemory", nt_allocate_virtual_memory);
-    state.register_export(Xboxkrnl, 0xCD, "NtCancelTimer", stub_success);
-    state.register_export(Xboxkrnl, 0xCE, "NtClearEvent", stub_success);
+    state.register_export(Xboxkrnl, 0xCD, "NtCancelTimer", nt_cancel_timer);
+    state.register_export(Xboxkrnl, 0xCE, "NtClearEvent", nt_clear_event);
     state.register_export(Xboxkrnl, 0xCF, "NtClose", nt_close);
     state.register_export(Xboxkrnl, 0xD1, "NtCreateEvent", nt_create_event);
     state.register_export(Xboxkrnl, 0xD2, "NtCreateFile", nt_create_file);
     state.register_export(Xboxkrnl, 0xD5, "NtCreateSemaphore", nt_create_semaphore);
     state.register_export(Xboxkrnl, 0xD7, "NtCreateTimer", nt_create_timer);
     state.register_export(Xboxkrnl, 0xD9, "NtDeviceIoControlFile", stub_success);
-    state.register_export(Xboxkrnl, 0xDA, "NtDuplicateObject", stub_success);
+    state.register_export(Xboxkrnl, 0xDA, "NtDuplicateObject", nt_duplicate_object);
     state.register_export(Xboxkrnl, 0xDB, "NtFlushBuffersFile", stub_success);
     state.register_export(Xboxkrnl, 0xDC, "NtFreeVirtualMemory", stub_success);
     state.register_export(Xboxkrnl, 0xDF, "NtOpenFile", nt_open_file);
+    state.register_export(Xboxkrnl, 0xE2, "NtPulseEvent", nt_pulse_event);
     state.register_export(Xboxkrnl, 0xE4, "NtQueryDirectoryFile", nt_query_directory_file);
     state.register_export(Xboxkrnl, 0xE7, "NtQueryFullAttributesFile", nt_query_full_attributes_file);
-    state.register_export(Xboxkrnl, 0xE8, "NtQueryInformationFile", stub_success);
+    state.register_export(Xboxkrnl, 0xE8, "NtQueryInformationFile", nt_query_information_file);
     state.register_export(Xboxkrnl, 0xEE, "NtQueryVirtualMemory", stub_success);
-    state.register_export(Xboxkrnl, 0xEF, "NtQueryVolumeInformationFile", stub_success);
+    state.register_export(Xboxkrnl, 0xEF, "NtQueryVolumeInformationFile", nt_query_volume_information_file);
     state.register_export(Xboxkrnl, 0xF0, "NtReadFile", nt_read_file);
-    state.register_export(Xboxkrnl, 0xF3, "NtReleaseSemaphore", stub_return_zero);
-    state.register_export(Xboxkrnl, 0xF5, "NtResumeThread", stub_return_zero);
-    state.register_export(Xboxkrnl, 0xF6, "NtSetEvent", stub_success);
-    state.register_export(Xboxkrnl, 0xF7, "NtSetInformationFile", stub_success);
-    state.register_export(Xboxkrnl, 0xFA, "NtSetTimerEx", stub_success);
-    state.register_export(Xboxkrnl, 0xFC, "NtSuspendThread", stub_return_zero);
-    state.register_export(Xboxkrnl, 0xFD, "NtWaitForSingleObjectEx", stub_success);
-    state.register_export(Xboxkrnl, 0xFE, "NtWaitForMultipleObjectsEx", stub_success);
+    state.register_export(Xboxkrnl, 0xF3, "NtReleaseSemaphore", nt_release_semaphore);
+    state.register_export(Xboxkrnl, 0xF5, "NtResumeThread", nt_resume_thread);
+    state.register_export(Xboxkrnl, 0xF6, "NtSetEvent", nt_set_event);
+    state.register_export(Xboxkrnl, 0xF7, "NtSetInformationFile", nt_set_information_file);
+    state.register_export(Xboxkrnl, 0xFA, "NtSetTimerEx", nt_set_timer_ex);
+    // NOTE: `NtSetInformationThread` is NOT in xboxkrnl_table.inc on
+    // Xbox 360 — canary confirms ordinal 0xFB is
+    // `NtSignalAndWaitForSingleObjectEx`, registered below (after 0xFC).
+    // The prior `NtSetInformationThread` registration at 0xFB was silently
+    // overwritten by that registration; the `nt_set_information_thread`
+    // body is retained for the direct-call unit test, not as an ordinal.
+    state.register_export(Xboxkrnl, 0xFC, "NtSuspendThread", nt_suspend_thread);
+    state.register_export(Xboxkrnl, 0xFB, "NtSignalAndWaitForSingleObjectEx", nt_signal_and_wait_for_single_object_ex);
+    state.register_export(Xboxkrnl, 0xFD, "NtWaitForSingleObjectEx", nt_wait_for_single_object_ex);
+    state.register_export(Xboxkrnl, 0xFE, "NtWaitForMultipleObjectsEx", nt_wait_for_multiple_objects_ex);
     state.register_export(Xboxkrnl, 0xFF, "NtWriteFile", nt_write_file);
-    state.register_export(Xboxkrnl, 0x0101, "NtYieldExecution", stub_success);
+    state.register_export(Xboxkrnl, 0x0101, "NtYieldExecution", nt_yield_execution);
 
     // Object
     state.register_export(Xboxkrnl, 0x0103, "ObCreateSymbolicLink", stub_success);
@@ -107,7 +123,7 @@ pub fn register_exports(state: &mut KernelState) {
     state.register_export(Xboxkrnl, 0x0105, "ObDereferenceObject", stub_success);
     state.register_export(Xboxkrnl, 0x010B, "ObLookupThreadByThreadId", stub_success);
     state.register_export(Xboxkrnl, 0x010E, "ObOpenObjectByPointer", stub_success);
-    state.register_export(Xboxkrnl, 0x0110, "ObReferenceObjectByHandle", stub_success);
+    state.register_export(Xboxkrnl, 0x0110, "ObReferenceObjectByHandle", ob_reference_object_by_handle);
 
     // RTL
     state.register_export(Xboxkrnl, 0x0119, "RtlCaptureContext", rtl_capture_context);
@@ -140,12 +156,12 @@ pub fn register_exports(state: &mut KernelState) {
     // Video
     state.register_export(Xboxkrnl, 0x01B1, "VdCallGraphicsNotificationRoutines", stub_success);
     state.register_export(Xboxkrnl, 0x01B4, "VdEnableDisableClockGating", stub_success);
-    state.register_export(Xboxkrnl, 0x01B6, "VdEnableRingBufferRPtrWriteBack", stub_success);
-    state.register_export(Xboxkrnl, 0x01B9, "VdGetCurrentDisplayGamma", stub_return_zero);
+    state.register_export(Xboxkrnl, 0x01B6, "VdEnableRingBufferRPtrWriteBack", vd_enable_ring_buffer_rptr_writeback);
+    state.register_export(Xboxkrnl, 0x01B9, "VdGetCurrentDisplayGamma", vd_get_current_display_gamma);
     state.register_export(Xboxkrnl, 0x01BA, "VdGetCurrentDisplayInformation", stub_success);
     state.register_export(Xboxkrnl, 0x01BD, "VdGetSystemCommandBuffer", vd_get_system_command_buffer);
     state.register_export(Xboxkrnl, 0x01C2, "VdInitializeEngines", stub_success);
-    state.register_export(Xboxkrnl, 0x01C3, "VdInitializeRingBuffer", stub_success);
+    state.register_export(Xboxkrnl, 0x01C3, "VdInitializeRingBuffer", vd_initialize_ring_buffer);
     state.register_export(Xboxkrnl, 0x01C5, "VdInitializeScalerCommandBuffer", stub_success);
     state.register_export(Xboxkrnl, 0x01C6, "VdIsHSIOTrainingSucceeded", vd_is_hsio_training_succeeded);
     state.register_export(Xboxkrnl, 0x01C7, "VdPersistDisplay", stub_success);
@@ -154,7 +170,7 @@ pub fn register_exports(state: &mut KernelState) {
     state.register_export(Xboxkrnl, 0x0269, "VdRetrainEDRAM", stub_success);
     state.register_export(Xboxkrnl, 0x026A, "VdRetrainEDRAMWorker", stub_success);
     state.register_export(Xboxkrnl, 0x01D3, "VdSetDisplayMode", stub_success);
-    state.register_export(Xboxkrnl, 0x01D5, "VdSetGraphicsInterruptCallback", stub_success);
+    state.register_export(Xboxkrnl, 0x01D5, "VdSetGraphicsInterruptCallback", vd_set_graphics_interrupt_callback);
     state.register_export(Xboxkrnl, 0x01D9, "VdSetSystemCommandBufferGpuIdentifierAddress", stub_success);
     state.register_export(Xboxkrnl, 0x01DC, "VdShutdownEngines", stub_success);
     state.register_export(Xboxkrnl, 0x025B, "VdSwap", vd_swap);
@@ -175,7 +191,7 @@ pub fn register_exports(state: &mut KernelState) {
 
     // Xex module
     state.register_export(Xboxkrnl, 0x0194, "XexCheckExecutablePrivilege", stub_return_zero);
-    state.register_export(Xboxkrnl, 0x0195, "XexGetModuleHandle", stub_return_zero);
+    state.register_export(Xboxkrnl, 0x0195, "XexGetModuleHandle", xex_get_module_handle);
     state.register_export(Xboxkrnl, 0x0197, "XexGetProcedureAddress", xex_get_procedure_address);
 
     // Exception handling
@@ -184,21 +200,21 @@ pub fn register_exports(state: &mut KernelState) {
 
 // ===== Generic stubs =====
 
-fn stub_success(ctx: &mut PpcContext, _mem: &mut GuestMemory, _state: &mut KernelState) {
+fn stub_success(ctx: &mut PpcContext, _mem: &GuestMemory, _state: &mut KernelState) {
     ctx.gpr[3] = 0; // STATUS_SUCCESS
 }
 
-fn stub_return_zero(ctx: &mut PpcContext, _mem: &mut GuestMemory, _state: &mut KernelState) {
+fn stub_return_zero(ctx: &mut PpcContext, _mem: &GuestMemory, _state: &mut KernelState) {
     ctx.gpr[3] = 0;
 }
 
 // ===== Debug =====
 
-fn dbg_break_point(_ctx: &mut PpcContext, _mem: &mut GuestMemory, _state: &mut KernelState) {
+fn dbg_break_point(_ctx: &mut PpcContext, _mem: &GuestMemory, _state: &mut KernelState) {
     tracing::warn!("DbgBreakPoint hit");
 }
 
-fn dbg_print(ctx: &mut PpcContext, mem: &mut GuestMemory, _state: &mut KernelState) {
+fn dbg_print(ctx: &mut PpcContext, mem: &GuestMemory, _state: &mut KernelState) {
     let str_ptr = ctx.gpr[3] as u32;
     if str_ptr != 0 {
         let s = read_cstring(mem, str_ptr);
@@ -209,58 +225,268 @@ fn dbg_print(ctx: &mut PpcContext, mem: &mut GuestMemory, _state: &mut KernelSta
 
 // ===== Threading =====
 
-fn ex_create_thread(ctx: &mut PpcContext, mem: &mut GuestMemory, state: &mut KernelState) {
-    // r3 = handle_ptr, r4 = stack_size, r5 = thread_id_ptr, r6 = xapi_startup
-    // r7 = start_address, r8 = start_context, r9 = creation_flags
+/// `ExCreateThread(handle_ptr, stack_size, thread_id_ptr, xapi_startup,
+///                 start_address, start_context, creation_flags)` —
+/// signature per xenia-canary's xboxkrnl_threading.cc. Creation flags bit 0 =
+/// CREATE_SUSPENDED; top 8 bits encode the affinity mask (logged, not
+/// enforced under Model B with 1-instr quantum).
+fn ex_create_thread(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
     let handle_ptr = ctx.gpr[3] as u32;
+    let stack_size = ctx.gpr[4] as u32;
     let thread_id_ptr = ctx.gpr[5] as u32;
+    let start_address = ctx.gpr[7] as u32;
+    let start_context = ctx.gpr[8] as u32;
+    let creation_flags = ctx.gpr[9] as u32;
 
-    let tid = state.next_thread_id;
-    state.next_thread_id += 1;
-    let handle = state.alloc_handle_for(KernelObject::Thread { id: tid });
+    let create_suspended = (creation_flags & 0x1) != 0;
+    let affinity = (creation_flags >> 24) & 0xFF;
 
-    if handle_ptr != 0 {
-        mem.write_u32(handle_ptr, handle);
+    let Some(image) = allocate_thread_image(state, mem, stack_size, 0) else {
+        tracing::error!("ExCreateThread: failed to allocate thread image");
+        ctx.gpr[3] = 0xC000_009A; // STATUS_INSUFFICIENT_RESOURCES
+        return;
+    };
+
+    use std::sync::atomic::Ordering;
+    let tid = state.next_thread_id.fetch_add(1, Ordering::Relaxed);
+    let handle = state.alloc_handle_for(KernelObject::Thread {
+        id: tid,
+        hw_id: None,
+        exit_code: None,
+        waiters: Vec::new(),
+    });
+
+    let tls_slot_count = state.next_tls_index.load(Ordering::Relaxed);
+    let params = SpawnParams {
+        entry: start_address,
+        start_context,
+        stack_base: image.stack_base,
+        stack_size: image.stack_size,
+        pcr_base: image.pcr_base,
+        tls_base: image.tls_base,
+        thread_handle: handle,
+        guest_tid: tid,
+        create_suspended,
+        is_initial: false,
+        tls_slot_count,
+        affinity_mask: affinity as u8,
+        priority: 0,
+        ideal_processor: None,
+    };
+    let result = state.scheduler.spawn(params, &mut GuestMemoryPcr(mem));
+    match result {
+        Ok(hw_id) => {
+            metrics::counter!("scheduler.spawn.ok").increment(1);
+            if let Some(KernelObject::Thread { hw_id: slot, .. }) = state.objects.get_mut(&handle) {
+                *slot = Some(hw_id);
+            }
+            if handle_ptr != 0 {
+                mem.write_u32(handle_ptr, handle);
+            }
+            if thread_id_ptr != 0 {
+                mem.write_u32(thread_id_ptr, tid);
+            }
+            tracing::info!(
+                "ExCreateThread: tid={} handle={:#x} hw={} entry={:#010x} start_ctx={:#010x} suspended={} aff={:#04x}",
+                tid,
+                handle,
+                hw_id,
+                start_address,
+                start_context,
+                create_suspended,
+                affinity,
+            );
+            ctx.gpr[3] = STATUS_SUCCESS;
+        }
+        Err(_) => {
+            metrics::counter!("scheduler.spawn.rejected").increment(1);
+            tracing::error!("ExCreateThread: no free HW thread slot");
+            ctx.gpr[3] = 0xC000_009A;
+        }
     }
-    if thread_id_ptr != 0 {
-        mem.write_u32(thread_id_ptr, tid);
-    }
-    tracing::info!("ExCreateThread: handle={:#x} tid={}", handle, tid);
-    ctx.gpr[3] = 0; // STATUS_SUCCESS
 }
 
-fn ex_terminate_thread(ctx: &mut PpcContext, _mem: &mut GuestMemory, _state: &mut KernelState) {
-    tracing::info!("ExTerminateThread: exit_status={:#x}", ctx.gpr[3]);
+/// `ExTerminateThread(exit_code)` — terminates the current guest thread. The
+/// thread transitions to Exited and the main loop unschedules it. Joiners
+/// waiting on the thread handle are woken with STATUS_SUCCESS.
+fn ex_terminate_thread(ctx: &mut PpcContext, _mem: &GuestMemory, state: &mut KernelState) {
+    let exit_code = ctx.gpr[3] as u32;
+    let (hw_id, tid, handle_opt) = state.scheduler.exit_current(exit_code);
+    tracing::info!(
+        "ExTerminateThread: tid={:?} hw={} exit_code={}",
+        tid,
+        hw_id,
+        exit_code
+    );
+    if let Some(handle) = handle_opt
+        && let Some(KernelObject::Thread {
+            exit_code: ec,
+            waiters,
+            ..
+        }) = state.objects.get_mut(&handle)
+        {
+            *ec = Some(exit_code);
+            let to_wake: Vec<ThreadRef> = std::mem::take(waiters);
+            for w in to_wake {
+                state.scheduler.wake_ref(w);
+            }
+        }
+    tracing::debug!("ExTerminateThread: exit_status={:#x}", ctx.gpr[3]);
     ctx.gpr[3] = 0;
 }
 
-fn hal_return_to_firmware(ctx: &mut PpcContext, _mem: &mut GuestMemory, _state: &mut KernelState) {
+fn hal_return_to_firmware(ctx: &mut PpcContext, _mem: &GuestMemory, _state: &mut KernelState) {
     tracing::warn!("HalReturnToFirmware: reason={:#x}", ctx.gpr[3]);
     ctx.gpr[3] = 0;
 }
 
 // ===== Ke* =====
 
-fn ke_bug_check(ctx: &mut PpcContext, _mem: &mut GuestMemory, _state: &mut KernelState) {
+/// `KeSetBasePriorityThread(thread_handle, priority) -> i32 old_priority` —
+/// Axis 1 wiring. Sylpheed calls this from its worker-init prologue on
+/// newly-created threads to bump them to time-critical / high. Storing the
+/// value on the `GuestThread` makes `HwSlot::pick_runnable` honor it.
+fn ke_set_base_priority_thread(
+    ctx: &mut PpcContext,
+    _mem: &GuestMemory,
+    state: &mut KernelState,
+) {
+    let handle = resolve_pseudo_handle(state, ctx.gpr[3] as u32);
+    let new_pri = ctx.gpr[4] as i32;
+    let prev = state
+        .scheduler
+        .find_by_handle(handle)
+        .map(|r| state.scheduler.set_priority_ref(r, new_pri))
+        .unwrap_or(0);
+    ctx.gpr[3] = prev as u32 as u64;
+}
+
+fn ke_query_base_priority_thread(
+    ctx: &mut PpcContext,
+    _mem: &GuestMemory,
+    state: &mut KernelState,
+) {
+    let handle = resolve_pseudo_handle(state, ctx.gpr[3] as u32);
+    let pri = state
+        .scheduler
+        .find_by_handle(handle)
+        .map(|r| state.scheduler.priority_ref(r))
+        .unwrap_or(0);
+    ctx.gpr[3] = pri as u32 as u64;
+}
+
+/// `KeSetIdealProcessor(thread_handle, proc_number) -> u8 old_ideal` —
+/// Axis 5. Stores the hint on the `GuestThread` for future spawn-sibling
+/// placement; does NOT migrate a live thread (use `KeSetAffinityThread`
+/// for that).
+fn ke_set_ideal_processor(
+    ctx: &mut PpcContext,
+    _mem: &GuestMemory,
+    state: &mut KernelState,
+) {
+    let handle = resolve_pseudo_handle(state, ctx.gpr[3] as u32);
+    let ideal = ctx.gpr[4] as u8;
+    let prev = state
+        .scheduler
+        .find_by_handle(handle)
+        .map(|r| state.scheduler.set_ideal_ref(r, ideal))
+        .unwrap_or(0xFF);
+    ctx.gpr[3] = prev as u64;
+}
+
+fn ke_query_ideal_processor(
+    ctx: &mut PpcContext,
+    _mem: &GuestMemory,
+    state: &mut KernelState,
+) {
+    let handle = resolve_pseudo_handle(state, ctx.gpr[3] as u32);
+    let ideal = state
+        .scheduler
+        .find_by_handle(handle)
+        .and_then(|r| state.scheduler.ideal_ref(r))
+        .unwrap_or(0);
+    ctx.gpr[3] = ideal as u64;
+}
+
+/// `NtSetInformationThread(handle, info_class, info_ptr, info_len)` —
+/// minimal Axis 5 wiring for priority / affinity / ideal-processor
+/// classes. Other classes return `STATUS_INVALID_INFO_CLASS`.
+///
+/// Not registered as an ordinal: Xbox 360's `xboxkrnl.exe` doesn't export
+/// this function — canary's table assigns `0xFB` to
+/// `NtSignalAndWaitForSingleObjectEx`. The body is retained only for the
+/// direct-call unit test below.
+#[allow(dead_code)]
+fn nt_set_information_thread(
+    ctx: &mut PpcContext,
+    mem: &GuestMemory,
+    state: &mut KernelState,
+) {
+    const STATUS_INVALID_INFO_CLASS: u64 = 0xC000_0003;
+    let handle = resolve_pseudo_handle(state, ctx.gpr[3] as u32);
+    let info_class = ctx.gpr[4] as u32;
+    let info_ptr = ctx.gpr[5] as u32;
+    let info_len = ctx.gpr[6] as u32;
+    let Some(r) = state.scheduler.find_by_handle(handle) else {
+        ctx.gpr[3] = STATUS_INVALID_HANDLE;
+        return;
+    };
+    match info_class {
+        2 /* ThreadPriority */ if info_len >= 4 => {
+            let pri = mem.read_u32(info_ptr) as i32;
+            state.scheduler.set_priority_ref(r, pri);
+            ctx.gpr[3] = STATUS_SUCCESS;
+        }
+        3 /* ThreadAffinityMask */ if info_len >= 4 => {
+            let mask = mem.read_u32(info_ptr) as u8;
+            state.set_affinity(handle, mask, mem);
+            ctx.gpr[3] = STATUS_SUCCESS;
+        }
+        13 /* ThreadIdealProcessor */ if info_len >= 4 => {
+            let ideal = mem.read_u32(info_ptr) as u8;
+            state.scheduler.set_ideal_ref(r, ideal);
+            ctx.gpr[3] = STATUS_SUCCESS;
+        }
+        _ => {
+            ctx.gpr[3] = STATUS_INVALID_INFO_CLASS;
+        }
+    }
+}
+
+/// `KeSetAffinityThread(thread_handle, new_mask) -> old_mask` — Axis 4.
+/// Drives `KernelState::set_affinity` which delegates to the scheduler
+/// and then fixes up every outstanding `ThreadRef` held in waiter lists.
+fn ke_set_affinity_thread(
+    ctx: &mut PpcContext,
+    mem: &GuestMemory,
+    state: &mut KernelState,
+) {
+    let handle = resolve_pseudo_handle(state, ctx.gpr[3] as u32);
+    let new_mask = (ctx.gpr[4] as u32) as u8;
+    let old = state.set_affinity(handle, new_mask, mem);
+    ctx.gpr[3] = old as u64;
+}
+
+fn ke_bug_check(ctx: &mut PpcContext, _mem: &GuestMemory, _state: &mut KernelState) {
     tracing::error!("KeBugCheck: code={:#x}", ctx.gpr[3]);
     ctx.gpr[3] = 0;
 }
 
-fn ke_bug_check_ex(ctx: &mut PpcContext, _mem: &mut GuestMemory, _state: &mut KernelState) {
+fn ke_bug_check_ex(ctx: &mut PpcContext, _mem: &GuestMemory, _state: &mut KernelState) {
     tracing::error!("KeBugCheckEx: code={:#x} p1={:#x} p2={:#x} p3={:#x}",
         ctx.gpr[3], ctx.gpr[4], ctx.gpr[5], ctx.gpr[6]);
     ctx.gpr[3] = 0;
 }
 
-fn ke_get_current_process_type(ctx: &mut PpcContext, _mem: &mut GuestMemory, _state: &mut KernelState) {
+fn ke_get_current_process_type(ctx: &mut PpcContext, _mem: &GuestMemory, _state: &mut KernelState) {
     ctx.gpr[3] = 1; // PROC_USER
 }
 
-fn ke_query_performance_frequency(ctx: &mut PpcContext, _mem: &mut GuestMemory, _state: &mut KernelState) {
+fn ke_query_performance_frequency(ctx: &mut PpcContext, _mem: &GuestMemory, _state: &mut KernelState) {
     ctx.gpr[3] = 50_000_000; // 50 MHz
 }
 
-fn ke_query_system_time(ctx: &mut PpcContext, mem: &mut GuestMemory, _state: &mut KernelState) {
+fn ke_query_system_time(ctx: &mut PpcContext, mem: &GuestMemory, _state: &mut KernelState) {
     let time_ptr = ctx.gpr[3] as u32;
     if time_ptr != 0 {
         let fake_time: u64 = 132_500_000_000_000_000; // ~2021 FILETIME
@@ -269,44 +495,60 @@ fn ke_query_system_time(ctx: &mut PpcContext, mem: &mut GuestMemory, _state: &mu
     }
 }
 
-fn ke_initialize_semaphore(ctx: &mut PpcContext, mem: &mut GuestMemory, _state: &mut KernelState) {
-    // r3 = semaphore_ptr, r4 = count, r5 = limit
+fn ke_initialize_semaphore(ctx: &mut PpcContext, mem: &GuestMemory, _state: &mut KernelState) {
+    // r3 = PKSEMAPHORE, r4 = initial count, r5 = limit.
+    // Mirrors xenia-canary KeInitializeSemaphore_entry
+    // (xboxkrnl_threading.cc:692). `ensure_dispatcher_object` (below)
+    // reads type@+0, signal_state@+4, and limit@+0x10 to mint the
+    // kernel-side shadow on first wait/release — so dropping the count
+    // and limit args (the prior zero-fill) silently produced
+    // `Semaphore { count: 0, max: 1 }` regardless of caller intent.
     let sem_ptr = ctx.gpr[3] as u32;
-    if sem_ptr != 0 {
-        // Zero-init the KSEMAPHORE structure (0x14 bytes)
-        for i in (0..0x14).step_by(4) {
-            mem.write_u32(sem_ptr + i, 0);
-        }
+    let count = ctx.gpr[4] as u32;
+    let limit = ctx.gpr[5] as u32;
+    if sem_ptr == 0 {
+        return;
     }
+    // DISPATCHER_HEADER: type=5 (Semaphore), absolute=0, size=5 u32s,
+    // inserted=0, signal_state=count, then 8-byte wait_list_head, then
+    // limit at +0x10.
+    mem.write_u8(sem_ptr, 5);
+    mem.write_u8(sem_ptr + 0x01, 0);
+    mem.write_u8(sem_ptr + 0x02, 5);
+    mem.write_u8(sem_ptr + 0x03, 0);
+    mem.write_u32(sem_ptr + 0x04, count);
+    mem.write_u32(sem_ptr + 0x08, 0);
+    mem.write_u32(sem_ptr + 0x0C, 0);
+    mem.write_u32(sem_ptr + 0x10, limit);
 }
 
-fn ke_try_acquire_spinlock(ctx: &mut PpcContext, _mem: &mut GuestMemory, _state: &mut KernelState) {
+fn ke_try_acquire_spinlock(ctx: &mut PpcContext, _mem: &GuestMemory, _state: &mut KernelState) {
     ctx.gpr[3] = 1; // TRUE (acquired successfully in single-threaded mode)
 }
 
-fn ke_tls_alloc(ctx: &mut PpcContext, _mem: &mut GuestMemory, state: &mut KernelState) {
+fn ke_tls_alloc(ctx: &mut PpcContext, _mem: &GuestMemory, state: &mut KernelState) {
     ctx.gpr[3] = state.tls_alloc() as u64;
 }
 
-fn ke_tls_get_value(ctx: &mut PpcContext, _mem: &mut GuestMemory, state: &mut KernelState) {
+fn ke_tls_get_value(ctx: &mut PpcContext, _mem: &GuestMemory, state: &mut KernelState) {
     let index = ctx.gpr[3] as u32;
     ctx.gpr[3] = state.tls_get(index);
 }
 
-fn ke_tls_set_value(ctx: &mut PpcContext, _mem: &mut GuestMemory, state: &mut KernelState) {
+fn ke_tls_set_value(ctx: &mut PpcContext, _mem: &GuestMemory, state: &mut KernelState) {
     let index = ctx.gpr[3] as u32;
     let value = ctx.gpr[4];
     state.tls_set(index, value);
     ctx.gpr[3] = 1; // TRUE
 }
 
-fn ex_get_xconfig_setting(ctx: &mut PpcContext, _mem: &mut GuestMemory, _state: &mut KernelState) {
+fn ex_get_xconfig_setting(ctx: &mut PpcContext, _mem: &GuestMemory, _state: &mut KernelState) {
     ctx.gpr[3] = 0; // STATUS_SUCCESS (writes nothing)
 }
 
 // ===== Memory =====
 
-fn nt_allocate_virtual_memory(ctx: &mut PpcContext, mem: &mut GuestMemory, state: &mut KernelState) {
+fn nt_allocate_virtual_memory(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
     // r3 = base_addr_ptr (in/out), r4 = region_size_ptr (in/out)
     // r5 = alloc_type, r6 = protect
     let base_ptr = ctx.gpr[3] as u32;
@@ -349,38 +591,72 @@ fn nt_allocate_virtual_memory(ctx: &mut PpcContext, mem: &mut GuestMemory, state
     ctx.gpr[3] = 0; // STATUS_SUCCESS
 }
 
-fn mm_allocate_physical_memory_ex(ctx: &mut PpcContext, mem: &mut GuestMemory, state: &mut KernelState) {
-    // r3 = size, r4 = protect, r5 = min_addr, r6 = max_addr, r7 = alignment
-    let size = ctx.gpr[3] as u32;
+fn mm_allocate_physical_memory_ex(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
+    // Matches xenia-canary `MmAllocatePhysicalMemoryEx_entry` — see
+    // `xenia-canary/src/xenia/kernel/xboxkrnl/xboxkrnl_memory.cc:489-494`.
+    //   r3 = flags, r4 = region_size, r5 = protect_bits,
+    //   r6 = min_addr_range, r7 = max_addr_range, r8 = alignment
+    // Return value is the guest address; 0 indicates failure (Xbox ABI).
+    let flags = ctx.gpr[3] as u32;
+    let size = ctx.gpr[4] as u32;
+    if size == 0 {
+        tracing::warn!(flags, "MmAllocatePhysicalMemoryEx: zero-size request → returning 0");
+        ctx.gpr[3] = 0;
+        return;
+    }
     match state.heap_alloc(size, mem) {
-        Some(addr) => ctx.gpr[3] = addr as u64,
-        None => ctx.gpr[3] = 0,
+        Some(addr) => {
+            tracing::debug!(
+                flags,
+                size = format_args!("{size:#x}"),
+                addr = format_args!("{addr:#010x}"),
+                "MmAllocatePhysicalMemoryEx"
+            );
+            ctx.gpr[3] = addr as u64;
+        }
+        None => {
+            tracing::warn!(
+                flags,
+                size = format_args!("{size:#x}"),
+                "MmAllocatePhysicalMemoryEx: heap exhausted"
+            );
+            ctx.gpr[3] = 0;
+        }
     }
 }
 
-fn mm_create_kernel_stack(ctx: &mut PpcContext, mem: &mut GuestMemory, state: &mut KernelState) {
-    // r3 = stack_size, r4 = reserved
-    let size = std::cmp::max(ctx.gpr[3] as u32, 0x4000); // Min 16KB
+fn mm_create_kernel_stack(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
+    // xenia-canary `MmCreateKernelStack_entry(stack_size, r4)`; returns stack top.
+    // `xboxkrnl_memory.cc` — see DECLARE_XBOXKRNL_EXPORT on MmCreateKernelStack.
+    let requested = ctx.gpr[3] as u32; // r3 = stack_size; r4 is never read here
+    let size = std::cmp::max(requested, 0x4000); // Min 16KB per canary
     match state.stack_alloc(size, mem) {
         Some(top) => {
-            tracing::info!("MmCreateKernelStack: top={:#010x} size={:#x}", top, size);
+            tracing::info!(
+                top = format_args!("{top:#010x}"),
+                size = format_args!("{size:#x}"),
+                "MmCreateKernelStack"
+            );
             ctx.gpr[3] = top as u64;
         }
-        None => ctx.gpr[3] = 0,
+        None => {
+            tracing::warn!(size = format_args!("{size:#x}"), "MmCreateKernelStack: stack heap exhausted");
+            ctx.gpr[3] = 0;
+        }
     }
 }
 
-fn mm_get_physical_address(ctx: &mut PpcContext, _mem: &mut GuestMemory, _state: &mut KernelState) {
+fn mm_get_physical_address(ctx: &mut PpcContext, _mem: &GuestMemory, _state: &mut KernelState) {
     // r3 = virtual address -> return physical address
-    ctx.gpr[3] = ctx.gpr[3] & 0x1FFF_FFFF; // Mask to 512MB physical
+    ctx.gpr[3] &= 0x1FFF_FFFF; // Mask to 512MB physical
 }
 
-fn mm_query_address_protect(ctx: &mut PpcContext, _mem: &mut GuestMemory, _state: &mut KernelState) {
+fn mm_query_address_protect(ctx: &mut PpcContext, _mem: &GuestMemory, _state: &mut KernelState) {
     // Return PAGE_READWRITE (0x04)
     ctx.gpr[3] = 0x04;
 }
 
-fn mm_query_statistics(ctx: &mut PpcContext, mem: &mut GuestMemory, _state: &mut KernelState) {
+fn mm_query_statistics(ctx: &mut PpcContext, mem: &GuestMemory, _state: &mut KernelState) {
     // r3 = stats_ptr — write fake memory statistics
     let ptr = ctx.gpr[3] as u32;
     if ptr != 0 {
@@ -393,125 +669,1097 @@ fn mm_query_statistics(ctx: &mut PpcContext, mem: &mut GuestMemory, _state: &mut
 
 // ===== File I/O =====
 
-fn nt_create_file(ctx: &mut PpcContext, _mem: &mut GuestMemory, state: &mut KernelState) {
-    let handle = state.alloc_handle_for(KernelObject::File { path: String::new() });
-    tracing::info!("NtCreateFile: handle={:#x}", handle);
-    ctx.gpr[3] = 0;
+/// NT error codes the file handlers need. Keeping them inline avoids pulling
+/// in a whole NTSTATUS module for a single file.
+const STATUS_SUCCESS: u64 = 0x0000_0000;
+const STATUS_END_OF_FILE: u64 = 0xC000_0011;
+const STATUS_INVALID_HANDLE: u64 = 0xC000_0008;
+const STATUS_OBJECT_NAME_NOT_FOUND: u64 = 0xC000_0034;
+const STATUS_NO_MORE_FILES: u64 = 0x8000_0006;
+const STATUS_SEMAPHORE_LIMIT_EXCEEDED: u64 = 0xC000_0047;
+const STATUS_UNSUCCESSFUL: u64 = 0xC000_0001;
+const STATUS_INVALID_INFO_CLASS: u64 = 0xC000_0003;
+const STATUS_INFO_LENGTH_MISMATCH: u64 = 0xC000_0004;
+/// `X_ERROR_NOT_FOUND` from xenia-canary `xenia/xbox.h` (Win32 `ERROR_NOT_FOUND`,
+/// 1168 = 0x490). Returned by `XexGetModuleHandle` for unknown module names.
+const X_ERROR_NOT_FOUND: u64 = 0x0000_0490;
+
+/// A sentinel byte-offset value meaning "read at current file position".
+const FILE_USE_FILE_POINTER_POSITION: u64 = 0xFFFF_FFFF_FFFF_FFFE;
+
+/// Write an `IO_STATUS_BLOCK { status, information }` if the pointer is non-null.
+fn write_io_status_block(mem: &GuestMemory, ptr: u32, status: u32, information: u32) {
+    if ptr == 0 {
+        return;
+    }
+    mem.write_u32(ptr, status);
+    mem.write_u32(ptr + 4, information);
 }
 
-fn nt_open_file(ctx: &mut PpcContext, _mem: &mut GuestMemory, state: &mut KernelState) {
-    let handle = state.alloc_handle_for(KernelObject::File { path: String::new() });
-    tracing::info!("NtOpenFile: handle={:#x}", handle);
-    ctx.gpr[3] = 0;
+/// Open a VFS-backed file. Shared between NtCreateFile and NtOpenFile — the
+/// create/open distinction only matters for writable volumes, which the disc
+/// image isn't.
+fn open_vfs_file(
+    mem: &GuestMemory,
+    state: &mut KernelState,
+    handle_out: u32,
+    io_status_block: u32,
+    obj_attrs_ptr: u32,
+) -> u64 {
+    // Accept the empty-after-prefix case (e.g. `NtCreateFile("game:\")`) as
+    // a valid "open the partition/device root" request — Canary's
+    // `NtCreateFile_entry` in xboxkrnl_io.cc:39 lets empty paths through
+    // to the VFS, which resolves them as a directory handle on the root.
+    // Sylpheed opens `game:\` near the end of its boot as a disc-validation
+    // probe; returning `STATUS_OBJECT_NAME_NOT_FOUND` makes the async worker
+    // see a null handle later and trigger `XamShowDirtyDiscErrorUI`.
+    let path = crate::path::object_attributes_to_vfs_path(mem, obj_attrs_ptr)
+        .unwrap_or_default();
+    if path.is_empty() && obj_attrs_ptr == 0 {
+        if handle_out != 0 {
+            mem.write_u32(handle_out, 0);
+        }
+        write_io_status_block(mem, io_status_block, STATUS_OBJECT_NAME_NOT_FOUND as u32, 0);
+        return STATUS_OBJECT_NAME_NOT_FOUND;
+    }
+    if path.is_empty() {
+        // Empty path after prefix strip is the "open the device/partition
+        // root" case (e.g. `NtCreateFile("game:\")`). Canary's
+        // `NtCreateFile_entry` resolves these through the VFS and returns
+        // a directory handle. We don't model directory entries, so synth
+        // a zero-byte "file" whose `path` is empty; `nt_query_information_file`
+        // then reports `Directory=1` / `FILE_ATTRIBUTE_DIRECTORY` based on
+        // the path shape, which is how Sylpheed's disc-validation probe
+        // decides it found a directory and proceeds.
+        let handle = state.alloc_handle_for(KernelObject::File {
+            path: String::new(),
+            size: 0,
+            position: 0,
+            data: std::sync::Arc::new(Vec::new()),
+            dir_enum_pos: None,
+        });
+        if handle_out != 0 {
+            mem.write_u32(handle_out, handle);
+        }
+        write_io_status_block(mem, io_status_block, STATUS_SUCCESS as u32, 0);
+        return STATUS_SUCCESS;
+    }
+
+    let vfs = match state.vfs.as_ref() {
+        Some(v) => v,
+        None => {
+            tracing::warn!("NtCreateFile/NtOpenFile for {:?}: no VFS mounted", path);
+            if handle_out != 0 {
+                mem.write_u32(handle_out, 0);
+            }
+            write_io_status_block(mem, io_status_block, STATUS_OBJECT_NAME_NOT_FOUND as u32, 0);
+            return STATUS_OBJECT_NAME_NOT_FOUND;
+        }
+    };
+
+    match vfs.read_file(&path) {
+        Ok(bytes) => {
+            let size = bytes.len() as u64;
+            let handle = state.alloc_handle_for(KernelObject::File {
+                path: path.clone(),
+                size,
+                position: 0,
+                data: std::sync::Arc::new(bytes),
+                dir_enum_pos: None,
+            });
+            if handle_out != 0 {
+                mem.write_u32(handle_out, handle);
+            }
+            write_io_status_block(mem, io_status_block, STATUS_SUCCESS as u32, 0);
+            tracing::info!("File opened: path={:?} size={} handle={:#x}", path, size, handle);
+            STATUS_SUCCESS
+        }
+        Err(e) => {
+            // When the VFS can't resolve a path we synthesize a zero-byte
+            // virtual file rather than returning NOT_FOUND. Two rationales:
+            //
+            //   1. **Writable system partitions** (`cache:/`, `cache0:`,
+            //      `cache1:`, `partition0:`, `partition1:`) aren't backed by
+            //      the disc — Canary mounts them on host directories
+            //      ([xenia_main.cc:612-651](xenia-canary/src/xenia/app/xenia_main.cc)).
+            //      We skip the host mount for now, so opens there always miss
+            //      without this fallback.
+            //
+            //   2. **Disc files that didn't make it into the ISO rip** (e.g.,
+            //      Sylpheed's `dat/files.tbl`, which the retail disc shipped
+            //      but our dump doesn't contain). Returning NOT_FOUND makes
+            //      Sylpheed's boot validator call `XamShowDirtyDiscErrorUI`
+            //      → dashboard exit; see Canary's `XamShowDirtyDiscErrorUI`
+            //      at xam_ui.cc:562 for the "bad or unimplemented file IO
+            //      calls" framing.
+            //
+            // A zero-byte file lets the game's existence probe succeed, its
+            // read return EOF, and its "is the content here" sanity checks
+            // pass. If the game actually needs the bytes for gameplay we'll
+            // see a fresh failure downstream and can decide what to stub next.
+            let handle = state.alloc_handle_for(KernelObject::File {
+                path: path.clone(),
+                size: 0,
+                position: 0,
+                data: std::sync::Arc::new(Vec::new()),
+                dir_enum_pos: None,
+            });
+            if handle_out != 0 {
+                mem.write_u32(handle_out, handle);
+            }
+            write_io_status_block(mem, io_status_block, STATUS_SUCCESS as u32, 0);
+            tracing::info!(
+                "Synthesized empty file for missing path: path={:?} err={} handle={:#x}",
+                path,
+                e,
+                handle
+            );
+            STATUS_SUCCESS
+        }
+    }
 }
 
-fn nt_read_file(ctx: &mut PpcContext, _mem: &mut GuestMemory, _state: &mut KernelState) {
-    ctx.gpr[3] = 0xC000_0011; // STATUS_END_OF_FILE
+fn nt_create_file(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
+    // r3 = handle_out, r4 = desired_access, r5 = obj_attrs, r6 = io_status_block,
+    // r7 = allocation_size, r8 = file_attributes, r9 = share_access, r10 = create_disposition
+    let handle_out = ctx.gpr[3] as u32;
+    let obj_attrs_ptr = ctx.gpr[5] as u32;
+    let io_status_block = ctx.gpr[6] as u32;
+    ctx.gpr[3] = open_vfs_file(mem, state, handle_out, io_status_block, obj_attrs_ptr);
 }
 
-fn nt_write_file(ctx: &mut PpcContext, _mem: &mut GuestMemory, _state: &mut KernelState) {
-    ctx.gpr[3] = 0; // STATUS_SUCCESS (discard data)
+fn nt_open_file(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
+    // r3 = handle_out, r4 = desired_access, r5 = obj_attrs,
+    // r6 = io_status_block, r7 = share_access, r8 = open_options
+    let handle_out = ctx.gpr[3] as u32;
+    let obj_attrs_ptr = ctx.gpr[5] as u32;
+    let io_status_block = ctx.gpr[6] as u32;
+    ctx.gpr[3] = open_vfs_file(mem, state, handle_out, io_status_block, obj_attrs_ptr);
 }
 
-fn nt_query_full_attributes_file(ctx: &mut PpcContext, _mem: &mut GuestMemory, _state: &mut KernelState) {
-    ctx.gpr[3] = 0xC000_0034; // STATUS_OBJECT_NAME_NOT_FOUND
+/// Signal an NT-style completion event on synchronous I/O completion.
+///
+/// `NtReadFile` / `NtWriteFile` take an event handle at r4. The NT contract
+/// is: on a real async driver, the event pulses when the I/O finishes.
+/// Games that use the common "issue I/O then wait on the event" idiom will
+/// deadlock if we return `STATUS_SUCCESS` without signaling — observed on
+/// Sylpheed with four stuck threads parked on `WaitAny { handles: [evt] }`
+/// that nothing else could wake. We finish I/O synchronously so we signal
+/// immediately on *every* completion path (success, EOF, invalid-handle).
+/// No-op when the caller passes a null handle (synchronous-wait style).
+fn signal_io_completion_event(state: &mut KernelState, event_handle: u32) {
+    if event_handle == 0 {
+        return;
+    }
+    let prev = if let Some(KernelObject::Event { signaled, .. }) = state.objects.get_mut(&event_handle) {
+        let was = *signaled;
+        *signaled = true;
+        was as u64 // previous signaled state (0 or 1), passed to the audit record below
+    } else {
+        0 // handle is unknown or not an `Event`: nothing gets marked signaled
+    };
+    state.audit_signal(event_handle, 0, "signal_io_completion_event", prev); // NOTE(review): recorded even when the handle was not an Event
+    wake_eligible_waiters(state, event_handle); // likewise runs for every non-zero handle
 }
 
-fn nt_query_directory_file(ctx: &mut PpcContext, _mem: &mut GuestMemory, _state: &mut KernelState) {
-    ctx.gpr[3] = 0xC000_0034; // STATUS_OBJECT_NAME_NOT_FOUND
-}
-
-fn nt_close(ctx: &mut PpcContext, _mem: &mut GuestMemory, state: &mut KernelState) {
+fn nt_read_file(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
+    // r3 = handle, r4 = event, r5 = apc_routine, r6 = apc_ctx,
+    // r7 = io_status_block, r8 = buffer, r9 = length, r10 = byte_offset_ptr
     let handle = ctx.gpr[3] as u32;
-    state.objects.remove(&handle);
+    let event_handle = ctx.gpr[4] as u32;
+    let io_status_block = ctx.gpr[7] as u32;
+    let buffer = ctx.gpr[8] as u32;
+    let length = ctx.gpr[9] as u32;
+    let byte_offset_ptr = ctx.gpr[10] as u32;
+
+    let Some(KernelObject::File { path, size, position, data, .. }) = state.objects.get_mut(&handle) else {
+        tracing::warn!("NtReadFile: invalid handle {:#x}", handle);
+        ctx.gpr[3] = STATUS_INVALID_HANDLE;
+        write_io_status_block(mem, io_status_block, STATUS_INVALID_HANDLE as u32, 0);
+        signal_io_completion_event(state, event_handle);
+        return;
+    };
+
+    // If the caller supplied an explicit byte offset (not 0xFFFFFFFFFFFFFFFE)
+    // seek to it; otherwise continue from the stored cursor.
+    let start_pos = if byte_offset_ptr != 0 {
+        let offset = mem.read_u64(byte_offset_ptr);
+        if offset != FILE_USE_FILE_POINTER_POSITION && offset != u64::MAX {
+            *position = offset;
+        }
+        *position
+    } else {
+        *position
+    };
+
+    let total = *size;
+    if start_pos >= total {
+        write_io_status_block(mem, io_status_block, STATUS_END_OF_FILE as u32, 0);
+        ctx.gpr[3] = STATUS_END_OF_FILE;
+        signal_io_completion_event(state, event_handle);
+        return;
+    }
+
+    let avail = (total - start_pos).min(length as u64) as usize;
+    if avail == 0 {
+        write_io_status_block(mem, io_status_block, STATUS_END_OF_FILE as u32, 0);
+        ctx.gpr[3] = STATUS_END_OF_FILE;
+        signal_io_completion_event(state, event_handle);
+        return;
+    }
+
+    let start = start_pos as usize;
+    let end = start + avail;
+    let slice = &data[start..end];
+    mem.write_bulk(buffer, slice);
+    *position = start_pos + avail as u64;
+
+    tracing::info!(
+        "NtReadFile: {} bytes from {:?} @ {}  (handle={:#x})",
+        avail, path, start_pos, handle,
+    );
+    write_io_status_block(mem, io_status_block, STATUS_SUCCESS as u32, avail as u32);
+    ctx.gpr[3] = STATUS_SUCCESS;
+    signal_io_completion_event(state, event_handle);
+}
+
+fn nt_write_file(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
+    // We don't back anything writable, so discard. Still report the full
+    // length as written via IO_STATUS_BLOCK so the caller doesn't retry.
+    let event_handle = ctx.gpr[4] as u32; // r4 = completion event handle (0 = none)
+    let io_status_block = ctx.gpr[7] as u32; // r7 = IO_STATUS_BLOCK pointer (may be null)
+    let length = ctx.gpr[9] as u32; // r9 = requested byte count, echoed back as "written"
+    write_io_status_block(mem, io_status_block, STATUS_SUCCESS as u32, length);
+    ctx.gpr[3] = STATUS_SUCCESS; // NOTE(review): r3 is never looked up, so an invalid file handle also succeeds
+    signal_io_completion_event(state, event_handle);
+}
+
+/// Minimal `NtQueryInformationFile`. Handles `FileStandardInformation` (5),
+/// `FilePositionInformation` (14) and `FileNetworkOpenInformation` (34); any
+/// other class (or a too-short buffer) zero-fills `length` bytes and succeeds.
+fn nt_query_information_file(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
+    // r3 = handle, r4 = io_status_block, r5 = file_info, r6 = length, r7 = class
+    let handle = ctx.gpr[3] as u32;
+    let io_status_block = ctx.gpr[4] as u32;
+    let file_info = ctx.gpr[5] as u32;
+    let length = ctx.gpr[6] as u32;
+    let class = ctx.gpr[7] as u32;
+
+    let Some(KernelObject::File { size, position, path, .. }) = state.objects.get(&handle) else {
+        ctx.gpr[3] = STATUS_INVALID_HANDLE;
+        write_io_status_block(mem, io_status_block, STATUS_INVALID_HANDLE as u32, 0);
+        return;
+    };
+
+    // Root-of-device opens (`game:\`, `cache:\`, `partition0`) strip to
+    // an empty string post-prefix — see `open_vfs_file`'s synth path.
+    // Games query these as directories (DirectoryObject probe), and
+    // reporting `Directory=0` makes Sylpheed treat the open as "found a
+    // non-directory where I expected a directory" and call
+    // `XamShowDirtyDiscErrorUI`. Canary's `NtQueryInformationFile` pulls
+    // the real file-system entry's kind; we key on path shape since we
+    // don't model directory entries.
+    let is_directory = path.is_empty()
+        || path.ends_with('/')
+        || path.ends_with(':');
+    let size = *size; // copy out of the borrowed object so `state` is free again
+    let position = *position;
+
+    // `FILE_ATTRIBUTE_DIRECTORY` (NT / Xbox) — advertised in
+    // `FileNetworkOpenInformation.FileAttributes`; Sylpheed's async-I/O
+    // worker queries with class=34 and the calling code checks this bit
+    // to decide whether the open resolved to a directory before
+    // continuing down the non-error path.
+    const FILE_ATTRIBUTE_DIRECTORY: u32 = 0x10;
+    const FILE_ATTRIBUTE_NORMAL: u32 = 0x80;
+    let written: u32 = match class {
+        // FileStandardInformation: AllocationSize(i64), EndOfFile(i64), NumberOfLinks(u32), DeletePending(u8), Directory(u8), pad(u16)
+        5 if length >= 24 => {
+            mem.write_u64(file_info, size);
+            mem.write_u64(file_info + 8, size);
+            mem.write_u32(file_info + 16, 1);
+            mem.write_u8(file_info + 20, 0);
+            mem.write_u8(file_info + 21, if is_directory { 1 } else { 0 });
+            mem.write_u16(file_info + 22, 0);
+            24
+        }
+        // FilePositionInformation: CurrentByteOffset(i64)
+        14 if length >= 8 => {
+            mem.write_u64(file_info, position);
+            8
+        }
+        // FileNetworkOpenInformation: timestamps(4x i64) @ 0..32,
+        // AllocationSize(i64) @ 32, EndOfFile(i64) @ 40, FileAttributes(u32) @ 48
+        // Sylpheed's async-validation worker asks for this (`length=56`)
+        // and the caller checks `FileAttributes & FILE_ATTRIBUTE_DIRECTORY`
+        // right after. Without populating the attributes the bit is
+        // clear, the caller decides the open "found a non-directory
+        // where a directory was expected", and the outer routine calls
+        // `XamShowDirtyDiscErrorUI` → `XamLoaderLaunchTitle` → garbage.
+        34 if length >= 56 => {
+            // Zero timestamps (we don't track real times).
+            for off in (0..32).step_by(8) {
+                mem.write_u64(file_info + off, 0);
+            }
+            mem.write_u64(file_info + 32, size);
+            mem.write_u64(file_info + 40, size);
+            let attrs = if is_directory {
+                FILE_ATTRIBUTE_DIRECTORY
+            } else {
+                FILE_ATTRIBUTE_NORMAL
+            };
+            mem.write_u32(file_info + 48, attrs);
+            mem.write_u32(file_info + 52, 0); // pad
+            56
+        }
+        _ => {
+            // Unknown class, or a known class with a too-short buffer: zero `length` bytes.
+            for i in 0..length {
+                mem.write_u8(file_info + i, 0);
+            }
+            length
+        }
+    };
+
+    write_io_status_block(mem, io_status_block, STATUS_SUCCESS as u32, written); // information = bytes written
+    ctx.gpr[3] = STATUS_SUCCESS;
+}
+
+/// `NtSetInformationFile(FileHandle, IoStatusBlock*, FileInformation,
+/// Length, FileInformationClass)`. Mirrors Canary
+/// [xboxkrnl_io_info.cc:180-304](xenia-canary/src/xenia/kernel/xboxkrnl/xboxkrnl_io_info.cc).
+///
+/// Validates `info_class` (must have a defined minimum size) and
+/// `info_length` (must meet that minimum); returns
+/// `STATUS_INVALID_INFO_CLASS` / `STATUS_INFO_LENGTH_MISMATCH` in those
+/// cases. The only class with real side-effects in xenia-rs is
+/// `XFilePositionInformation` (14) — seek updates the file's cursor.
+/// Read-only VFS means `XFileEndOfFileInformation` (20, truncate) can
+/// only succeed if the new length equals the current size, otherwise
+/// returns `STATUS_UNSUCCESSFUL`. Other classes acknowledge the write
+/// but have no backing store.
+fn nt_set_information_file(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
+    // r3 = handle, r4 = io_status_block, r5 = info_ptr,
+    // r6 = info_length, r7 = info_class.
+    let handle = ctx.gpr[3] as u32;
+    let iosb_ptr = ctx.gpr[4] as u32;
+    let info_ptr = ctx.gpr[5] as u32;
+    let info_length = ctx.gpr[6] as u32;
+    let info_class = ctx.gpr[7] as u32;
+
+    // Matches Canary's `GetSetFileInfoMinimumLength`. A return of 0 means
+    // "class we don't recognise for SetInfo" → STATUS_INVALID_INFO_CLASS.
+    let min_length = match info_class {
+        4 => 40,  // XFileBasicInformation (times + attributes)
+        10 => 16, // XFileRenameInformation
+        13 => 4,  // XFileDispositionInformation (delete_file u32)
+        14 => 8,  // XFilePositionInformation (i64 current offset)
+        16 | 31 => 4, // XFileModeInformation / XFileIoPriorityInformation
+        19 | 20 | 23 => 8, // XFileAllocationInformation / EndOfFileInformation / MountPartitionInformation
+        11 => 16, // XFileLinkInformation
+        24 => 152, // XFileMountPartitionsInformation
+        30 => 8,  // XFileCompletionInformation (handle + key, 2 dwords)
+        _ => 0,
+    };
+    if min_length == 0 {
+        ctx.gpr[3] = STATUS_INVALID_INFO_CLASS;
+        return;
+    }
+    if info_length < min_length {
+        ctx.gpr[3] = STATUS_INFO_LENGTH_MISMATCH;
+        return;
+    }
+
+    // Handle lookup.
+    let Some(KernelObject::File { size, position, .. }) = state.objects.get_mut(&handle) else {
+        ctx.gpr[3] = STATUS_INVALID_HANDLE;
+        return;
+    };
+
+    let (status, out_length): (u64, u32) = match info_class {
+        // XFilePositionInformation (14): i64 new byte offset.
+        14 => {
+            let new_offset = mem.read_u64(info_ptr);
+            // Canary clamps nothing — it assigns directly. Game is
+            // responsible for staying within the file; reads past EOF
+            // return STATUS_END_OF_FILE from NtReadFile.
+            *position = new_offset;
+            (STATUS_SUCCESS, 8)
+        }
+        // XFileEndOfFileInformation (20): i64 new length. Read-only VFS
+        // → only a no-op truncate-to-same-size succeeds.
+        20 => {
+            let new_eof = mem.read_u64(info_ptr);
+            if new_eof == *size {
+                (STATUS_SUCCESS, 8)
+            } else {
+                (STATUS_UNSUCCESSFUL, 8)
+            }
+        }
+        // XFileAllocationInformation (19): pre-allocation hint. Canary
+        // explicitly `XELOGW`s and reports out_length=8; we do the same.
+        19 => (STATUS_SUCCESS, 8),
+        // XFileBasicInformation (4): times + attributes. Read-only VFS
+        // can't persist these, but acknowledge the write to match Canary's
+        // behaviour on a read-only entry.
+        4 => (STATUS_SUCCESS, 40),
+        // XFileDispositionInformation (13): delete-on-close. Read-only VFS
+        // → log the bit and succeed; the file is never actually removed.
+        13 => {
+            let delete_flag = mem.read_u32(info_ptr) != 0;
+            tracing::debug!(
+                handle = format_args!("{handle:#x}"),
+                delete = delete_flag,
+                "NtSetInformationFile: disposition (read-only VFS, no-op)"
+            );
+            (STATUS_SUCCESS, 0)
+        }
+        // Other recognised classes: accept and report back the minimum
+        // length so callers don't bail on zero-information.
+        _ => (STATUS_SUCCESS, min_length),
+    };
+
+    if iosb_ptr != 0 {
+        write_io_status_block(mem, iosb_ptr, status as u32, out_length);
+    }
+    ctx.gpr[3] = status;
+}
+
+fn nt_query_full_attributes_file(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
+    // r3 = obj_attrs, r4 = network_open_info
+    let obj_attrs_ptr = ctx.gpr[3] as u32;
+    let out = ctx.gpr[4] as u32;
+
+    let path = match crate::path::object_attributes_to_vfs_path(mem, obj_attrs_ptr) {
+        Some(p) if !p.is_empty() => p,
+        _ => { // NOTE(review): an empty path is rejected here, whereas `open_vfs_file` accepts it as a device root
+            ctx.gpr[3] = STATUS_OBJECT_NAME_NOT_FOUND;
+            return;
+        }
+    };
+
+    let Some(vfs) = state.vfs.as_ref() else { // no VFS mounted: every lookup misses
+        ctx.gpr[3] = STATUS_OBJECT_NAME_NOT_FOUND;
+        return;
+    };
+
+    match vfs.stat(&path) {
+        Ok(entry) => {
+            // FILE_NETWORK_OPEN_INFORMATION (56 bytes): 4 × FILETIME,
+            // AllocationSize(i64), EndOfFile(i64), FileAttributes(u32), pad(u32)
+            let filetime: u64 = 132_500_000_000_000_000; // fixed fake timestamp shared by all four FILETIME slots
+            if out != 0 { // a null output pointer skips the writes but still reports success
+                mem.write_u32(out, (filetime >> 32) as u32); // each FILETIME: high word first, then low word
+                mem.write_u32(out + 4, filetime as u32);
+                mem.write_u32(out + 8, (filetime >> 32) as u32);
+                mem.write_u32(out + 12, filetime as u32);
+                mem.write_u32(out + 16, (filetime >> 32) as u32);
+                mem.write_u32(out + 20, filetime as u32);
+                mem.write_u32(out + 24, (filetime >> 32) as u32);
+                mem.write_u32(out + 28, filetime as u32);
+                mem.write_u64(out + 32, entry.size); // AllocationSize
+                mem.write_u64(out + 40, entry.size); // EndOfFile
+                let attrs: u32 = if entry.is_directory { 0x10 } else { 0x80 }; // 0x10 for directories, 0x80 otherwise
+                mem.write_u32(out + 48, attrs); // FileAttributes
+                mem.write_u32(out + 52, 0); // pad
+            }
+            ctx.gpr[3] = STATUS_SUCCESS;
+        }
+        Err(_) => {
+            ctx.gpr[3] = STATUS_OBJECT_NAME_NOT_FOUND; // every stat error is collapsed into "not found"
+        }
+    }
+}
+
+fn nt_query_volume_information_file(ctx: &mut PpcContext, mem: &GuestMemory, _state: &mut KernelState) {
+    // r3 = handle (never read or validated), r4 = io_status_block, r5 = info, r6 = length, r7 = class
+    let io_status_block = ctx.gpr[4] as u32;
+    let info = ctx.gpr[5] as u32;
+    let length = ctx.gpr[6] as u32;
+    let class = ctx.gpr[7] as u32;
+
+    // FileFsSizeInformation (class 3): 24 bytes
+    //   TotalAllocationUnits(i64), AvailableAllocationUnits(i64),
+    //   SectorsPerAllocationUnit(u32), BytesPerSector(u32)
+    let written: u32 = match class {
+        3 if length >= 24 => {
+            mem.write_u64(info, 0x10_0000); // ~2GB at 2KB sectors
+            mem.write_u64(info + 8, 0); // AvailableAllocationUnits: no free space reported
+            mem.write_u32(info + 16, 1); // SectorsPerAllocationUnit
+            mem.write_u32(info + 20, 2048); // BytesPerSector
+            24
+        }
+        _ => { // any other class, or class 3 with length < 24: zero-fill and still succeed
+            for i in 0..length {
+                mem.write_u8(info + i, 0);
+            }
+            length
+        }
+    };
+
+    write_io_status_block(mem, io_status_block, STATUS_SUCCESS as u32, written); // information = bytes written
+    ctx.gpr[3] = STATUS_SUCCESS;
+}
+
+/// Enumerate the immediate children of a directory handle, writing
+/// `X_FILE_DIRECTORY_INFORMATION` entries into the caller's buffer.
+/// Mirrors Canary [xboxkrnl_io.cc:516-557](xenia-canary/src/xenia/kernel/xboxkrnl/xboxkrnl_io.cc)
+/// and the entry layout in
+/// [xfile.h:35-73](xenia-canary/src/xenia/kernel/xfile.h).
+///
+/// Pagination: each call consumes `dir_enum_pos` on the File handle.
+/// `None` = fresh handle → start at index 0; `Some(N)` = resume from
+/// N-th matching entry. On exhaustion the cursor stays past the end
+/// and subsequent calls return `STATUS_NO_MORE_FILES`. The `restart_scan`
+/// flag (9th arg, on the stack) is not yet threaded through; callers
+/// that want to rescan must close and re-open the directory handle.
+///
+/// Status written to r3: `STATUS_INFO_LENGTH_MISMATCH` when `length < 72`
+/// (the IOSB is left untouched on this path), `STATUS_INVALID_HANDLE` when
+/// the handle is not a `KernelObject::File`, `STATUS_NO_MORE_FILES` when
+/// no entry was emitted, otherwise `STATUS_SUCCESS` with the number of
+/// buffer bytes used stored in the IOSB. The IOSB is only written when
+/// `iosb_ptr` is non-zero, and `signal_io_completion_event` is called with
+/// `event_handle` on every exit path.
+fn nt_query_directory_file(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
+    // r3=file_handle, r4=event_handle, r5=apc_routine, r6=apc_context,
+    // r7=io_status_block, r8=file_info_ptr, r9=length, r10=file_name,
+    // sp+... = restart_scan.
+    // Only r3, r4, r7, r8 and r9 are read below; the APC arguments and the
+    // `file_name` filter (r10) are not consulted.
+    let handle = ctx.gpr[3] as u32;
+    let event_handle = ctx.gpr[4] as u32;
+    let iosb_ptr = ctx.gpr[7] as u32;
+    let info_ptr = ctx.gpr[8] as u32;
+    let length = ctx.gpr[9] as u32;
+
+    // Canary requires at least one fixed prefix + some filename room.
+    const ENTRY_FIXED_SIZE: u32 = 0x40; // bytes 0..64 fixed fields
+    const CANARY_MIN_LENGTH: u32 = 72; // xboxkrnl_io.cc:521
+    const FILE_ATTRIBUTE_DIRECTORY: u32 = 0x10;
+    const FILE_ATTRIBUTE_NORMAL: u32 = 0x80;
+    if length < CANARY_MIN_LENGTH {
+        ctx.gpr[3] = STATUS_INFO_LENGTH_MISMATCH;
+        signal_io_completion_event(state, event_handle);
+        return;
+    }
+
+    // Look up the handle and snapshot the directory prefix.
+    let dir_path = match state.objects.get(&handle) {
+        Some(KernelObject::File { path, .. }) => path.clone(),
+        _ => {
+            if iosb_ptr != 0 {
+                write_io_status_block(mem, iosb_ptr, STATUS_INVALID_HANDLE as u32, 0);
+            }
+            ctx.gpr[3] = STATUS_INVALID_HANDLE;
+            signal_io_completion_event(state, event_handle);
+            return;
+        }
+    };
+
+    // Gather the directory's immediate children from the VFS. An empty
+    // `dir_path` refers to the disc root; non-empty paths match entries
+    // whose name starts with `dir_path + "/"` and whose suffix (relative
+    // to that prefix) contains no further slashes.
+    let prefix: String = if dir_path.is_empty() {
+        String::new()
+    } else if dir_path.ends_with('/') {
+        dir_path.clone()
+    } else {
+        format!("{}/", dir_path)
+    };
+    // A missing VFS or a failing `list_root()` both degrade to an empty
+    // listing, which surfaces to the guest as `STATUS_NO_MORE_FILES` below.
+    let entries: Vec<xenia_vfs::VfsEntry> = match state.vfs.as_ref() {
+        Some(vfs) => vfs
+            .list_root()
+            .unwrap_or_default()
+            .into_iter()
+            .filter_map(|e| {
+                let relative: &str = if prefix.is_empty() {
+                    e.name.as_str()
+                } else {
+                    match e.name.strip_prefix(prefix.as_str()) {
+                        Some(s) => s,
+                        None => return None,
+                    }
+                };
+                if relative.is_empty() || relative.contains('/') {
+                    return None;
+                }
+                Some(xenia_vfs::VfsEntry {
+                    name: relative.to_string(),
+                    is_directory: e.is_directory,
+                    size: e.size,
+                    offset: e.offset,
+                })
+            })
+            .collect(),
+        None => Vec::new(),
+    };
+
+    // Load / initialise the enumeration cursor.
+    let start_index = match state.objects.get_mut(&handle) {
+        Some(KernelObject::File { dir_enum_pos, .. }) => {
+            let pos = dir_enum_pos.unwrap_or(0);
+            *dir_enum_pos = Some(pos);
+            pos
+        }
+        _ => 0,
+    };
+
+    if start_index >= entries.len() {
+        if iosb_ptr != 0 {
+            write_io_status_block(mem, iosb_ptr, STATUS_NO_MORE_FILES as u32, 0);
+        }
+        ctx.gpr[3] = STATUS_NO_MORE_FILES;
+        signal_io_completion_event(state, event_handle);
+        return;
+    }
+
+    // Pack as many entries as fit into `length`. `NextEntryOffset` is the
+    // byte distance to the next entry from the start of the current one;
+    // 0 marks the last entry. Entries are 8-byte aligned per Canary.
+    let mut cursor: u32 = 0;
+    let mut emitted: usize = 0;
+    let mut last_entry_offset: Option<u32> = None;
+    for (i, entry) in entries.iter().enumerate().skip(start_index) {
+        let name_bytes = entry.name.as_bytes();
+        let name_len = name_bytes.len() as u32;
+        let raw_size = ENTRY_FIXED_SIZE + name_len;
+        let aligned_size = (raw_size + 7) & !7;
+        if cursor + raw_size > length {
+            // Entry wouldn't fit — leave the buffer truncated and stop.
+            break;
+        }
+        let base = info_ptr + cursor;
+        mem.write_u32(base + 0x00, 0); // next_entry_offset (patched later)
+        mem.write_u32(base + 0x04, i as u32); // file_index
+        // Timestamps zeroed — xenia-rs doesn't track them.
+        mem.write_u64(base + 0x08, 0);
+        mem.write_u64(base + 0x10, 0);
+        mem.write_u64(base + 0x18, 0);
+        mem.write_u64(base + 0x20, 0);
+        // The same `entry.size` is stored in both 64-bit size slots.
+        mem.write_u64(base + 0x28, entry.size);
+        mem.write_u64(base + 0x30, entry.size);
+        let attrs = if entry.is_directory {
+            FILE_ATTRIBUTE_DIRECTORY
+        } else {
+            FILE_ATTRIBUTE_NORMAL
+        };
+        mem.write_u32(base + 0x38, attrs);
+        mem.write_u32(base + 0x3C, name_len);
+        // The name is copied as the raw bytes of the Rust string, one byte
+        // per guest byte, immediately after the fixed header.
+        for (k, &b) in name_bytes.iter().enumerate() {
+            mem.write_u8(base + ENTRY_FIXED_SIZE + k as u32, b);
+        }
+        // Patch the previous entry's next_entry_offset to point here.
+        // (`last_entry_offset` holds the previous entry's guest address,
+        // hence the subtraction of `info_ptr` to get back to an offset.)
+        if let Some(prev_base) = last_entry_offset {
+            mem.write_u32(prev_base + 0x00, cursor - (prev_base - info_ptr));
+        }
+        last_entry_offset = Some(base);
+        // Clamp so the byte count reported in the IOSB never exceeds `length`
+        // when alignment padding would run past the end of the buffer.
+        cursor = std::cmp::min(cursor + aligned_size, length);
+        emitted += 1;
+        if cursor + ENTRY_FIXED_SIZE > length {
+            // No room for another fixed header; stop before truncating.
+            break;
+        }
+    }
+
+    // Advance cursor on the handle.
+    if let Some(KernelObject::File { dir_enum_pos, .. }) = state.objects.get_mut(&handle) {
+        *dir_enum_pos = Some(start_index + emitted);
+    }
+
+    // NOTE(review): `start_index < entries.len()` holds here, so
+    // `emitted == 0` means the first pending entry did not fit in `length`.
+    // The guest is then told `STATUS_NO_MORE_FILES` although entries remain
+    // and the cursor has not moved — confirm this is the intended result.
+    if emitted == 0 {
+        if iosb_ptr != 0 {
+            write_io_status_block(mem, iosb_ptr, STATUS_NO_MORE_FILES as u32, 0);
+        }
+        ctx.gpr[3] = STATUS_NO_MORE_FILES;
+    } else {
+        if iosb_ptr != 0 {
+            write_io_status_block(mem, iosb_ptr, STATUS_SUCCESS as u32, cursor);
+        }
+        ctx.gpr[3] = STATUS_SUCCESS;
+    }
+    signal_io_completion_event(state, event_handle);
+}
+
+/// `NtClose(handle)` — drop one reference to `handle`; the object itself is
+/// only removed once the per-handle refcount reaches zero. Always reports
+/// success (0) in r3.
+fn nt_close(ctx: &mut PpcContext, _mem: &GuestMemory, state: &mut KernelState) {
+    let handle = ctx.gpr[3] as u32;
+    // Why a refcount: `NtDuplicateObject` hands back the *source* handle
+    // value instead of minting a new one, so a game can hold two logical
+    // references to one handle value. Dropping the object on the first
+    // `NtClose` would strand the second reference and trap any later wait
+    // on it (Sylpheed's create→dup(SAME_ACCESS)→set→close pattern at
+    // 0x8246079c manifests this — main thread then parks forever on the
+    // closed handle). This mirrors Canary's `ObjectTable::ReleaseHandle`
+    // (object_table.cc:189).
+    //
+    // A handle with no refcount entry is treated as already at zero.
+    let remaining = match state.handle_refcount.get_mut(&handle) {
+        Some(count) => {
+            *count = count.saturating_sub(1);
+            *count
+        }
+        None => 0,
+    };
+    if remaining == 0 {
+        state.objects.remove(&handle);
+        state.handle_refcount.remove(&handle);
+        // An armed Timer leaves a pending-fire entry behind; strip it so a
+        // later scheduler round doesn't try to signal a dead handle.
+        // `disarm_timer` is a no-op for non-timer handles.
+        state.disarm_timer(handle);
+    }
     ctx.gpr[3] = 0;
 }
 
-fn nt_create_event(ctx: &mut PpcContext, mem: &mut GuestMemory, state: &mut KernelState) {
+/// `NtCreateEvent(OUT handle_ptr, obj_attrs, event_type, initial_state)` —
+/// allocate an Event object, record its creation in the handle audit trail
+/// and store the new handle through `handle_ptr` when that pointer is
+/// non-zero. Always reports success (0) in r3.
+///
+/// `event_type` follows the NT `EVENT_TYPE` enumeration:
+/// `NotificationEvent` = 0 is manual-reset, `SynchronizationEvent` = 1 is
+/// auto-reset. This is the same polarity `nt_create_timer` uses for
+/// `timer_type`.
+fn nt_create_event(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
     // r3 = handle_ptr, r4 = obj_attrs, r5 = event_type, r6 = initial_state
     let handle_ptr = ctx.gpr[3] as u32;
-    let manual_reset = ctx.gpr[5] != 0;
+    // event_type 0 (NotificationEvent) means manual-reset; treating any
+    // non-zero value as manual would swap the two event kinds.
+    let manual_reset = (ctx.gpr[5] as u32) == 0;
     let signaled = ctx.gpr[6] != 0;
-    let handle = state.alloc_handle_for(KernelObject::Event { manual_reset, signaled });
+    let handle = state.alloc_handle_for(KernelObject::Event {
+        manual_reset,
+        signaled,
+        waiters: Vec::new(),
+    });
+    state.audit_create(
+        handle,
+        if manual_reset { "Event/Manual" } else { "Event/Auto" },
+        ctx.lr as u32,
+        "NtCreateEvent",
+    );
     if handle_ptr != 0 {
         mem.write_u32(handle_ptr, handle);
     }
     ctx.gpr[3] = 0;
 }
 
-fn nt_create_semaphore(ctx: &mut PpcContext, mem: &mut GuestMemory, state: &mut KernelState) {
+/// `NtCreateSemaphore(OUT handle_ptr, obj_attrs, initial_count, max_count)` —
+/// allocate a Semaphore object with an empty waiter list, record its
+/// creation in the handle audit trail and store the new handle through
+/// `handle_ptr` when that pointer is non-zero. Always reports success (0)
+/// in r3.
+///
+/// NOTE(review): `initial_count` / `max_count` are taken from r5 / r6 as
+/// `i32` and stored as-is; negative values or `count > max` are not
+/// rejected here — confirm whether validation is expected.
+fn nt_create_semaphore(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
     // r3 = handle_ptr, r4 = obj_attrs, r5 = initial_count, r6 = max_count
     let handle_ptr = ctx.gpr[3] as u32;
     let count = ctx.gpr[5] as i32;
     let max = ctx.gpr[6] as i32;
-    let handle = state.alloc_handle_for(KernelObject::Semaphore { count, max });
+    let handle = state.alloc_handle_for(KernelObject::Semaphore {
+        count,
+        max,
+        waiters: Vec::new(),
+    });
+    // Audit record: handle, object kind label, caller's link register, API name.
+    state.audit_create(handle, "Semaphore", ctx.lr as u32, "NtCreateSemaphore");
     if handle_ptr != 0 {
         mem.write_u32(handle_ptr, handle);
     }
     ctx.gpr[3] = 0;
 }
 
-fn nt_create_timer(ctx: &mut PpcContext, mem: &mut GuestMemory, state: &mut KernelState) {
+/// `NtCreateTimer(OUT handle_ptr, obj_attributes, timer_type)` — create an
+/// unarmed, unsignaled Timer object and hand its handle back through
+/// `handle_ptr` (when non-zero).
+///
+/// `timer_type` 0 is a NotificationTimer (manual-reset) and 1 is a
+/// SynchronizationTimer (auto-reset). Anything else yields
+/// `STATUS_INVALID_PARAMETER` without allocating a handle, matching
+/// Canary's `assert_always` on bad types (xtimer.cc:32). Named-object
+/// dedup (Canary's `LookupNamedObject<XTimer>`) is out of scope —
+/// Sylpheed uses anonymous timers.
+fn nt_create_timer(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
+    const STATUS_INVALID_PARAMETER: u64 = 0xC000_000D;
     let handle_ptr = ctx.gpr[3] as u32;
-    let handle = state.alloc_handle_for(KernelObject::Timer);
+    // Decode the timer type once into its reset mode and audit label.
+    let (manual_reset, kind) = match ctx.gpr[5] as u32 {
+        0 => (true, "Timer/Manual"),
+        1 => (false, "Timer/Auto"),
+        _ => {
+            ctx.gpr[3] = STATUS_INVALID_PARAMETER;
+            return;
+        }
+    };
+    let timer = KernelObject::Timer {
+        manual_reset,
+        signaled: false,
+        deadline: None,
+        period_ticks: 0,
+        period_ms: 0,
+        callback_routine: 0,
+        callback_arg: 0,
+        waiters: Vec::new(),
+    };
+    let handle = state.alloc_handle_for(timer);
+    state.audit_create(handle, kind, ctx.lr as u32, "NtCreateTimer");
     if handle_ptr != 0 {
         mem.write_u32(handle_ptr, handle);
     }
-    ctx.gpr[3] = 0;
+    ctx.gpr[3] = STATUS_SUCCESS;
+}
+
+/// `NtSetTimerEx(handle, due_time_ptr, routine, mode, routine_arg, resume,
+/// period_ms, unk_zero)` — arm a Timer object. Mirrors Canary's
+/// [`NtSetTimerEx_entry`](xboxkrnl_threading.cc:897).
+///
+/// The i64 `due_time` behind `due_time_ptr` is in 100ns units (negative =
+/// relative). Its magnitude is scaled by `/100` — the same scale
+/// `parse_timeout` applies — and added to the current HW thread's timebase
+/// to form the absolute deadline. `period_ms` is stored for periodic rearm
+/// and the fire is registered through `arm_timer`.
+///
+/// Returns `STATUS_INVALID_HANDLE` when the (pseudo-resolved) handle is not
+/// a Timer, `STATUS_SUCCESS` otherwise.
+///
+/// APC delivery (`routine != 0`) is deferred — the timer still signals
+/// itself on fire, and any `Wait*`-on-the-timer-handle waiter wakes
+/// correctly. If a real-world probe shows `timer_apc` warns firing,
+/// that's the signal to lift the APC subsystem into its own PR.
+fn nt_set_timer_ex(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
+    const STATUS_INVALID_HANDLE: u64 = 0xC000_0008;
+    // r6 (`mode`) and r8 (`resume`) are part of the ABI but not acted on.
+    let handle = resolve_pseudo_handle(state, ctx.gpr[3] as u32);
+    let due_time_ptr = ctx.gpr[4] as u32;
+    let routine = ctx.gpr[5] as u32;
+    let routine_arg = ctx.gpr[7] as u32;
+    let period_ms = ctx.gpr[9] as u32;
+
+    // Sample the current HW thread's timebase up front, before the timer
+    // object is borrowed mutably.
+    let now = {
+        let hw_id = state.scheduler.current_hw_id().unwrap_or(0);
+        state.scheduler.ctx(hw_id).timebase
+    };
+
+    // Assemble the signed 64-bit due time from its big-endian hi/lo words
+    // (same pattern as parse_timeout). Negative = relative-from-now,
+    // positive = absolute (FILETIME). The magnitude is treated as relative
+    // for both signs; games on Xbox 360 overwhelmingly pass negative values
+    // for timers, and the positive-absolute path is handled best-effort for
+    // bring-up.
+    let due_hi = mem.read_u32(due_time_ptr);
+    let due_lo = mem.read_u32(due_time_ptr + 4);
+    let raw = ((u64::from(due_hi) << 32) | u64::from(due_lo)) as i64;
+    let ticks_from_now = raw.unsigned_abs().max(1) / 100;
+    let abs_deadline = now.saturating_add(ticks_from_now);
+    // 1 ms = 10_000 units of 100ns; after the same ÷100 scale that
+    // `parse_timeout` applies, that is ms × 100 ticks.
+    let period_ticks = u64::from(period_ms) * 100;
+
+    let Some(KernelObject::Timer {
+        signaled,
+        deadline,
+        period_ticks: timer_period_ticks,
+        period_ms: timer_period_ms,
+        callback_routine,
+        callback_arg,
+        ..
+    }) = state.objects.get_mut(&handle)
+    else {
+        ctx.gpr[3] = STATUS_INVALID_HANDLE;
+        return;
+    };
+    // Re-arming clears any previous signal and overwrites the schedule.
+    *signaled = false;
+    *deadline = Some(abs_deadline);
+    *timer_period_ticks = period_ticks;
+    *timer_period_ms = period_ms;
+    *callback_routine = routine;
+    *callback_arg = routine_arg;
+
+    if routine != 0 {
+        tracing::warn!(
+            target: "timer_apc",
+            routine = format_args!("{:#010x}", routine),
+            arg = format_args!("{:#010x}", routine_arg),
+            handle = format_args!("{:#010x}", handle),
+            "NtSetTimerEx: routine != 0 — APC delivery deferred; timer self-signal still works"
+        );
+    }
+
+    state.arm_timer(handle, abs_deadline);
+    ctx.gpr[3] = STATUS_SUCCESS;
+}
+
+/// `NtCancelTimer(handle, OUT current_state_ptr)` — disarm a Timer.
+///
+/// Clears the object's deadline and removes its pending fire. When
+/// `current_state_ptr` is non-zero it receives `0` per Canary's
+/// [`NtCancelTimer_entry`](xboxkrnl_threading.cc:938-940), regardless of
+/// prior signaled state. The Timer object stays in the handle table
+/// (closed via NtClose); subsequent rearm via `NtSetTimerEx` is fine.
+/// A handle that is not a Timer yields `STATUS_INVALID_HANDLE`.
+fn nt_cancel_timer(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
+    const STATUS_INVALID_HANDLE: u64 = 0xC000_0008;
+    let handle = resolve_pseudo_handle(state, ctx.gpr[3] as u32);
+    let current_state_ptr = ctx.gpr[4] as u32;
+    let Some(KernelObject::Timer { deadline, .. }) = state.objects.get_mut(&handle) else {
+        ctx.gpr[3] = STATUS_INVALID_HANDLE;
+        return;
+    };
+    *deadline = None;
+    state.disarm_timer(handle);
+    if current_state_ptr != 0 {
+        mem.write_u32(current_state_ptr, 0);
+    }
+    ctx.gpr[3] = STATUS_SUCCESS;
 }
 
 // ===== RTL =====
 
-fn rtl_initialize_critical_section(ctx: &mut PpcContext, mem: &mut GuestMemory, _state: &mut KernelState) {
-    // r3 = critical_section_ptr (28 bytes on Xbox 360)
+// ----- RTL_CRITICAL_SECTION on Xbox 360: fields used by this module -----
+// The guest struct starts with a 16-byte X_DISPATCH_HEADER, so the lock
+// fields do NOT sit at the desktop-NT offsets 0x04 / 0x08 / 0x0C:
+//  +0x00  dispatcher header `type` byte
+//  +0x10  LockCount (signed; -1 while unlocked)
+//  +0x14  RecursionCount (signed; 0 while unlocked)
+//  +0x18  OwningThread (guest thread id, 0 when free)
+//
+// We enforce real mutual exclusion by reading/writing OwningThread and
+// RecursionCount. Parked threads live in `KernelState::cs_waiters[cs_ptr]`.
+
+// X_RTL_CRITICAL_SECTION layout (28 bytes, Canary `xboxkrnl_rtl.cc:536-543`):
+//   +0x00: X_DISPATCH_HEADER (16 bytes)
+//          +0x00: type (u8) = 1 (EventSynchronizationObject / auto-reset)
+//          +0x01: absolute (u8) = spin-count/256
+//          +0x02: size (u8)
+//          +0x03: inserted (u8)
+//          +0x04: signal_state (i32)
+//          +0x08: WaitListHead (two u32 pointers)
+//   +0x10: lock_count (i32)      — starts at -1; first acquire → 0
+//   +0x14: recursion_count (i32) — starts at 0; first acquire → 1
+//   +0x18: owning_thread (u32)   — 0 unless held
+// Byte offsets into the guest critical-section struct, relative to `cs_ptr`.
+const CS_OFFS_TYPE: u32 = 0x00; // dispatcher header `type` byte
+const CS_OFFS_LOCK_COUNT: u32 = 0x10; // i32 lock_count
+const CS_OFFS_RECURSION_COUNT: u32 = 0x14; // i32 recursion_count
+const CS_OFFS_OWNING_THREAD: u32 = 0x18; // u32 guest tid of the holder, 0 when free
+const CS_STRUCT_SIZE: u32 = 0x1C; // total struct size in bytes (28)
+
+/// `RtlInitializeCriticalSection(cs_ptr)` — reset the guest critical
+/// section at r3 to its unlocked state. A null pointer is ignored. Always
+/// reports success (0) in r3.
+///
+/// After the call every field is zero except the dispatcher `type` byte
+/// (1) and `lock_count` (-1), i.e. no owner and a recursion count of 0.
+fn rtl_initialize_critical_section(
+    ctx: &mut PpcContext,
+    mem: &GuestMemory,
+    _state: &mut KernelState,
+) {
     let cs_ptr = ctx.gpr[3] as u32;
     if cs_ptr != 0 {
-        for i in (0..28).step_by(4) {
+        // Zero the whole struct, then set dispatcher type=1 and
+        // lock_count=-1 per Canary `xeRtlInitializeCriticalSection`.
+        // The struct is cleared one 32-bit word at a time.
+        for i in (0..CS_STRUCT_SIZE).step_by(4) {
             mem.write_u32(cs_ptr + i, 0);
         }
-        // Set recursion count to -1 (unlocked)
-        mem.write_u32(cs_ptr + 8, 0xFFFF_FFFF_u32);
+        mem.write_u8(cs_ptr + CS_OFFS_TYPE, 1);
+        mem.write_u32(cs_ptr + CS_OFFS_LOCK_COUNT, 0xFFFF_FFFF_u32); // -1
     }
     ctx.gpr[3] = 0;
 }
 
-fn rtl_enter_critical_section(ctx: &mut PpcContext, mem: &mut GuestMemory, _state: &mut KernelState) {
-    // r3 = critical_section_ptr
-    // For single-threaded: increment lock count, always succeed
+/// `RtlEnterCriticalSection(cs_ptr)` — acquire the guest critical section
+/// at r3 for the calling thread (`ctx.thread_id`). r3 is set to 0 on every
+/// path; a null `cs_ptr` is a no-op.
+///
+/// Three outcomes, decided from the `owning_thread` field:
+/// * free, or owned by a tid with no live HW thread: the caller claims it
+///   (`owning_thread` = caller, `lock_count` = 0, `recursion_count` = 1);
+/// * already owned by the caller: `lock_count` and `recursion_count` are
+///   both incremented;
+/// * owned by another live thread: `lock_count` is incremented, the
+///   caller's scheduler ref is appended to `cs_waiters[cs_ptr]` and the
+///   caller is parked with `BlockReason::CriticalSection(cs_ptr)`.
+fn rtl_enter_critical_section(
+    ctx: &mut PpcContext,
+    mem: &GuestMemory,
+    state: &mut KernelState,
+) {
     let cs_ptr = ctx.gpr[3] as u32;
-    if cs_ptr != 0 {
-        let lock_count = mem.read_u32(cs_ptr + 4) as i32;
-        mem.write_u32(cs_ptr + 4, (lock_count + 1) as u32);
-        let recursion = mem.read_u32(cs_ptr + 8) as i32;
-        mem.write_u32(cs_ptr + 8, (recursion + 1) as u32);
+    if cs_ptr == 0 {
+        ctx.gpr[3] = 0;
+        return;
+    }
+    let current_tid = ctx.thread_id;
+    let owner = mem.read_u32(cs_ptr + CS_OFFS_OWNING_THREAD);
+
+    // "Effective owner" — if the stored tid doesn't correspond to any live HW
+    // thread, the CS memory is either uninitialized (.data junk from the XEX
+    // image) or the previous owner already exited. Treat it as free.
+    let owner_is_live =
+        owner != 0 && state.scheduler.find_by_tid(owner).is_some();
+
+    if owner == 0 || !owner_is_live {
+        if owner != 0 {
+            tracing::debug!(
+                "rtl_enter_cs: cs={:#010x} stored owner={} has no live HW thread — claiming",
+                cs_ptr,
+                owner
+            );
+        }
+        // Claiming overwrites the counters outright instead of incrementing
+        // them, so whatever was stored alongside a stale owner is discarded.
+        mem.write_u32(cs_ptr + CS_OFFS_OWNING_THREAD, current_tid);
+        mem.write_u32(cs_ptr + CS_OFFS_LOCK_COUNT, 0); // -1 → 0 on first lock
+        mem.write_u32(cs_ptr + CS_OFFS_RECURSION_COUNT, 1);
+        ctx.gpr[3] = 0;
+        return;
+    }
+    // Recursive acquire by the current owner: bump both counters.
+    if owner == current_tid {
+        let lc = mem.read_u32(cs_ptr + CS_OFFS_LOCK_COUNT) as i32;
+        mem.write_u32(cs_ptr + CS_OFFS_LOCK_COUNT, (lc + 1) as u32);
+        let rc = mem.read_u32(cs_ptr + CS_OFFS_RECURSION_COUNT) as i32;
+        mem.write_u32(cs_ptr + CS_OFFS_RECURSION_COUNT, (rc + 1) as u32);
+        ctx.gpr[3] = 0;
+        return;
+    }
+    // Truly contended against a live peer — park.
+    // Only lock_count changes here; owner and recursion_count are left for
+    // whoever later wakes this thread.
+    let lc = mem.read_u32(cs_ptr + CS_OFFS_LOCK_COUNT) as i32;
+    mem.write_u32(cs_ptr + CS_OFFS_LOCK_COUNT, (lc + 1) as u32);
+    let current_ref = state.scheduler.current_ref();
+    state
+        .cs_waiters
+        .entry(cs_ptr)
+        .or_default()
+        .push(current_ref);
+    tracing::debug!(
+        "rtl_enter_cs: hw={} park on cs={:#010x} owner_tid={}",
+        current_ref.hw_id,
+        cs_ptr,
+        owner
+    );
+    // The return value is stored before parking.
+    ctx.gpr[3] = 0;
+    state
+        .scheduler
+        .park_current(BlockReason::CriticalSection(cs_ptr));
+}
+
+/// `RtlLeaveCriticalSection(cs_ptr)` — release one level of ownership of
+/// the guest critical section at r3. Always reports success (0) in r3; a
+/// null `cs_ptr` is a no-op.
+///
+/// `lock_count` drops by one on every call. While the recursion count is
+/// above 1 the lock stays held with the count decremented. On the final
+/// release the recursion count and owner are cleared and, if a thread is
+/// parked in `cs_waiters[cs_ptr]`, the oldest waiter is made the new owner
+/// (recursion count 1) and woken.
+fn rtl_leave_critical_section(
+    ctx: &mut PpcContext,
+    mem: &GuestMemory,
+    state: &mut KernelState,
+) {
+    let cs_ptr = ctx.gpr[3] as u32;
+    // Every path returns 0 to the guest.
+    ctx.gpr[3] = 0;
+    if cs_ptr == 0 {
+        return;
+    }
+
+    let lock_addr = cs_ptr + CS_OFFS_LOCK_COUNT;
+    let recursion_addr = cs_ptr + CS_OFFS_RECURSION_COUNT;
+    let held = mem.read_u32(lock_addr) as i32;
+    let depth = mem.read_u32(recursion_addr) as i32;
+
+    // Nested and final releases both give back one lock_count unit.
+    mem.write_u32(lock_addr, (held - 1) as u32);
+    if depth > 1 {
+        // Still nested; keep ownership.
+        mem.write_u32(recursion_addr, (depth - 1) as u32);
+        return;
+    }
+
+    // Fully releasing — mark the section free first.
+    mem.write_u32(recursion_addr, 0);
+    mem.write_u32(cs_ptr + CS_OFFS_OWNING_THREAD, 0);
+
+    // Hand the lock straight to the longest-parked waiter, if there is one.
+    let next_waiter = match state.cs_waiters.get_mut(&cs_ptr) {
+        Some(queue) if !queue.is_empty() => Some(queue.remove(0)),
+        _ => None,
+    };
+    if let Some(next_ref) = next_waiter {
+        // Look up the woken thread's guest tid so it resumes as the owner.
+        let next_tid = state.scheduler.thread(next_ref).tid;
+        mem.write_u32(cs_ptr + CS_OFFS_OWNING_THREAD, next_tid);
+        mem.write_u32(recursion_addr, 1);
+        state.scheduler.wake_ref(next_ref);
+    }
+}
+
+/// `RtlTryEnterCriticalSection(cs_ptr)` — non-blocking acquire of the guest
+/// critical section at r3. Returns 1 in r3 when the caller now holds the
+/// lock (first or recursive acquire) and 0 when it is held by another live
+/// thread or `cs_ptr` is null. Never parks the caller.
+///
+/// Ownership is judged the same way as in `rtl_enter_critical_section`: a
+/// stored owner tid with no live HW thread (uninitialized `.data` junk, or
+/// an owner that already exited) counts as free. Without that, a section
+/// that `RtlEnterCriticalSection` would claim immediately could never be
+/// taken through the try-path.
+fn rtl_try_enter_critical_section(
+    ctx: &mut PpcContext,
+    mem: &GuestMemory,
+    state: &mut KernelState,
+) {
+    let cs_ptr = ctx.gpr[3] as u32;
+    if cs_ptr == 0 {
+        ctx.gpr[3] = 0;
+        return;
+    }
+    let current_tid = ctx.thread_id;
+    let owner = mem.read_u32(cs_ptr + CS_OFFS_OWNING_THREAD);
+    // Free, or "owned" by a tid the scheduler no longer knows about.
+    if owner == 0 || state.scheduler.find_by_tid(owner).is_none() {
+        mem.write_u32(cs_ptr + CS_OFFS_OWNING_THREAD, current_tid);
+        mem.write_u32(cs_ptr + CS_OFFS_LOCK_COUNT, 0); // -1 → 0 on first lock
+        mem.write_u32(cs_ptr + CS_OFFS_RECURSION_COUNT, 1);
+        ctx.gpr[3] = 1;
+        return;
+    }
+    // Recursive acquire by the current owner: bump both counters.
+    if owner == current_tid {
+        let lc = mem.read_u32(cs_ptr + CS_OFFS_LOCK_COUNT) as i32;
+        mem.write_u32(cs_ptr + CS_OFFS_LOCK_COUNT, (lc + 1) as u32);
+        let rc = mem.read_u32(cs_ptr + CS_OFFS_RECURSION_COUNT) as i32;
+        mem.write_u32(cs_ptr + CS_OFFS_RECURSION_COUNT, (rc + 1) as u32);
+        ctx.gpr[3] = 1;
+        return;
     }
     ctx.gpr[3] = 0;
 }
 
-fn rtl_leave_critical_section(ctx: &mut PpcContext, mem: &mut GuestMemory, _state: &mut KernelState) {
-    let cs_ptr = ctx.gpr[3] as u32;
-    if cs_ptr != 0 {
-        let lock_count = mem.read_u32(cs_ptr + 4) as i32;
-        mem.write_u32(cs_ptr + 4, (lock_count - 1) as u32);
-        let recursion = mem.read_u32(cs_ptr + 8) as i32;
-        mem.write_u32(cs_ptr + 8, (recursion - 1) as u32);
-    }
-    ctx.gpr[3] = 0;
-}
-
-fn rtl_try_enter_critical_section(ctx: &mut PpcContext, mem: &mut GuestMemory, _state: &mut KernelState) {
-    // Always succeed in single-threaded mode
-    let cs_ptr = ctx.gpr[3] as u32;
-    if cs_ptr != 0 {
-        let lock_count = mem.read_u32(cs_ptr + 4) as i32;
-        mem.write_u32(cs_ptr + 4, (lock_count + 1) as u32);
-        let recursion = mem.read_u32(cs_ptr + 8) as i32;
-        mem.write_u32(cs_ptr + 8, (recursion + 1) as u32);
-    }
-    ctx.gpr[3] = 1; // TRUE
-}
-
-fn rtl_init_ansi_string(ctx: &mut PpcContext, mem: &mut GuestMemory, _state: &mut KernelState) {
+fn rtl_init_ansi_string(ctx: &mut PpcContext, mem: &GuestMemory, _state: &mut KernelState) {
     let dest_ptr = ctx.gpr[3] as u32;
     let src_ptr = ctx.gpr[4] as u32;
     if src_ptr != 0 {
@@ -531,7 +1779,7 @@ fn rtl_init_ansi_string(ctx: &mut PpcContext, mem: &mut GuestMemory, _state: &mu
     }
 }
 
-fn rtl_init_unicode_string(ctx: &mut PpcContext, mem: &mut GuestMemory, _state: &mut KernelState) {
+fn rtl_init_unicode_string(ctx: &mut PpcContext, mem: &GuestMemory, _state: &mut KernelState) {
     let dest_ptr = ctx.gpr[3] as u32;
     let src_ptr = ctx.gpr[4] as u32;
     if src_ptr != 0 {
@@ -551,7 +1799,7 @@ fn rtl_init_unicode_string(ctx: &mut PpcContext, mem: &mut GuestMemory, _state:
     }
 }
 
-fn rtl_capture_context(ctx: &mut PpcContext, mem: &mut GuestMemory, _state: &mut KernelState) {
+fn rtl_capture_context(ctx: &mut PpcContext, mem: &GuestMemory, _state: &mut KernelState) {
     // r3 = context_ptr — write CPU registers to CONTEXT structure
     let ptr = ctx.gpr[3] as u32;
     if ptr != 0 {
@@ -562,7 +1810,7 @@ fn rtl_capture_context(ctx: &mut PpcContext, mem: &mut GuestMemory, _state: &mut
     }
 }
 
-fn rtl_compare_memory_ulong(ctx: &mut PpcContext, mem: &mut GuestMemory, _state: &mut KernelState) {
+fn rtl_compare_memory_ulong(ctx: &mut PpcContext, mem: &GuestMemory, _state: &mut KernelState) {
     // r3 = source, r4 = length, r5 = pattern
     let source = ctx.gpr[3] as u32;
     let length = ctx.gpr[4] as u32;
@@ -579,7 +1827,7 @@ fn rtl_compare_memory_ulong(ctx: &mut PpcContext, mem: &mut GuestMemory, _state:
     ctx.gpr[3] = matched as u64;
 }
 
-fn rtl_fill_memory_ulong(ctx: &mut PpcContext, mem: &mut GuestMemory, _state: &mut KernelState) {
+fn rtl_fill_memory_ulong(ctx: &mut PpcContext, mem: &GuestMemory, _state: &mut KernelState) {
     // r3 = destination, r4 = length, r5 = pattern
     let dest = ctx.gpr[3] as u32;
     let length = ctx.gpr[4] as u32;
@@ -590,13 +1838,13 @@ fn rtl_fill_memory_ulong(ctx: &mut PpcContext, mem: &mut GuestMemory, _state: &m
     }
 }
 
-fn rtl_image_xex_header_field(ctx: &mut PpcContext, _mem: &mut GuestMemory, _state: &mut KernelState) {
+fn rtl_image_xex_header_field(ctx: &mut PpcContext, _mem: &GuestMemory, _state: &mut KernelState) {
     // r3 = xex_header_ptr, r4 = field_id
     // Return 0 for all fields
     ctx.gpr[3] = 0;
 }
 
-fn rtl_multi_byte_to_unicode_n(ctx: &mut PpcContext, mem: &mut GuestMemory, _state: &mut KernelState) {
+fn rtl_multi_byte_to_unicode_n(ctx: &mut PpcContext, mem: &GuestMemory, _state: &mut KernelState) {
     // r3 = unicode_str, r4 = max_bytes_out, r5 = bytes_written_ptr
     // r6 = multi_byte_str, r7 = multi_byte_len
     let uni_ptr = ctx.gpr[3] as u32;
@@ -617,7 +1865,7 @@ fn rtl_multi_byte_to_unicode_n(ctx: &mut PpcContext, mem: &mut GuestMemory, _sta
     ctx.gpr[3] = 0;
 }
 
-fn rtl_nt_status_to_dos_error(ctx: &mut PpcContext, _mem: &mut GuestMemory, _state: &mut KernelState) {
+fn rtl_nt_status_to_dos_error(ctx: &mut PpcContext, _mem: &GuestMemory, _state: &mut KernelState) {
     // Simple mapping for common cases
     let status = ctx.gpr[3] as u32;
     ctx.gpr[3] = match status {
@@ -628,17 +1876,224 @@ fn rtl_nt_status_to_dos_error(ctx: &mut PpcContext, _mem: &mut GuestMemory, _sta
     };
 }
 
-fn rtl_raise_exception(ctx: &mut PpcContext, _mem: &mut GuestMemory, _state: &mut KernelState) {
-    tracing::warn!("RtlRaiseException: record_ptr={:#010x}", ctx.gpr[3]);
-    // Don't halt — just log and return
+fn rtl_raise_exception(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
+    // X_EXCEPTION_RECORD layout (big-endian, Xbox; mirrors
+    // xenia-canary/src/xenia/kernel/kernel.h:227-236, total 0x50 bytes):
+    //   +0x00 DWORD ExceptionCode
+    //   +0x04 DWORD ExceptionFlags
+    //   +0x08 PVOID ExceptionRecord (chain)
+    //   +0x0C PVOID ExceptionAddress
+    //   +0x10 DWORD NumberParameters
+    //   +0x14 ULONG_PTR ExceptionInformation[15]   <-- info[0] starts here
+    //
+    // For MSVC C++ throws (code = 0xE06D7363) the parameter convention is:
+    //   info[0] = magic           (0x19930520)
+    //   info[1] = thrown object pointer
+    //   info[2] = ThrowInfo*       (TI descriptor in .rdata)
+    let record_ptr = ctx.gpr[3] as u32;
+    if record_ptr == 0 {
+        tracing::warn!(tid = ctx.thread_id, "RtlRaiseException: null record");
+        return;
+    }
+    let code = mem.read_u32(record_ptr);
+    let flags = mem.read_u32(record_ptr + 0x04);
+    let addr = mem.read_u32(record_ptr + 0x0C);
+    let nparams = mem.read_u32(record_ptr + 0x10);
+    let info0 = if nparams > 0 { mem.read_u32(record_ptr + 0x14) } else { 0 };
+    let info1 = if nparams > 1 { mem.read_u32(record_ptr + 0x18) } else { 0 };
+    let info2 = if nparams > 2 { mem.read_u32(record_ptr + 0x1C) } else { 0 };
+
+    tracing::warn!(
+        tid = ctx.thread_id,
+        record = format_args!("{record_ptr:#010x}"),
+        code = format_args!("{code:#010x}"),
+        flags = format_args!("{flags:#010x}"),
+        exception_addr = format_args!("{addr:#010x}"),
+        caller_lr = format_args!("{:#010x}", ctx.lr as u32),
+        nparams,
+        info0 = format_args!("{info0:#010x}"),
+        info1 = format_args!("{info1:#010x}"),
+        info2 = format_args!("{info2:#010x}"),
+        "RtlRaiseException (stubbed return)",
+    );
+
+    // One-shot deep diagnostic for MSVC C++ throws. Mirrors the latch
+    // pattern used elsewhere (see render.rs:693-707 first_dispatch_logged).
+    // Fires once per process start; subsequent throws still log the
+    // header line above but don't repeat the expensive stack walk + decode.
+    if code == 0xE06D_7363 && !state.cxx_throw_logged {
+        state.cxx_throw_logged = true;
+
+        // Walk the PPC frame chain ~6 levels back from r1.
+        // PPC/EABI prologue: `mflr r12; stw r12, -8(r1); stwu r1, -F(r1)`.
+        // After prologue, [r1] = back-chain to old_r1, and the LR saved
+        // in *that* frame's prologue lives at [old_r1 - 8].
+        // Walking up: prev_sp = mem.read_u32(sp);
+        //             saved_lr_for_that_frame = mem.read_u32(prev_sp - 8);
+        // Level 0 is the live frame: its return address is in ctx.lr
+        // (no need to read the stack).
+        let mut frames: Vec<(u32, u32)> = Vec::with_capacity(8);
+        frames.push((ctx.gpr[1] as u32, ctx.lr as u32));
+        let mut sp = ctx.gpr[1] as u32;
+        for _ in 0..6 {
+            if sp == 0 || sp == 0xFFFF_FFFF { break; }
+            let prev_sp = mem.read_u32(sp);
+            if prev_sp == 0 || prev_sp == sp || prev_sp == 0xFFFF_FFFF {
+                break;
+            }
+            let saved_lr = mem.read_u32(prev_sp.wrapping_sub(8));
+            frames.push((prev_sp, saved_lr));
+            sp = prev_sp;
+        }
+        for (i, (fp, lr)) in frames.iter().enumerate() {
+            tracing::warn!(
+                level = i,
+                frame_ptr = format_args!("{fp:#010x}"),
+                saved_lr = format_args!("{lr:#010x}"),
+                "cxx_throw stack frame",
+            );
+        }
+
+        // Extract lhs — the "not valid instance" pointer — from __CxxThrow wrapper's
+        // saved r30. sub_825F23D8 (__CxxThrow) does `std r30, -24(r1)` in its prologue
+        // where r1 = sub_82454770's current SP = frames[2].0 (L2 frame pointer).
+        // `std` is a 64-bit big-endian store; the 32-bit guest address is in the
+        // lower 4 bytes at [frames[2].0 - 24 + 4] = [frames[2].0 - 20].
+        if frames.len() >= 3 {
+            let l2_fp = frames[2].0;
+            let lhs = mem.read_u32(l2_fp.wrapping_sub(20));
+            tracing::warn!(
+                l2_fp = format_args!("{l2_fp:#010x}"),
+                lhs = format_args!("{lhs:#010x}"),
+                "cxx_throw lhs (not-registered instance)",
+            );
+
+            // Walk the instance registry BST at 0x828F3DA8 to show what IS registered.
+            // Layout: [+0..+27]=CriticalSection (28 bytes), [+28..+31]=some field,
+            // [+32]=sentinel heap ptr, [+36]=node count.
+            // Sentinel (heap-allocated): [+0]=left,[+4]=root link,[+8]=right,[+12]=key,[+17]=flag (1).
+            // A real node has flag=0; the walk below calls it is_valid and stops when it is non-zero.
+            let registry_base = 0x828F3DA8_u32;
+            let sentinel_ptr = mem.read_u32(registry_base + 32);
+            let node_count = mem.read_u32(registry_base + 36);
+            tracing::warn!(
+                sentinel = format_args!("{sentinel_ptr:#010x}"),
+                node_count,
+                "cxx_throw registry state",
+            );
+            if sentinel_ptr != 0 {
+                // Replicate validator sub_82454600's BST ceil search:
+                // Find min key >= lhs. If candidate_key == lhs → should be valid.
+                let root = mem.read_u32(sentinel_ptr.wrapping_add(4));
+                let mut node = root;
+                let mut candidate = sentinel_ptr; // "no candidate" marker
+                let mut steps = 0_u32;
+                loop {
+                    if mem.read_u8(node.wrapping_add(17)) != 0 {
+                        break; // sentinel (is_valid != 0)
+                    }
+                    if steps >= 128 {
+                        break; // guard against runaway
+                    }
+                    let key = mem.read_u32(node.wrapping_add(12));
+                    if key >= lhs {
+                        candidate = node;
+                        node = mem.read_u32(node); // go left (node[+0])
+                    } else {
+                        node = mem.read_u32(node.wrapping_add(8)); // go right (node[+8])
+                    }
+                    steps += 1;
+                }
+                let (candidate_key, candidate_is_sentinel) = if candidate != sentinel_ptr {
+                    (mem.read_u32(candidate.wrapping_add(12)), false)
+                } else {
+                    (0, true)
+                };
+                tracing::warn!(
+                    root = format_args!("{root:#010x}"),
+                    root_key = format_args!("{:#010x}", mem.read_u32(root.wrapping_add(12))),
+                    lhs = format_args!("{lhs:#010x}"),
+                    candidate = format_args!("{candidate:#010x}"),
+                    candidate_key = format_args!("{candidate_key:#010x}"),
+                    candidate_is_sentinel,
+                    steps,
+                    match_found = (candidate_key == lhs && !candidate_is_sentinel),
+                    "cxx_throw BST ceil search",
+                );
+            } else {
+                tracing::warn!("cxx_throw registry: sentinel_ptr is null");
+            }
+        }
+
+        // Decode runtime_error::what() — verified layout via the
+        // destructor at sub_8216DBC0 (it does `addi r3, obj, 12`
+        // before calling the std::string destructor). MSVC layout
+        // for this CRT:
+        //   +0x00  vtbl*
+        //   +0x04  char* _Mywhat   (lazy; set by what(); often 0 at throw)
+        //   +0x08  uint8_t _Mydofree
+        //   +0x0C  std::string _Mystr {
+        //              union _Bx { char _Buf[16]; char* _Ptr; }  (+0x0C..+0x1C)
+        //              size_t _Mysize                              (+0x1C)
+        //              size_t _Myres                               (+0x20)  capacity
+        //          }
+        // SSO: when _Myres < 16, chars are inline at +0x0C; otherwise
+        // +0x0C is a heap char*. Log BOTH interpretations + raw
+        // _Mysize/_Myres so the right one is obvious from the values.
+        if info1 != 0 {
+            let mut sso_buf = [0u8; 16];
+            mem.read_bytes(info1.wrapping_add(0x0C), &mut sso_buf);
+            let nul = sso_buf.iter().position(|&b| b == 0).unwrap_or(16);
+            let sso_msg = String::from_utf8_lossy(&sso_buf[..nul]).into_owned();
+
+            let heap_ptr = mem.read_u32(info1.wrapping_add(0x0C));
+            let heap_msg = if heap_ptr != 0
+                && heap_ptr != info1.wrapping_add(0x0C)
+                && (0x10000..0xC000_0000).contains(&heap_ptr)
+            {
+                read_cstring(mem, heap_ptr)
+            } else {
+                String::new()
+            };
+
+            let mysize = mem.read_u32(info1.wrapping_add(0x1C));
+            let myres = mem.read_u32(info1.wrapping_add(0x20));
+            let mywhat = mem.read_u32(info1.wrapping_add(0x04));
+            let mywhat_str = if mywhat != 0 && (0x10000..0xC000_0000).contains(&mywhat) {
+                read_cstring(mem, mywhat)
+            } else {
+                String::new()
+            };
+
+            tracing::warn!(
+                obj = format_args!("{info1:#010x}"),
+                throwinfo = format_args!("{info2:#010x}"),
+                magic = format_args!("{info0:#010x}"),
+                mysize,
+                myres,
+                heap_ptr = format_args!("{heap_ptr:#010x}"),
+                mywhat_ptr = format_args!("{mywhat:#010x}"),
+                mywhat = %mywhat_str,
+                sso_msg = %sso_msg,
+                heap_msg = %heap_msg,
+                "cxx_throw runtime_error decoded",
+            );
+        }
+    }
+
+    // Keep the existing stub-return semantics: Canary's RtlRaiseException
+    // also returns rather than unwinds (xboxkrnl_debug.cc:131-151 — the
+    // TODO comment there reads "unwinding. This is going to suck.").
+    // The Canary-aligned path is to fix the upstream HLE that triggered
+    // the throw, not to implement SEH dispatch here.
 }
 
-fn rtl_unwind(ctx: &mut PpcContext, _mem: &mut GuestMemory, _state: &mut KernelState) {
+fn rtl_unwind(ctx: &mut PpcContext, _mem: &GuestMemory, _state: &mut KernelState) {
     tracing::warn!("RtlUnwind: target_frame={:#010x}", ctx.gpr[3]);
     // Stub — in a real implementation this would walk the stack
 }
 
-fn stub_sprintf(ctx: &mut PpcContext, mem: &mut GuestMemory, _state: &mut KernelState) {
+fn stub_sprintf(ctx: &mut PpcContext, mem: &GuestMemory, _state: &mut KernelState) {
     let dest = ctx.gpr[3] as u32;
     let fmt = ctx.gpr[4] as u32;
     if fmt != 0 && dest != 0 {
@@ -655,7 +2110,7 @@ fn stub_sprintf(ctx: &mut PpcContext, mem: &mut GuestMemory, _state: &mut Kernel
     ctx.gpr[3] = 0;
 }
 
-fn stub_vsnprintf(ctx: &mut PpcContext, mem: &mut GuestMemory, _state: &mut KernelState) {
+fn stub_vsnprintf(ctx: &mut PpcContext, mem: &GuestMemory, _state: &mut KernelState) {
     // r3 = buffer, r4 = count, r5 = format, r6 = va_list
     let dest = ctx.gpr[3] as u32;
     let fmt = ctx.gpr[5] as u32;
@@ -675,7 +2130,32 @@ fn stub_vsnprintf(ctx: &mut PpcContext, mem: &mut GuestMemory, _state: &mut Kern
 
 // ===== Video =====
 
-fn vd_query_video_mode(ctx: &mut PpcContext, mem: &mut GuestMemory, _state: &mut KernelState) {
+/// `VdGetCurrentDisplayGamma(type_ptr, power_ptr)` — matches Canary's
+/// impl (xboxkrnl_video.cc:119). Writes the active gamma ramp kind and
+/// its power exponent. Returning without writing leaves stack garbage for
+/// the game to consume; Sylpheed's boot sequence branches on the type and,
+/// with uninitialized bytes, takes the "unknown gamma → abort init" exit
+/// path — `main()` then returns to the CRT entry and the title terminates
+/// before the render loop starts.
+fn vd_get_current_display_gamma(
+    ctx: &mut PpcContext,
+    mem: &GuestMemory,
+    _state: &mut KernelState,
+) {
+    let type_ptr = ctx.gpr[3] as u32;
+    let power_ptr = ctx.gpr[4] as u32;
+    if type_ptr != 0 {
+        mem.write_u32(type_ptr, 2); // BT.709 / TV gamma — the Xbox 360 default
+    }
+    if power_ptr != 0 {
+        // IEEE-754 bits of 2.2222223_f32 (0x400E_38E4), matching Canary's
+        // `kernel_display_gamma_power` cvar default (0x4011_C720 is ~2.2778).
+        mem.write_u32(power_ptr, 2.222_222_3_f32.to_bits());
+    }
+    ctx.gpr[3] = 0;
+}
+
+fn vd_query_video_mode(ctx: &mut PpcContext, mem: &GuestMemory, _state: &mut KernelState) {
     let mode_ptr = ctx.gpr[3] as u32;
     if mode_ptr != 0 {
         mem.write_u32(mode_ptr, 1280);
@@ -687,46 +2167,395 @@ fn vd_query_video_mode(ctx: &mut PpcContext, mem: &mut GuestMemory, _state: &mut
     ctx.gpr[3] = 0;
 }
 
-fn vd_get_system_command_buffer(ctx: &mut PpcContext, mem: &mut GuestMemory, state: &mut KernelState) {
-    // r3 = cmd_buffer_ptr_ptr, r4 = cmd_buffer_size_ptr
-    let buf_ptr_ptr = ctx.gpr[3] as u32;
-    let buf_size_ptr = ctx.gpr[4] as u32;
-
-    if state.gpu_command_buffer == 0 {
-        // Allocate a 64KB command buffer
-        if let Some(addr) = state.heap_alloc(0x10000, mem) {
-            state.gpu_command_buffer = addr;
+fn vd_get_system_command_buffer(
+    ctx: &mut PpcContext,
+    mem: &GuestMemory,
+    state: &mut KernelState,
+) {
+    // Matches `VdGetSystemCommandBuffer_entry` in
+    // `xenia-canary/src/xenia/kernel/xboxkrnl/xboxkrnl_video.cc:330-334`:
+    //   void VdGetSystemCommandBuffer_entry(lpunknown_t p0_ptr, lpunknown_t p1_ptr) {
+    //     p0_ptr.Zero(0x94);
+    //     xe::store_and_swap<uint32_t>(p0_ptr, 0xBEEF0000);
+    //     xe::store_and_swap<uint32_t>(p1_ptr, 0xBEEF0001);
+    //   }
+    // Games pass two out-pointers; the first points at a 148-byte block they
+    // expect zeroed, and the first dword of each block is a "token" that
+    // xenia-canary hard-codes. The tokens aren't further dereferenced — they
+    // are later fed back to Vd* calls and checked for non-zero.
+    let p0_ptr = ctx.gpr[3] as u32;
+    let p1_ptr = ctx.gpr[4] as u32;
+    if p0_ptr != 0 {
+        for i in (0..0x94u32).step_by(4) {
+            mem.write_u32(p0_ptr + i, 0);
         }
+        mem.write_u32(p0_ptr, 0xBEEF_0000);
     }
+    if p1_ptr != 0 {
+        mem.write_u32(p1_ptr, 0xBEEF_0001);
+    }
+    state.gpu_command_buffer = p0_ptr; // kept for informational use in --ui HUD
+    ctx.gpr[3] = 0;
+}
 
-    if buf_ptr_ptr != 0 {
-        mem.write_u32(buf_ptr_ptr, state.gpu_command_buffer);
-    }
-    if buf_size_ptr != 0 {
-        mem.write_u32(buf_size_ptr, 0x10000);
+fn vd_is_hsio_training_succeeded(ctx: &mut PpcContext, _mem: &GuestMemory, _state: &mut KernelState) {
+    ctx.gpr[3] = 1; // TRUE
+}
+
+fn vd_initialize_ring_buffer(ctx: &mut PpcContext, _mem: &GuestMemory, state: &mut KernelState) {
+    // Matches `VdInitializeRingBuffer_entry` at
+    // `xenia-canary/src/xenia/kernel/xboxkrnl/xboxkrnl_video.cc:313-319`:
+    //   r3 = ring buffer guest address (physical, WRITE_COMBINE)
+    //   r4 = log2(size) in bytes
+    let ptr = ctx.gpr[3] as u32;
+    let size_log2 = ctx.gpr[4] as u32;
+    state.gpu.initialize_ring_buffer(ptr, size_log2);
+    ctx.gpr[3] = 0;
+}
+
+fn vd_enable_ring_buffer_rptr_writeback(
+    ctx: &mut PpcContext,
+    _mem: &GuestMemory,
+    state: &mut KernelState,
+) {
+    // Matches `VdEnableRingBufferRPtrWriteBack_entry` at
+    // `xenia-canary/src/xenia/kernel/xboxkrnl/xboxkrnl_video.cc:322-326`.
+    let ptr = ctx.gpr[3] as u32;
+    let block_log2 = ctx.gpr[4] as u32;
+    state.gpu.enable_rptr_writeback(ptr, block_log2);
+    ctx.gpr[3] = 0;
+}
+
+fn vd_set_graphics_interrupt_callback(
+    ctx: &mut PpcContext,
+    _mem: &GuestMemory,
+    state: &mut KernelState,
+) {
+    // r3 = callback, r4 = user_data. P6: store the callback so the synthetic
+    // v-sync ticker + PM4_INTERRUPT path can invoke it. Zero means "unregister".
+    let cb = ctx.gpr[3] as u32;
+    let user = ctx.gpr[4] as u32;
+    if cb == 0 {
+        state.interrupts.callback = None;
+        tracing::info!("VdSetGraphicsInterruptCallback: unregistered");
+    } else {
+        state.interrupts.set_callback(cb, user);
+        tracing::info!(
+            "VdSetGraphicsInterruptCallback({:#010x}, {:#010x}) — callback armed",
+            cb,
+            user
+        );
     }
     ctx.gpr[3] = 0;
 }
 
-fn vd_is_hsio_training_succeeded(ctx: &mut PpcContext, _mem: &mut GuestMemory, _state: &mut KernelState) {
-    ctx.gpr[3] = 1; // TRUE
-}
+fn vd_swap(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
+    // Argument order from xenia-canary VdSwap_entry:
+    //   r3  = buffer_ptr        (slot the game reserved in the primary ring)
+    //   r4  = fetch_ptr         (6-dword D3D9 texture fetch header)
+    //   r5  = unk2              (system writeback ptr — ignored here)
+    //   r6  = unk3              (system cmd buf — ignored)
+    //   r7  = unk4              (system cmd buf — ignored)
+    //   r8  = frontbuffer_ptr   (*u32, guest writes its virtual FB address)
+    //   r9  = texture_format_ptr(*u32)
+    //   r10 = color_space_ptr   (*u32)
+    //   stack[0] = width_ptr    (*u32) — we decode from fetch instead
+    //   stack[1] = height_ptr   (*u32) — same
+    let buffer_ptr = ctx.gpr[3] as u32;
+    let fetch_ptr = ctx.gpr[4] as u32;
+    let frontbuffer_ptr = ctx.gpr[8] as u32;
+    let texture_format_ptr = ctx.gpr[9] as u32;
+    let color_space_ptr = ctx.gpr[10] as u32;
 
-fn vd_swap(ctx: &mut PpcContext, _mem: &mut GuestMemory, _state: &mut KernelState) {
-    tracing::info!("VdSwap (frame boundary)");
+    // Decode the D3D9 texture fetch header — 6 dwords. The interesting bits
+    // are base_address (dword_1) and size_2d (dword_2). Mirrors
+    // xenia-canary/src/xenia/gpu/xenos.h xe_gpu_texture_fetch_t.
+    let mut fetch_dwords = [0u32; 6];
+    if fetch_ptr != 0 {
+        for (i, slot) in fetch_dwords.iter_mut().enumerate() {
+            *slot = mem.read_u32(fetch_ptr + (i as u32) * 4);
+        }
+    }
+    // dword_1 bits 12:31 hold base_address shifted right by 12.
+    let frontbuffer_virt = (fetch_dwords[1] >> 12) << 12;
+    // dword_2: width in bits 0..12 (width-1), height in bits 13..25 (height-1).
+    // Fall back to the reported video mode when the fetch is empty.
+    let (width, height) = if fetch_dwords[2] != 0 {
+        let w = (fetch_dwords[2] & 0x1FFF) + 1;
+        let h = ((fetch_dwords[2] >> 13) & 0x1FFF) + 1;
+        (w, h)
+    } else {
+        (1280, 720)
+    };
+    // The guest also writes the virtual frontbuffer address to *frontbuffer_ptr.
+    // Prefer that when the fetch-derived address is zero.
+    let frontbuffer_addr = if frontbuffer_virt != 0 {
+        frontbuffer_virt
+    } else if frontbuffer_ptr != 0 {
+        mem.read_u32(frontbuffer_ptr)
+    } else {
+        0
+    };
+    let texture_format = if texture_format_ptr != 0 {
+        mem.read_u32(texture_format_ptr)
+    } else {
+        0
+    };
+    let color_space = if color_space_ptr != 0 {
+        mem.read_u32(color_space_ptr)
+    } else {
+        0
+    };
+
+    // First-Pixels M2b — commit path, in the three numbered steps below.
+    //
+    // 1) Fill the guest's reserved 64-dword slot with PM4 Type-2 NOPs
+    //    (0x8000_0000). Some titles consume `buffer_ptr..+256` after
+    //    VdSwap returns and assume they're skippable. Matches the prior
+    //    behaviour.
+    if buffer_ptr != 0 {
+        for i in 0..64u32 {
+            mem.write_u32(buffer_ptr + i * 4, 0x8000_0000);
+        }
+    }
+
+    // 2) Advance the ring's write pointer by 64 dwords (the slot the
+    //    game "reserved" via VdSwap's buffer_ptr convention). Despite
+    //    `buffer_ptr` being in the system command buffer rather than the
+    //    primary ring, the 64-dword bump correctly exposes packets the
+    //    game wrote into the primary ring since our last `sync_with_mmio`.
+    //    Empirically (pre-M2b) this path drained 512 packets through 1 B
+    //    guest instructions — the setup packets that D3D9-init writes.
+    //
+    //    M1.5: bump routes through the shared MMIO atomic so both
+    //    backends produce the same observable WPTR sequence. Inline
+    //    picks it up on its next `sync_with_mmio`; threaded's worker
+    //    observes the same atomic.
+    state.gpu.extend_write_ptr_by(64);
+
+    // Drain the exposed packets — the D3D9-init setup the game batched
+    // into the ring plus any leftovers. The synthetic `PM4_XE_SWAP`
+    // packet the prior code wrote at `buffer_ptr` is **not** written
+    // anymore; the drain's `ring.base + rptr*4` walk couldn't find it
+    // anyway (see the pre-M2b `swaps=0 with packets=512` failure mode).
+    //
+    // M1.5: backend-aware drain. Inline: synchronous `sync_with_mmio + drain`.
+    // Threaded: posts `DrainFence` + blocks on reply (1 s defensive timeout
+    // on CPU; 900 ms internal deadline on worker).
+    let drained = state.gpu.drain_to_current_wptr(mem);
+    tracing::debug!(drained, "VdSwap: drained PM4 packets");
+
+    // 3) Fire the swap notification — bumps `swaps_seen`, records
+    //    `last_swap`, enqueues an `InterruptSource::Swap` interrupt for
+    //    the scheduler-round graphics callback path. M1.5: backend-aware;
+    //    threaded sends `NotifyXeSwap` (fire-and-forget).
+    if frontbuffer_addr != 0 && width > 0 && height > 0 {
+        state.gpu.notify_xe_swap(frontbuffer_addr, width, height);
+    }
+
+    // The remaining vd_swap work (UI publish: shader blobs, constants,
+    // texture cache, frontbuffer detile, ui.notify_swap) reads
+    // `state.gpu`'s internal state directly. In threaded mode that state
+    // lives on the worker thread; the UI bridge itself is `None` under
+    // `--gpu-thread` today (run_with_ui panics if both flags are set), so
+    // the early-return below is exact rather than a workaround.
+    let Some(gpu_inline) = state.gpu.as_inline_mut() else {
+        ctx.gpr[3] = 0;
+        return;
+    };
+
+    // Prefer the swap info the executor learned from PM4_XE_SWAP (that's
+    // the source of truth after draining).
+    let swap = gpu_inline.last_swap.unwrap_or(xenia_gpu::SwapNotification {
+        frame_index: gpu_inline.swap_counter,
+        frontbuffer_phys: frontbuffer_addr,
+        width,
+        height,
+    });
+
+    // P3b: publish the shader blob map + constants snapshot to the UI so
+    // the Xenos uber-shader has what it needs to execute captured draws.
+    // Do this before `notify_swap` so by the time the UI processes the
+    // SwapInfo the matching assets are visible through `UiHandles`.
+    if let Some(ref ui) = state.ui {
+        let blobs: std::collections::HashMap<u32, Vec<u32>> = gpu_inline
+            .shader_blobs
+            .iter()
+            .map(|(k, b)| (*k, b.dwords.clone()))
+            .collect();
+        let constants = xenia_gpu::xenos_constants::XenosConstantsBlock::snapshot(
+            &gpu_inline.register_file,
+        );
+        ui.publish_assets(blobs, constants);
+
+        // P5: try to decode the primary texture (fetch constant slot 0).
+        // Slot 0 is the convention most games use for their main bound
+        // texture at draw time; full N-slot binding waits for P6+. If the
+        // slot is unset or the format isn't supported (magenta stub kicks
+        // in host-side), we skip.
+        //
+        // Texture fetch constants live at `CONST_BASE_FETCH + slot*6` in
+        // the register file; we read the 6 dwords, decode the key, hit
+        // the CPU cache (with page-version freshness), and clone the
+        // decoded bytes across the bridge.
+        const TEX_SLOT: u32 = 0;
+        let mut fetch6 = [0u32; 6];
+        for (i, slot) in fetch6.iter_mut().enumerate() {
+            *slot = gpu_inline
+                .register_file
+                .read(xenia_gpu::gpu_system::CONST_BASE_FETCH + TEX_SLOT * 6 + i as u32);
+        }
+        let published = if let Some(key) = xenia_gpu::texture_cache::decode_fetch_constant(fetch6)
+        {
+            // Span over the entire tiled texture footprint to pick the
+            // max page version covering it.
+            let bi = key.format.block_info();
+            let span_bytes = (key.pitch_texels as u32)
+                * (key.height as u32)
+                * (bi.bytes_per_block as u32)
+                / (bi.block_w as u32);
+            let version = mem.max_page_version(key.base_address, span_bytes.max(4));
+            match gpu_inline.texture_cache.ensure_cached(key, version, mem) {
+                Ok(entry) => Some((entry.key, entry.bytes.clone())),
+                Err(e) => {
+                    metrics::counter!(
+                        "gpu.texture.reject",
+                        "reason" => format!("{:?}", e),
+                    )
+                    .increment(1);
+                    None
+                }
+            }
+        } else {
+            None
+        };
+        metrics::gauge!("gpu.texture_cache.entries")
+            .set(gpu_inline.texture_cache.len() as f64);
+        ui.publish_texture(published);
+    }
+    // Notify the UI.
+    if let Some(ui) = state.ui.clone() {
+        let (last_prim, last_verts) = match gpu_inline.last_draw {
+            Some(ds) => {
+                // PrimitiveType variants without Display; encode as raw bits.
+                let code = match ds.primitive {
+                    xenia_gpu::draw_state::PrimitiveType::None => 0,
+                    xenia_gpu::draw_state::PrimitiveType::PointList => 1,
+                    xenia_gpu::draw_state::PrimitiveType::LineList => 2,
+                    xenia_gpu::draw_state::PrimitiveType::LineStrip => 3,
+                    xenia_gpu::draw_state::PrimitiveType::TriangleList => 4,
+                    xenia_gpu::draw_state::PrimitiveType::TriangleFan => 5,
+                    xenia_gpu::draw_state::PrimitiveType::TriangleStrip => 6,
+                    xenia_gpu::draw_state::PrimitiveType::RectangleList => 8,
+                    xenia_gpu::draw_state::PrimitiveType::QuadList => 13,
+                    xenia_gpu::draw_state::PrimitiveType::Unknown(x) => x as u32,
+                };
+                (code, ds.vertex_count)
+            }
+            None => (0, 0),
+        };
+        let instructions_total: u64 = state
+            .scheduler
+            .slots
+            .iter()
+            .flat_map(|slot| slot.runqueue.iter())
+            .map(|t| t.ctx.cycle_count)
+            .sum();
+        // P4: CPU-side detile of the guest frontbuffer. We treat the
+        // frontbuffer as a tiled k_8_8_8_8 image (the overwhelmingly
+        // common format games resolve to), read it out of guest memory,
+        // run it through `tiled_2d` / `detile_2d`, and hand the resulting
+        // linear RGBA8 bytes to the UI via a dedicated bridge closure.
+        // The UI upgrades the previous "no frontbuffer content" placeholder
+        // path to real game output. Failures (OOB reads, malformed fetch
+        // headers) silently skip the publish.
+        if swap.frontbuffer_phys != 0 && swap.width > 0 && swap.height > 0 {
+            let pitch_aligned =
+                xenia_gpu::tiled_address::align_pitch_to_macro_tile(swap.width);
+            let total_tiled_bytes = (pitch_aligned * swap.height * 4) as usize;
+            // The guest address is 32-bit virtual but in the physical heap;
+            // safer to cap the read at the known total size to avoid OOB.
+            let mut tiled = Vec::with_capacity(total_tiled_bytes);
+            let mut ok = true;
+            for i in 0..total_tiled_bytes {
+                // read_u8 is cheap — the VirtualMemory handler returns 0
+                // for unmapped pages so we get a recognisable dark frame
+                // rather than a crash if the address turned out bogus.
+                let addr = swap.frontbuffer_phys.wrapping_add(i as u32);
+                tiled.push(mem.read_u8(addr));
+                if addr < swap.frontbuffer_phys {
+                    ok = false;
+                    break;
+                }
+            }
+            if ok {
+                let mut linear = vec![0u8; (swap.width * swap.height * 4) as usize];
+                if xenia_gpu::tiled_address::detile_2d(
+                    &tiled,
+                    &mut linear,
+                    swap.width,
+                    swap.height,
+                    pitch_aligned,
+                    4,
+                )
+                .is_ok()
+                {
+                    ui.publish_frontbuffer(swap.width, swap.height, linear);
+                }
+            }
+        }
+        ui.notify_swap(
+            crate::ui_bridge::SwapInfo {
+                frontbuffer_addr: swap.frontbuffer_phys,
+                width: swap.width,
+                height: swap.height,
+                texture_format,
+                color_space,
+                frame_index: swap.frame_index,
+                draws_total: gpu_inline.stats.draws_seen,
+                packets_total: gpu_inline.stats.packets_executed,
+                last_draw_prim: last_prim,
+                last_draw_vertex_count: last_verts,
+                indirect_buffer_jumps: gpu_inline.stats.indirect_buffer_jumps,
+                wait_reg_mem_blocks: gpu_inline.stats.wait_reg_mem_blocks,
+                instructions_total,
+                vs_blob_key: gpu_inline.active_vs_key.unwrap_or(0),
+                ps_blob_key: gpu_inline.active_ps_key.unwrap_or(0),
+                resolves_total: gpu_inline.stats.resolves_total,
+                resolves_copied_total: gpu_inline.stats.resolves_copied_total,
+                resolves_skipped_total: gpu_inline.stats.resolves_skipped_total,
+                unique_render_targets: gpu_inline.stats.unique_render_targets,
+                interrupts_delivered: state.interrupts.delivered,
+                interrupts_dropped: state.interrupts.dropped,
+            },
+            mem,
+        );
+    }
+    tracing::info!(
+        frame = swap.frame_index,
+        fb = format_args!("{:#010x}", swap.frontbuffer_phys),
+        width = swap.width,
+        height = swap.height,
+        fmt = texture_format,
+        cs = color_space,
+        drained,
+        buffer_ptr = format_args!("{buffer_ptr:#010x}"),
+        fetch_ptr = format_args!("{fetch_ptr:#010x}"),
+        "VdSwap complete"
+    );
     ctx.gpr[3] = 0;
 }
 
 // ===== Audio =====
 
-fn xaudio_register_render_driver(ctx: &mut PpcContext, _mem: &mut GuestMemory, state: &mut KernelState) {
+fn xaudio_register_render_driver(ctx: &mut PpcContext, _mem: &GuestMemory, state: &mut KernelState) {
     let handle = state.alloc_handle();
     tracing::info!("XAudioRegisterRenderDriverClient: handle={:#x}", handle);
     // r3 = callback_ptr, r4 = driver_ptr -> write handle
     ctx.gpr[3] = 0;
 }
 
-fn xma_create_context(ctx: &mut PpcContext, _mem: &mut GuestMemory, state: &mut KernelState) {
+fn xma_create_context(ctx: &mut PpcContext, _mem: &GuestMemory, state: &mut KernelState) {
     let handle = state.alloc_handle();
     tracing::info!("XMACreateContext: handle={:#x}", handle);
     ctx.gpr[3] = handle as u64;
@@ -734,19 +2563,1016 @@ fn xma_create_context(ctx: &mut PpcContext, _mem: &mut GuestMemory, state: &mut
 
 // ===== Xex =====
 
-fn xex_get_procedure_address(ctx: &mut PpcContext, _mem: &mut GuestMemory, _state: &mut KernelState) {
+fn xex_get_procedure_address(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
+    // Mirrors xenia-canary XexGetProcedureAddress_entry
+    // (xboxkrnl_modules.cc:195): r3 = hmodule, r4 = ordinal,
+    // r5 = lpdword_t out_function_ptr. Returns NTSTATUS in r3; on success
+    // writes the resolved thunk address to *out_function_ptr.
+    let hmodule = ctx.gpr[3] as u32;
     let ordinal = ctx.gpr[4] as u32;
-    tracing::warn!("XexGetProcedureAddress: ordinal={:#x} not found", ordinal);
-    ctx.gpr[3] = 0xC000_0034; // STATUS_OBJECT_NAME_NOT_FOUND
+    let out_ptr = ctx.gpr[5] as u32;
+    if out_ptr != 0 {
+        mem.write_u32(out_ptr, 0);
+    }
+
+    let Some(module) = state.module_id_from_hmodule(hmodule) else {
+        tracing::warn!(
+            "XexGetProcedureAddress: unknown hmodule={:#x} ordinal={:#x}",
+            hmodule,
+            ordinal,
+        );
+        ctx.gpr[3] = STATUS_INVALID_HANDLE;
+        return;
+    };
+    match state.resolve_thunk(module, ordinal as u16) {
+        Some(addr) => {
+            if out_ptr != 0 {
+                mem.write_u32(out_ptr, addr);
+            }
+            ctx.gpr[3] = STATUS_SUCCESS;
+        }
+        None => {
+            tracing::warn!(
+                "XexGetProcedureAddress: ordinal {:#x} not registered for {:?}",
+                ordinal,
+                module,
+            );
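+            // STATUS_OBJECT_NAME_NOT_FOUND == 0xC000_0034 (the value the old
+            // stub returned). Note this is not STATUS_DRIVER_ENTRYPOINT_NOT_FOUND,
+            // which NT defines as 0xC000_0263.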
+            ctx.gpr[3] = STATUS_OBJECT_NAME_NOT_FOUND;
+        }
+    }
 }
 
 // ===== Exception handling =====
 
-fn c_specific_handler(ctx: &mut PpcContext, _mem: &mut GuestMemory, _state: &mut KernelState) {
+fn c_specific_handler(ctx: &mut PpcContext, _mem: &GuestMemory, _state: &mut KernelState) {
     tracing::warn!("__C_specific_handler called (exception handling stub)");
     ctx.gpr[3] = 1; // ExceptionContinueSearch
 }
 
+// ===== Synchronization (events / semaphores / waits) =====
+
+/// Reports whether a wait on `handle` could be satisfied right now.
+///
+/// Events and timers answer with their `signaled` flag, semaphores with
+/// `count > 0`, and thread handles with "has an exit code been recorded".
+/// Unknown handles and every other object kind report `false`.
+pub(crate) fn handle_signaled(state: &KernelState, handle: u32) -> bool {
+    let Some(obj) = state.objects.get(&handle) else {
+        return false;
+    };
+    match obj {
+        KernelObject::Event { signaled, .. } | KernelObject::Timer { signaled, .. } => *signaled,
+        KernelObject::Semaphore { count, .. } => *count > 0,
+        KernelObject::Thread { exit_code, .. } => exit_code.is_some(),
+        _ => false,
+    }
+}
+
+/// Pull a rising signal edge from guest memory into a pointer-keyed
+/// PKEVENT / PKSEMAPHORE / timer shadow before a wait inspects it.
+///
+/// Handle-keyed Nt objects (small integer keys) are owned entirely by the
+/// kernel and are skipped. Pointer-keyed Ke shadows, however, can fall
+/// behind when the guest signals a dispatcher with a plain memory store
+/// (e.g. Sylpheed's graphics-interrupt callback writes `SignalState = 1`
+/// into its user_data struct instead of calling `KeSetEvent`). This reloads
+/// the 32-bit `SignalState` at `ptr + 4` and folds it into the shadow:
+///
+/// * event / timer: a non-zero guest value sets `signaled`; a zero value
+///   never clears it.
+/// * semaphore: the guest value replaces `count` only when it is larger.
+///
+/// Without this, tid=5's render-dispatcher poll loop on the Sylpheed intro
+/// spun 4.5M times per 100M instructions with only 11K resolved wakes — the
+/// callback fired but the shadow stayed unsignaled, so every wait ran out
+/// to `STATUS_TIMEOUT` and the worker never reached its real render path.
+fn refresh_pkevent_shadow_from_guest(state: &mut KernelState, mem: &GuestMemory, ptr: u32) {
+    // Keys below 0x1_0000 are treated as kernel handles, not guest pointers.
+    if ptr < 0x1_0000 {
+        return;
+    }
+    let Some(shadow) = state.objects.get_mut(&ptr) else {
+        return;
+    };
+    let guest_state = mem.read_u32(ptr + 4);
+    match shadow {
+        KernelObject::Event { signaled, .. } | KernelObject::Timer { signaled, .. } => {
+            // Only the rising edge is imported. A guest-side 0 may simply
+            // mean a `KeSetEvent` recorded in the shadow has not been
+            // consumed yet, so clearing stays with `KeResetEvent` and the
+            // auto-reset path in `handle_consume`.
+            *signaled |= guest_state != 0;
+        }
+        KernelObject::Semaphore { count, .. } => {
+            *count = (*count).max(guest_state as i32);
+        }
+        _ => {}
+    }
+}
+
+/// Take one signal slot from `handle` after a wait has been satisfied.
+///
+/// Auto-reset events and timers drop back to unsignaled, manual-reset ones
+/// stay latched, and a semaphore with a positive count loses one unit.
+/// Every other object kind (and an unknown handle) is left untouched.
+/// Callers are expected to have just seen `handle_signaled` return `true`.
+pub(crate) fn handle_consume(state: &mut KernelState, handle: u32) {
+    let Some(obj) = state.objects.get_mut(&handle) else {
+        return;
+    };
+    match obj {
+        KernelObject::Event { manual_reset, signaled, .. }
+        | KernelObject::Timer { manual_reset, signaled, .. } => {
+            if !*manual_reset {
+                *signaled = false;
+            }
+        }
+        // Guarded so the count never drops below zero.
+        KernelObject::Semaphore { count, .. } if *count > 0 => *count -= 1,
+        _ => {}
+    }
+}
+
+/// Park-side bookkeeping: add `r` to the waiter list of `handle` so a later
+/// signal can find and wake it. Only events, semaphores, threads, timers
+/// and mutexes carry a waiter list here; any other object kind, or an
+/// unknown handle, is ignored. A reference that is already queued is not
+/// added a second time.
+pub(crate) fn handle_enqueue_waiter(state: &mut KernelState, handle: u32, r: ThreadRef) {
+    let Some(obj) = state.objects.get_mut(&handle) else {
+        return;
+    };
+    let queue = match obj {
+        KernelObject::Event { waiters, .. }
+        | KernelObject::Semaphore { waiters, .. }
+        | KernelObject::Thread { waiters, .. }
+        | KernelObject::Timer { waiters, .. }
+        | KernelObject::Mutex { waiters, .. } => waiters,
+        _ => return,
+    };
+    if queue.contains(&r) {
+        return;
+    }
+    queue.push(r);
+}
+
+/// Strip `r` from every waiter list it could be sitting on: each kernel
+/// object that exposes one through `waiters_mut`, plus every list in
+/// `cs_waiters`. Called on wake so a thread released by one handle of a
+/// WaitAny set does not stay queued on the remaining handles.
+pub(crate) fn handle_remove_waiter_everywhere(state: &mut KernelState, r: ThreadRef) {
+    state
+        .objects
+        .values_mut()
+        .filter_map(|obj| obj.waiters_mut())
+        .for_each(|waiters| waiters.retain(|w| *w != r));
+    state
+        .cs_waiters
+        .values_mut()
+        .for_each(|list| list.retain(|w| *w != r));
+}
+
+/// Decode the guest `LARGE_INTEGER` timeout at `timeout_ptr` into a wait
+/// deadline on the current HW thread's timebase.
+///
+/// Return value (the outer `Option` is always `Some`; it is kept so the
+/// signature callers match on does not change):
+///
+/// * `Some(None)` — null pointer: wait forever.
+/// * `Some(Some(0))` — the stored timeout is exactly 0: poll, never park.
+/// * `Some(Some(deadline))` — timebase tick at which the wait expires.
+///
+/// Xbox 360 timeouts are signed 100-ns units (negative = relative, positive
+/// = absolute wall-clock). The timebase here is a plain instruction
+/// counter, so the sign is ignored and the magnitude is treated as "ticks
+/// after now", scaled down by 100 so multi-millisecond waits expire quickly.
+pub(crate) fn parse_timeout(state: &KernelState, timeout_ptr: u32, mem: &GuestMemory) -> Option<Option<u64>> {
+    if timeout_ptr == 0 {
+        return Some(None); // wait infinitely
+    }
+    // The high word is read first, the low word at +4. The address wraps
+    // instead of overflowing because the pointer is guest-controlled.
+    let hi = mem.read_u32(timeout_ptr) as i32;
+    let lo = mem.read_u32(timeout_ptr.wrapping_add(4));
+    let raw = (i64::from(hi) << 32) | i64::from(lo);
+    if raw == 0 {
+        return Some(Some(0)); // poll
+    }
+    let hw_id = state.scheduler.current_hw_id().unwrap_or(0);
+    let now = state.scheduler.ctx(hw_id).timebase;
+    // Clamp AFTER scaling. `raw` is known non-zero here, so clamping the
+    // magnitude before the division did nothing and any timeout shorter
+    // than 100 units collapsed to a deadline of `now` — or to 0 when the
+    // timebase is 0, which callers interpret as "poll". A non-zero timeout
+    // must always land at least one tick in the future.
+    let ticks = (raw.unsigned_abs() / 100).max(1);
+    Some(Some(now.saturating_add(ticks)))
+}
+
+/// Map NT pseudo-handles onto real kernel handles, following Canary's
+/// [`ObjectTable::TranslateHandle`](https://github.com/xenia-canary/xenia-canary/blob/canary/src/xenia/kernel/util/object_table.cc):
+///
+/// * `0xFFFFFFFE` — `NtCurrentThread()` → handle of the thread on the
+///   current HW slot (0 if the scheduler has none to report)
+/// * `0xFFFFFFFF` — `NtCurrentProcess()` → 0 (not meaningful in our HLE)
+/// * every other value is returned unchanged
+///
+/// Canary translates centrally in `LookupObject`; there is no equivalent
+/// chokepoint here, so each Ob/Ke/Nt entry point that consumes a handle is
+/// expected to call this first.
+///
+/// Motivation: Sylpheed's worker-thread prologue calls
+/// `ObReferenceObjectByHandle((HANDLE)-2, ...)` (= "get my own thread").
+/// Untranslated, that fails with `STATUS_INVALID_HANDLE`, the worker carries
+/// a null "thread object pointer" into `KeSetAffinityThread` and exits
+/// without running its real body, and the main thread stays parked forever
+/// on the completion event.
+fn resolve_pseudo_handle(state: &KernelState, handle: u32) -> u32 {
+    const NT_CURRENT_PROCESS: u32 = 0xFFFF_FFFF;
+    const NT_CURRENT_THREAD: u32 = 0xFFFF_FFFE;
+    if handle == NT_CURRENT_PROCESS {
+        return 0;
+    }
+    if handle != NT_CURRENT_THREAD {
+        return handle;
+    }
+    let hw_id = state.scheduler.current_hw_id().unwrap_or(0);
+    state.scheduler.thread_handle(hw_id).unwrap_or(0)
+}
+
+/// Lazily register a shadow kernel object for a guest dispatcher pointer
+/// (`PKEVENT` / `PKSEMAPHORE` / KTIMER) on first touch from a `Ke*` sync
+/// function. Pointers below `0x1_0000` and pointers that already have a
+/// shadow are left alone.
+///
+/// Background: on Xenon the `Nt*` family takes `HANDLE` integers (allocated
+/// by us via `alloc_handle`), but the `Ke*` family takes pointers to
+/// dispatcher structs in guest memory. `KeInitializeEvent` is an inline
+/// helper baked into the game's code — it writes the DISPATCHER_HEADER in
+/// place and we never see the call. Without a shadow, a later
+/// `KeSetEvent(&kevent)` misses the handle lookup and silently no-ops,
+/// leaving waiters parked forever. That was the root cause of Sylpheed's
+/// 562K/50M `KeResetEvent` poll-loop on pointer `0x42450b5c`.
+///
+/// The shadow [`KernelObject`] lives in `state.objects` keyed by the guest
+/// pointer (pointers live above the handle range — `next_handle` starts at
+/// `0x1000` and bumps by 4, so collisions with a real handle are impossible
+/// for any sane pointer). Subsequent Ke/Nt operations hit the shadow.
+///
+/// Xenon DISPATCHER_HEADER layout (big-endian):
+///   +0  Type         (u8)  0=NotificationEvent, 1=SynchronizationEvent,
+///                          5=Semaphore, 8=NotificationTimer,
+///                          9=SynchronizationTimer. Any other type (e.g.
+///                          Mutant) gets no shadow.
+///   +1  Absolute     (u8)
+///   +2  Size         (u8)  in u32 words
+///   +3  Inserted     (u8)
+///   +4  SignalState  (i32)
+///   +8  WaitListHead (2 × u32)  LIST_ENTRY
+/// For KSEMAPHORE, `Limit` (i32) follows at +0x10.
+///
+/// Caveat: an existing shadow is never re-read here. Guest code that stores
+/// straight into the dispatcher struct makes the shadow drift; the Ke wait
+/// entry points compensate by calling `refresh_pkevent_shadow_from_guest`
+/// right after this function.
+fn ensure_dispatcher_object(state: &mut KernelState, mem: &GuestMemory, ptr: u32) {
+    // Pointer-vs-handle discriminator: our handles are small (<= low tens
+    // of thousands for any realistic session), so anything at or above
+    // 0x1_0000 is almost certainly a guest pointer.
+    if ptr < 0x1_0000 {
+        return;
+    }
+    // First touch only.
+    if state.objects.contains_key(&ptr) {
+        return;
+    }
+    let header_type = mem.read_u8(ptr);
+    let signal_state = mem.read_u32(ptr + 4);
+    let signaled = signal_state != 0;
+    let shadow = match header_type {
+        // 0 = NotificationEvent (manual-reset), 1 = SynchronizationEvent
+        // (auto-reset).
+        0 | 1 => KernelObject::Event {
+            manual_reset: header_type == 0,
+            signaled,
+            waiters: Vec::new(),
+        },
+        5 => KernelObject::Semaphore {
+            count: signal_state as i32,
+            // `Limit` sits at +0x10; the shadow's max is kept at >= 1.
+            max: (mem.read_u32(ptr + 0x10) as i32).max(1),
+            waiters: Vec::new(),
+        },
+        // KTIMER DISPATCHER_HEADER: type=8 NotificationTimer (manual-reset),
+        // type=9 SynchronizationTimer (auto-reset). Mint a disarmed shadow —
+        // deadline/period live in KTIMER's extended fields (+0x20 onward)
+        // which we don't mirror; games that want the timer armed go through
+        // NtSetTimerEx / KeSetTimer (handle-based), and Sylpheed uses the
+        // handle path exclusively.
+        8 | 9 => KernelObject::Timer {
+            manual_reset: header_type == 8,
+            signaled,
+            deadline: None,
+            period_ticks: 0,
+            period_ms: 0,
+            callback_routine: 0,
+            callback_arg: 0,
+            waiters: Vec::new(),
+        },
+        _ => return,
+    };
+    state.objects.insert(ptr, shadow);
+}
+
+/// Write the WaitAny result index into `gpr[3]` of a thread that is about
+/// to be woken by `signaled_handle`.
+///
+/// Canary's `WaitMultiple` returns `STATUS_WAIT_0 + index` on WaitAny
+/// success and games branch on it. The wait paths pre-populate
+/// `STATUS_SUCCESS` (== WAIT_0), which is only right when the first handle
+/// of the set is the one that fired — any other handle would otherwise look
+/// like an index-0 wake to the caller.
+///
+/// Nothing is written when the thread cannot be resolved, when it is not
+/// blocked (or servicing an IRQ) on a `WaitAny`, or when `signaled_handle`
+/// is not part of its wait set.
+fn set_wake_status_for_waitany(state: &mut KernelState, r: ThreadRef, signaled_handle: u32) {
+    use xenia_cpu::scheduler::{BlockReason, HwState};
+    let Some(thread) = state.scheduler.try_thread_mut(r) else {
+        return;
+    };
+    let slot = match &thread.state {
+        HwState::Blocked(reason) | HwState::ServicingIrq(reason) => match reason {
+            BlockReason::WaitAny { handles, .. } => {
+                handles.iter().position(|&candidate| candidate == signaled_handle)
+            }
+            _ => None,
+        },
+        _ => None,
+    };
+    let Some(slot) = slot else {
+        return;
+    };
+    thread.ctx.gpr[3] = slot as u64;
+}
+
+/// Drain the waiters on `handle` that its current state can satisfy.
+///
+/// * Manual-reset events/timers and exited threads release every queued
+///   waiter in one pass and leave the object's state untouched.
+/// * Auto-reset events/timers and semaphores release the waiter at the
+///   front of the queue, consume one signal via `handle_consume`, and loop
+///   until the object is no longer signaled or the queue is empty.
+///
+/// Each released thread gets its WaitAny index written to `gpr[3]`, is
+/// handed to `scheduler.wake_ref`, and is unlinked from every other waiter
+/// list. Objects of any other kind, and unknown handles, are a no-op.
+///
+/// NOTE(review): the block reason is not consulted here, so a thread parked
+/// with `BlockReason::WaitAll` appears to be released by the first of its
+/// handles to fire — confirm the scheduler re-validates WaitAll wakes.
+pub(crate) fn wake_eligible_waiters(state: &mut KernelState, handle: u32) {
+    loop {
+        let Some(obj) = state.objects.get_mut(&handle) else {
+            return;
+        };
+        // (waiter queue, object is signaled, release everyone at once,
+        //  eat one signal per released waiter)
+        let (queue, ready, fan_out, consume) = match obj {
+            KernelObject::Event {
+                manual_reset,
+                signaled,
+                waiters,
+            }
+            | KernelObject::Timer {
+                manual_reset,
+                signaled,
+                waiters,
+                ..
+            } => (waiters, *signaled, *manual_reset, !*manual_reset),
+            KernelObject::Semaphore { count, waiters, .. } => (waiters, *count > 0, false, true),
+            KernelObject::Thread {
+                exit_code, waiters, ..
+            } => (waiters, exit_code.is_some(), true, false),
+            _ => return,
+        };
+        if !ready || queue.is_empty() {
+            return;
+        }
+        if fan_out {
+            // Take the whole queue at once; the object stays signaled, so
+            // every parked waiter clears in this single pass.
+            for woken in std::mem::take(queue) {
+                set_wake_status_for_waitany(state, woken, handle);
+                // scheduler.wake_ref also loses timed-waits entry
+                state.scheduler.wake_ref(woken);
+                handle_remove_waiter_everywhere(state, woken);
+                if state.audit.enabled {
+                    // One wake record per released thread; `aux` carries
+                    // the status (gpr[3]) that thread will observe.
+                    let status = state.scheduler.thread(woken).ctx.gpr[3];
+                    state.audit_wake(handle, 0, "wake_eligible_waiters/manual", status);
+                }
+            }
+            return;
+        }
+        // Wake-one path: FIFO order, front of the queue first.
+        let winner = queue.remove(0);
+        if consume {
+            handle_consume(state, handle);
+        }
+        set_wake_status_for_waitany(state, winner, handle);
+        state.scheduler.wake_ref(winner);
+        handle_remove_waiter_everywhere(state, winner);
+        if state.audit.enabled {
+            let status = state.scheduler.thread(winner).ctx.gpr[3];
+            state.audit_wake(handle, 0, "wake_eligible_waiters/auto", status);
+        }
+        // Go around again: a semaphore with count left may release more.
+    }
+}
+
+/// `KeSetEvent` export. r3 = PKEVENT (guest pointer). Marks the event
+/// signaled, wakes whatever waiters that satisfies, and returns the
+/// previous signal state (0 or 1) in r3. A pointer that does not resolve
+/// to an event shadow leaves every object untouched and returns 0.
+fn ke_set_event(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
+    let event_ptr = ctx.gpr[3] as u32;
+    // Ke* callers hand us dispatcher pointers the kernel may never have
+    // seen; see `ensure_dispatcher_object` for why the shadow is minted
+    // lazily here.
+    ensure_dispatcher_object(state, mem, event_ptr);
+    let mut was_signaled = false;
+    if let Some(KernelObject::Event { signaled, .. }) = state.objects.get_mut(&event_ptr) {
+        was_signaled = std::mem::replace(signaled, true);
+    }
+    let previous = u64::from(was_signaled);
+    state.audit_signal(event_ptr, ctx.lr as u32, "KeSetEvent", previous);
+    wake_eligible_waiters(state, event_ptr);
+    ctx.gpr[3] = previous;
+}
+
+/// `KeResetEvent` export. r3 = PKEVENT (guest pointer). Clears the event's
+/// signaled flag and returns the previous signal state (0 or 1) in r3; a
+/// pointer that does not resolve to an event shadow returns 0.
+fn ke_reset_event(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
+    let event_ptr = ctx.gpr[3] as u32;
+    ensure_dispatcher_object(state, mem, event_ptr);
+    let mut was_signaled = false;
+    if let Some(KernelObject::Event { signaled, .. }) = state.objects.get_mut(&event_ptr) {
+        was_signaled = std::mem::replace(signaled, false);
+    }
+    ctx.gpr[3] = u64::from(was_signaled);
+}
+
+/// `NtSetEvent` export. r3 = event handle, r4 = optional pointer that
+/// receives the previous signal state. Marks the event signaled, wakes the
+/// waiters that satisfies, and always returns `STATUS_SUCCESS` — a handle
+/// that is not an event changes nothing and reports a previous state of 0.
+fn nt_set_event(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
+    let handle = ctx.gpr[3] as u32;
+    let prev_ptr = ctx.gpr[4] as u32;
+    let previous: u32 = match state.objects.get_mut(&handle) {
+        Some(KernelObject::Event { signaled, .. }) => u32::from(std::mem::replace(signaled, true)),
+        _ => 0,
+    };
+    state.audit_signal(handle, ctx.lr as u32, "NtSetEvent", u64::from(previous));
+    wake_eligible_waiters(state, handle);
+    if prev_ptr != 0 {
+        mem.write_u32(prev_ptr, previous);
+    }
+    ctx.gpr[3] = STATUS_SUCCESS;
+}
+
+/// `NtClearEvent` export. r3 = event handle. Clears the signaled flag when
+/// the handle names an event; returns `STATUS_SUCCESS` either way.
+fn nt_clear_event(ctx: &mut PpcContext, _mem: &GuestMemory, state: &mut KernelState) {
+    let handle = ctx.gpr[3] as u32;
+    match state.objects.get_mut(&handle) {
+        Some(KernelObject::Event { signaled, .. }) => *signaled = false,
+        _ => {}
+    }
+    ctx.gpr[3] = STATUS_SUCCESS;
+}
+
+/// Pulse the event stored under `key`: release current waiters as if the
+/// event had been set, then leave it non-signaled. A manual-reset event
+/// releases *all* parked waiters at once; an auto-reset event releases at
+/// most one (the front of the FIFO), which consumes the pulse.
+///
+/// Returns the signal state from before the pulse (0 or 1), or 0 without
+/// touching anything when `key` does not name an event.
+///
+/// Canary impl: [xboxkrnl_threading.cc::KePulseEvent_entry](xenia-canary/src/xenia/kernel/xboxkrnl/xboxkrnl_threading.cc)
+/// → [xevent.cc::XEvent::Pulse](xenia-canary/src/xenia/kernel/xevent.cc).
+fn pulse_event_on_object(state: &mut KernelState, key: u32) -> u32 {
+    let Some(KernelObject::Event { signaled, .. }) = state.objects.get_mut(&key) else {
+        return 0;
+    };
+    // Latch the event for a moment so `wake_eligible_waiters` applies its
+    // usual wake-all vs wake-one split.
+    let was_signaled = std::mem::replace(signaled, true);
+    wake_eligible_waiters(state, key);
+    // A pulse always ends non-signaled: manual-reset would otherwise stay
+    // latched after the wake, and auto-reset with nobody waiting would
+    // linger signaled until the first wait.
+    if let Some(KernelObject::Event { signaled, .. }) = state.objects.get_mut(&key) {
+        *signaled = false;
+    }
+    u32::from(was_signaled)
+}
+
+/// `KePulseEvent` export. r3 = PKEVENT (guest pointer), r4 = increment,
+/// r5 = wait; only r3 is read. Returns the pre-pulse signal state in r3.
+fn ke_pulse_event(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
+    let event_ptr = ctx.gpr[3] as u32;
+    ensure_dispatcher_object(state, mem, event_ptr);
+    let previous = u64::from(pulse_event_on_object(state, event_ptr));
+    state.audit_signal(event_ptr, ctx.lr as u32, "KePulseEvent", previous);
+    ctx.gpr[3] = previous;
+}
+
+/// `NtPulseEvent` export. r3 = handle, r4 = optional pointer that receives
+/// the pre-pulse signal state. Returns `STATUS_INVALID_HANDLE` when no
+/// object is registered under the (pseudo-handle-resolved) handle,
+/// `STATUS_SUCCESS` otherwise.
+fn nt_pulse_event(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
+    let handle = resolve_pseudo_handle(state, ctx.gpr[3] as u32);
+    let prev_ptr = ctx.gpr[4] as u32;
+    if state.objects.get(&handle).is_none() {
+        ctx.gpr[3] = STATUS_INVALID_HANDLE;
+        return;
+    }
+    let previous = pulse_event_on_object(state, handle);
+    state.audit_signal(handle, ctx.lr as u32, "NtPulseEvent", u64::from(previous));
+    if prev_ptr != 0 {
+        mem.write_u32(prev_ptr, previous);
+    }
+    ctx.gpr[3] = STATUS_SUCCESS;
+}
+
+/// Attempt `*count += adjust`, capped at `max`. Returns `(previous,
+/// updated)`; `updated == false` means the count was left untouched because
+/// the adjustment would have exceeded `max`, overflowed `i32`, or — for a
+/// negative `adjust` — lowered the count. A release may only ever raise the
+/// count; an `adjust` of 0 is accepted as a no-op as long as the current
+/// count is within `max`.
+fn try_release_semaphore(count: &mut i32, max: i32, adjust: i32) -> (i32, bool) {
+    let prev = *count;
+    match prev.checked_add(adjust) {
+        // `new >= prev` rejects negative adjustments, which previously
+        // slipped through the `new <= max` test and could drive the count
+        // below zero.
+        Some(new) if new >= prev && new <= max => {
+            *count = new;
+            (prev, true)
+        }
+        _ => (prev, false),
+    }
+}
+
+/// `KeReleaseSemaphore` export.
+///
+/// Arguments follow `KeReleaseSemaphore(Semaphore, Increment, Adjustment,
+/// Wait)`: r3 = PKSEMAPHORE, r4 = Increment (priority boost, not used
+/// here), r5 = Adjustment, r6 = Wait (not used here). Returns the previous
+/// count in r3.
+fn ke_release_semaphore(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
+    // Ke-form returns the previous count directly (never a status); if the
+    // release would exceed `Limit` the count silently stays put — Canary
+    // `xeKeReleaseSemaphore` at xboxkrnl_threading.cc:707-722 marks the
+    // success return of `ReleaseSemaphore` `[[maybe_unused]]`.
+    let h = ctx.gpr[3] as u32;
+    ensure_dispatcher_object(state, mem, h);
+    // The adjustment is the THIRD argument. Reading r4 (the priority
+    // increment) instead meant a call such as
+    // `KeReleaseSemaphore(&sem, 0, 1, FALSE)` added 0 and never released
+    // the semaphore.
+    let adjust = ctx.gpr[5] as i32;
+    let previous = match state.objects.get_mut(&h) {
+        Some(KernelObject::Semaphore { count, max, .. }) => {
+            let (prev, _updated) = try_release_semaphore(count, *max, adjust);
+            prev
+        }
+        _ => 0,
+    };
+    state.audit_signal(h, ctx.lr as u32, "KeReleaseSemaphore", previous as u64);
+    wake_eligible_waiters(state, h);
+    ctx.gpr[3] = previous as u64;
+}
+
+/// `NtReleaseSemaphore` export. r3 = handle, r4 = release_count,
+/// r5 = previous_count* (optional).
+///
+/// Canary `NtReleaseSemaphore_entry` (xboxkrnl_threading.cc:771-797)
+/// returns `X_STATUS_SEMAPHORE_LIMIT_EXCEEDED` (0xC000_0047) when the
+/// post-release count would exceed `Limit`, AND does NOT update the count
+/// in that case. `previous_count` is written regardless. A handle that does
+/// not name a semaphore returns `STATUS_INVALID_HANDLE` and writes nothing.
+fn nt_release_semaphore(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
+    let handle = resolve_pseudo_handle(state, ctx.gpr[3] as u32);
+    let release = ctx.gpr[4] as i32;
+    let prev_ptr = ctx.gpr[5] as u32;
+    let Some(KernelObject::Semaphore { count, max, .. }) = state.objects.get_mut(&handle) else {
+        ctx.gpr[3] = STATUS_INVALID_HANDLE;
+        return;
+    };
+    let (previous, updated) = try_release_semaphore(count, *max, release);
+    let status = if updated {
+        STATUS_SUCCESS
+    } else {
+        STATUS_SEMAPHORE_LIMIT_EXCEEDED
+    };
+    state.audit_signal(handle, ctx.lr as u32, "NtReleaseSemaphore", previous as u64);
+    // Only a release that actually changed the count can unblock anyone.
+    if updated {
+        wake_eligible_waiters(state, handle);
+    }
+    if prev_ptr != 0 {
+        mem.write_u32(prev_ptr, previous as u32);
+    }
+    ctx.gpr[3] = status;
+}
+
+/// Wait on a single handle, honoring the guest timeout at `timeout_ptr`.
+///
+/// * Already signaled: consume the signal and return `STATUS_SUCCESS`.
+/// * Not signaled and the timeout is exactly 0: return `STATUS_TIMEOUT`
+///   without parking.
+/// * Otherwise: queue the current HW thread on the handle and park it with
+///   `STATUS_SUCCESS` pre-loaded in r3, so a waker resumes the thread at its
+///   caller's return address with success already in place.
+fn do_wait_single(ctx: &mut PpcContext, state: &mut KernelState, handle: u32, timeout_ptr: u32, mem: &GuestMemory) {
+    state.audit_wait(handle, ctx.lr as u32, "do_wait_single", 0);
+    if handle_signaled(state, handle) {
+        handle_consume(state, handle);
+        ctx.gpr[3] = STATUS_SUCCESS;
+        return;
+    }
+    // `None` = wait forever, `Some(0)` = poll, `Some(tick)` = deadline.
+    let deadline = parse_timeout(state, timeout_ptr, mem).flatten();
+    if deadline == Some(0) {
+        ctx.gpr[3] = STATUS_TIMEOUT;
+        return;
+    }
+    let current_ref = state.scheduler.current_ref();
+    handle_enqueue_waiter(state, handle, current_ref);
+    tracing::debug!(
+        "wait_single: hw={} handle={:#x} park{}",
+        current_ref.hw_id,
+        handle,
+        deadline.map_or_else(
+            || String::from(" forever"),
+            |tick| format!(" until_tick={}", tick),
+        )
+    );
+    // Pre-populate the return code — most wakes resolve as STATUS_SUCCESS;
+    // timeouts overwrite via the scheduler's deadline-wake path.
+    ctx.gpr[3] = STATUS_SUCCESS;
+    let reason = BlockReason::WaitAny {
+        handles: vec![handle],
+        deadline,
+    };
+    state.scheduler.park_current(reason);
+}
+
+/// Wait on several handles at once. `wait_all == true` is NT wait type 0
+/// (WaitAll); `false` is wait type 1 (WaitAny).
+///
+/// If the wait is already satisfiable it completes immediately: WaitAll
+/// consumes every handle and returns `STATUS_SUCCESS`; WaitAny consumes the
+/// first signaled handle and returns its index. Otherwise a timeout of
+/// exactly 0 yields `STATUS_TIMEOUT`, and anything else queues the current
+/// HW thread on every handle and parks it.
+fn do_wait_multiple(
+    ctx: &mut PpcContext,
+    state: &mut KernelState,
+    handles: Vec<u32>,
+    wait_all: bool,
+    timeout_ptr: u32,
+    mem: &GuestMemory,
+) {
+    if state.audit.enabled {
+        // aux = (handle_count << 1) | wait_all flag, for the trail.
+        let aux = ((handles.len() as u64) << 1) | u64::from(wait_all);
+        for &h in &handles {
+            state.audit_wait(h, ctx.lr as u32, "do_wait_multiple", aux);
+        }
+    }
+    if wait_all {
+        if handles.iter().all(|&h| handle_signaled(state, h)) {
+            for &h in &handles {
+                handle_consume(state, h);
+            }
+            ctx.gpr[3] = STATUS_SUCCESS;
+            return;
+        }
+    } else if let Some(idx) = handles.iter().position(|&h| handle_signaled(state, h)) {
+        // Canary's `XObject::WaitMultiple` returns the **index** of the
+        // first-signaled handle for WaitAny (`STATUS_WAIT_0 + n`), not
+        // plain `STATUS_SUCCESS`. `STATUS_WAIT_0` is numerically 0, so
+        // index 0 still looks like success, but index 1+ matters: games
+        // commonly dispatch on the index. Sylpheed's worker prologue does
+        // `wait_any([start_event, work_sem])` and branches on the result:
+        // 0 means "start-event fired" (cleanup/exit), 1 means "sem fired"
+        // (run user proc then signal completion). Returning 0 for a sem
+        // wake made the worker always take the cleanup branch and exit
+        // without ever signaling the completion event.
+        handle_consume(state, handles[idx]);
+        ctx.gpr[3] = idx as u64; // STATUS_WAIT_0 + idx
+        return;
+    }
+    // `None` = wait forever, `Some(0)` = poll, `Some(tick)` = deadline.
+    let deadline = parse_timeout(state, timeout_ptr, mem).flatten();
+    if deadline == Some(0) {
+        ctx.gpr[3] = STATUS_TIMEOUT;
+        return;
+    }
+    let current_ref = state.scheduler.current_ref();
+    for &h in &handles {
+        handle_enqueue_waiter(state, h, current_ref);
+    }
+    // Default result for the eventual wake; the wake path overwrites it
+    // with the handle index for WaitAny.
+    ctx.gpr[3] = STATUS_SUCCESS;
+    let reason = if wait_all {
+        BlockReason::WaitAll { handles, deadline }
+    } else {
+        BlockReason::WaitAny { handles, deadline }
+    };
+    state.scheduler.park_current(reason);
+}
+
+/// `NtWaitForSingleObjectEx` export. r3 = handle, r4 = wait_mode,
+/// r5 = alertable, r6 = timeout_ptr; only the handle and the timeout are
+/// read. The outcome is whatever `do_wait_single` leaves in r3.
+fn nt_wait_for_single_object_ex(
+    ctx: &mut PpcContext,
+    mem: &GuestMemory,
+    state: &mut KernelState,
+) {
+    let raw_handle = ctx.gpr[3] as u32;
+    let timeout_ptr = ctx.gpr[6] as u32;
+    let handle = resolve_pseudo_handle(state, raw_handle);
+    do_wait_single(ctx, state, handle, timeout_ptr, mem);
+}
+
+/// `NtSignalAndWaitForSingleObjectEx(signal_handle, wait_handle, wait_mode,
+/// alertable, timeout_ptr)` — signal one kernel object and then wait on
+/// another. Matches Canary's `NtSignalAndWaitForSingleObjectEx_entry`
+/// (xboxkrnl_threading.cc:1103). Common producer/consumer handshake primitive:
+/// producer calls `NSAWFSO(work_done, work_free)` so the consumer's wait
+/// resolves at the same instant the producer starts waiting for the next
+/// bucket.
+///
+/// Returns `STATUS_INVALID_HANDLE` in r3 — without signaling or waiting —
+/// when `wait_handle` names no registered object or `signal_handle` is not
+/// an event or semaphore. Otherwise r3 is whatever `do_wait_single`
+/// produces for `wait_handle`.
+///
+/// Before this export existed games that relied on the primitive saw the
+/// call surface as `unimplemented kernel export`, their threads proceeded
+/// without the signal being fired, and the paired consumer-thread wait
+/// would block indefinitely. Sylpheed's I/O dispatcher uses this for its
+/// async file-query completion signaling.
+fn nt_signal_and_wait_for_single_object_ex(
+    ctx: &mut PpcContext,
+    mem: &GuestMemory,
+    state: &mut KernelState,
+) {
+    // r3 = signal_handle, r4 = wait_handle, r5 = wait_mode, r6 = alertable, r7 = timeout_ptr
+    let signal_handle = resolve_pseudo_handle(state, ctx.gpr[3] as u32);
+    let wait_handle = resolve_pseudo_handle(state, ctx.gpr[4] as u32);
+    let timeout_ptr = ctx.gpr[7] as u32;
+
+    // Canary looks up BOTH objects and fails fast if either is missing.
+    // Validate the wait side before signaling: otherwise the signal fires
+    // and the thread then parks on a handle that has no waiter list, so no
+    // waker can ever reach it.
+    if !state.objects.contains_key(&wait_handle) {
+        ctx.gpr[3] = STATUS_INVALID_HANDLE;
+        return;
+    }
+
+    // Signal phase — mirrors `nt_set_event` for events; a semaphore gains
+    // one unit. Any other object kind (or an unknown handle) is rejected.
+    let signal_prev: u64 = match state.objects.get_mut(&signal_handle) {
+        Some(KernelObject::Event { signaled, .. }) => {
+            let was = *signaled;
+            *signaled = true;
+            was as u64
+        }
+        Some(KernelObject::Semaphore { count, .. }) => {
+            let was = *count as u64;
+            // NOTE(review): the semaphore's `max` is not enforced on this
+            // path — confirm whether the limit check should apply here.
+            *count = count.saturating_add(1);
+            was
+        }
+        _ => {
+            ctx.gpr[3] = STATUS_INVALID_HANDLE;
+            return;
+        }
+    };
+    state.audit_signal(
+        signal_handle,
+        ctx.lr as u32,
+        "NtSignalAndWaitForSingleObjectEx",
+        signal_prev,
+    );
+    wake_eligible_waiters(state, signal_handle);
+
+    // Then fall into the normal single-wait path on wait_handle.
+    do_wait_single(ctx, state, wait_handle, timeout_ptr, mem);
+}
+
+/// `KeWaitForSingleObject` export. r3 = dispatcher object (guest pointer),
+/// r4 = wait_reason, r5 = wait_mode, r6 = alertable, r7 = timeout_ptr; only
+/// the object and the timeout are read.
+///
+/// Before waiting, the pointer gets a shadow object if it has none yet and
+/// the shadow is refreshed from the guest's `SignalState`, so a signal the
+/// guest stored directly into memory is seen by the wait.
+fn ke_wait_for_single_object(
+    ctx: &mut PpcContext,
+    mem: &GuestMemory,
+    state: &mut KernelState,
+) {
+    let timeout_ptr = ctx.gpr[7] as u32;
+    let object = resolve_pseudo_handle(state, ctx.gpr[3] as u32);
+    ensure_dispatcher_object(state, mem, object);
+    refresh_pkevent_shadow_from_guest(state, mem, object);
+    do_wait_single(ctx, state, object, timeout_ptr, mem);
+}
+
+/// `NtWaitForMultipleObjectsEx` — wait on an array of handles read from
+/// guest memory.
+///
+/// Registers: r3 = count, r4 = handles_ptr, r5 = wait_type (0=All, 1=Any),
+/// r6 = wait_mode, r7 = alertable, r8 = timeout_ptr. `wait_mode` and
+/// `alertable` are not read.
+///
+/// `count` comes straight from the guest, so it is bounded before anything
+/// is allocated: more than `MAXIMUM_WAIT_OBJECTS` entries returns
+/// `STATUS_INVALID_PARAMETER` in r3 without reading the array or waiting.
+/// The final status for accepted calls is whatever `do_wait_multiple` sets.
+fn nt_wait_for_multiple_objects_ex(
+    ctx: &mut PpcContext,
+    mem: &GuestMemory,
+    state: &mut KernelState,
+) {
+    // NT limits a single wait to 64 objects.
+    const MAXIMUM_WAIT_OBJECTS: u32 = 64;
+    const STATUS_INVALID_PARAMETER: u64 = 0xC000_000D;
+
+    let count = ctx.gpr[3] as u32;
+    let handles_ptr = ctx.gpr[4] as u32;
+    let wait_type = ctx.gpr[5] as u32;
+    let timeout_ptr = ctx.gpr[8] as u32;
+
+    // Reject absurd counts up front: an unchecked guest value would
+    // otherwise drive a multi-gigabyte `Vec` and a walk over unrelated
+    // guest memory.
+    if count > MAXIMUM_WAIT_OBJECTS {
+        ctx.gpr[3] = STATUS_INVALID_PARAMETER;
+        return;
+    }
+
+    let handles: Vec<u32> = (0..count)
+        .map(|i| {
+            // `i * 4` cannot overflow once `count` is capped; the add wraps
+            // like a 32-bit guest address instead of panicking in debug
+            // builds when `handles_ptr` sits near the top of the space.
+            let slot = handles_ptr.wrapping_add(i * 4);
+            resolve_pseudo_handle(state, mem.read_u32(slot))
+        })
+        .collect();
+    let wait_all = wait_type == 0;
+    do_wait_multiple(ctx, state, handles, wait_all, timeout_ptr, mem);
+}
+
+/// `KeWaitForMultipleObjects` — wait on an array of dispatcher-object
+/// pointers read from guest memory.
+///
+/// Registers: r3 = count, r4 = objects_ptr (array of PKEVENT/PKSEMAPHORE
+/// pointers), r5 = wait_type (0=All, otherwise Any), r6 = wait_reason,
+/// r7 = wait_mode, r8 = alertable, r9 = timeout_ptr, r10 = wait_blocks
+/// (ignored).
+///
+/// `count` comes straight from the guest, so it is bounded before anything
+/// is allocated: more than `MAXIMUM_WAIT_OBJECTS` entries returns
+/// `STATUS_INVALID_PARAMETER` in r3 without reading the array or waiting.
+/// The final status for accepted calls is whatever `do_wait_multiple` sets.
+fn ke_wait_for_multiple_objects(
+    ctx: &mut PpcContext,
+    mem: &GuestMemory,
+    state: &mut KernelState,
+) {
+    // NT limits a single wait to 64 objects.
+    const MAXIMUM_WAIT_OBJECTS: u32 = 64;
+    const STATUS_INVALID_PARAMETER: u64 = 0xC000_000D;
+
+    let count = ctx.gpr[3] as u32;
+    let handles_ptr = ctx.gpr[4] as u32;
+    let wait_type = ctx.gpr[5] as u32;
+    let timeout_ptr = ctx.gpr[9] as u32;
+
+    // Reject absurd counts up front: an unchecked guest value would
+    // otherwise drive a multi-gigabyte `Vec` and a walk over unrelated
+    // guest memory.
+    if count > MAXIMUM_WAIT_OBJECTS {
+        ctx.gpr[3] = STATUS_INVALID_PARAMETER;
+        return;
+    }
+
+    let handles: Vec<u32> = (0..count)
+        .map(|i| {
+            // `i * 4` cannot overflow once `count` is capped; the add wraps
+            // like a 32-bit guest address instead of panicking in debug
+            // builds when `handles_ptr` sits near the top of the space.
+            let slot = handles_ptr.wrapping_add(i * 4);
+            resolve_pseudo_handle(state, mem.read_u32(slot))
+        })
+        .collect();
+    // Each entry is a guest pointer: run the same per-object preparation
+    // the single-object Ke wait performs before the wait itself.
+    for &h in &handles {
+        ensure_dispatcher_object(state, mem, h);
+        refresh_pkevent_shadow_from_guest(state, mem, h);
+    }
+    let wait_all = wait_type == 0;
+    do_wait_multiple(ctx, state, handles, wait_all, timeout_ptr, mem);
+}
+
+/// `KeDelayExecutionThread` — parks the calling thread until the deadline
+/// derived from the guest-supplied interval. r3 is `STATUS_SUCCESS` on
+/// every path.
+fn ke_delay_execution_thread(
+    ctx: &mut PpcContext,
+    mem: &GuestMemory,
+    state: &mut KernelState,
+) {
+    // r3 = wait_mode, r4 = alertable, r5 = interval_ptr (LARGE_INTEGER 100-ns)
+    let interval_ptr = ctx.gpr[5] as u32;
+    ctx.gpr[3] = STATUS_SUCCESS;
+
+    let wake_at = match parse_timeout(state, interval_ptr, mem) {
+        // A zero deadline is yield-like: return immediately without parking.
+        Some(Some(0)) => return,
+        Some(Some(deadline)) => deadline,
+        // No concrete deadline came back (e.g. a NULL interval, which is
+        // unusual for this export): sleep "forever".
+        Some(None) | None => u64::MAX,
+    };
+    state
+        .scheduler
+        .park_current(BlockReason::DelayUntil(wake_at));
+}
+
+/// `NtYieldExecution` — cooperative yield. Takes no guest arguments and
+/// always reports `STATUS_SUCCESS` in r3; memory and kernel state are not
+/// touched.
+fn nt_yield_execution(ctx: &mut PpcContext, _mem: &GuestMemory, _state: &mut KernelState) {
+    // The next round of the scheduler already hands control to another HW
+    // thread, so we don't need to park. Just return success.
+    ctx.gpr[3] = STATUS_SUCCESS;
+}
+
+/// `KeResumeThread` stub: ignores the KTHREAD pointer passed in r3 and
+/// returns 0 in r3 without touching the scheduler.
+fn ke_resume_thread(ctx: &mut PpcContext, _mem: &GuestMemory, state: &mut KernelState) {
+    // r3 = thread_ptr (KTHREAD). We don't track KTHREAD ↔ HW mapping through
+    // guest memory addresses, so accept and succeed. Real NtResumeThread
+    // below handles the handle-based path properly.
+    ctx.gpr[3] = 0;
+    // `state` is intentionally unused; this only silences the warning.
+    let _ = state;
+}
+
+/// `NtResumeThread` — resumes the thread named by a handle and optionally
+/// reports the value returned by the scheduler's `resume_ref` (the previous
+/// suspend count, per the guest-side parameter name).
+fn nt_resume_thread(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
+    // r3 = handle, r4 = prev_suspend_count_ptr (optional, may be null)
+    let thread_handle = ctx.gpr[3] as u32;
+    let count_out = ctx.gpr[4] as u32;
+
+    // NOTE(review): a handle the scheduler does not know is treated as a
+    // no-op — previous count 0 and `STATUS_SUCCESS`. Confirm this leniency
+    // is intended rather than `STATUS_INVALID_HANDLE`.
+    let previous = match state.scheduler.find_by_handle(thread_handle) {
+        Some(thread) => state.scheduler.resume_ref(thread),
+        None => 0,
+    };
+
+    if count_out != 0 {
+        mem.write_u32(count_out, previous);
+    }
+    ctx.gpr[3] = STATUS_SUCCESS;
+}
+
+/// `NtSuspendThread` — suspends the thread named by a handle and optionally
+/// reports the value returned by the scheduler's `suspend_ref` (the
+/// previous suspend count, per the guest-side parameter name).
+fn nt_suspend_thread(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
+    // r3 = handle, r4 = prev_suspend_count_ptr (optional, may be null)
+    let thread_handle = ctx.gpr[3] as u32;
+    let count_out = ctx.gpr[4] as u32;
+
+    // NOTE(review): a handle the scheduler does not know is treated as a
+    // no-op — previous count 0 and `STATUS_SUCCESS`. Confirm this leniency
+    // is intended rather than `STATUS_INVALID_HANDLE`.
+    let previous = match state.scheduler.find_by_handle(thread_handle) {
+        Some(thread) => state.scheduler.suspend_ref(thread),
+        None => 0,
+    };
+
+    if count_out != 0 {
+        mem.write_u32(count_out, previous);
+    }
+    ctx.gpr[3] = STATUS_SUCCESS;
+}
+
+// ===== Object & module lookup =====
+
+/// `XexGetModuleHandle(module_name, hmodule_ptr)` — resolves a module name
+/// to a handle.
+///
+/// r3 = guest C-string with the module name (may be null), r4 = guest
+/// pointer that receives the handle (may be null). Mirrors xenia-canary's
+/// `XexGetModuleHandle_entry` (xboxkrnl_modules.cc:42).
+///
+/// A null or empty name, or `default.xex`, resolves to the title image
+/// base; `xboxkrnl.exe` and `xam.xex` resolve to their own pseudo-handles
+/// so a follow-up `XexGetProcedureAddress` can route to the right ordinal
+/// table. Names compare ASCII case-insensitively. Anything else returns
+/// `X_ERROR_NOT_FOUND` in r3 and leaves `*hmodule_ptr` zeroed.
+fn xex_get_module_handle(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
+    let name_ptr = ctx.gpr[3] as u32;
+    let out_ptr = ctx.gpr[4] as u32;
+    let store_handle = |value: u32| {
+        if out_ptr != 0 {
+            mem.write_u32(out_ptr, value);
+        }
+    };
+
+    // Clear the output first so the not-found path leaves a zero behind.
+    store_handle(0);
+
+    // The name is only read from guest memory when the pointer is non-null.
+    let module_name = (name_ptr != 0).then(|| read_cstring(mem, name_ptr));
+    let resolved: Option<u32> = match module_name.as_deref() {
+        None | Some("") => Some(state.image_base),
+        Some(n) if n.eq_ignore_ascii_case("default.xex") => Some(state.image_base),
+        Some(n) if n.eq_ignore_ascii_case("xboxkrnl.exe") => Some(crate::state::HMODULE_XBOXKRNL),
+        Some(n) if n.eq_ignore_ascii_case("xam.xex") => Some(crate::state::HMODULE_XAM),
+        Some(_) => None,
+    };
+
+    ctx.gpr[3] = match resolved {
+        Some(handle) => {
+            store_handle(handle);
+            STATUS_SUCCESS
+        }
+        None => X_ERROR_NOT_FOUND,
+    };
+}
+
+/// `NtDuplicateObject(handle, new_handle_ptr, options)` — modelled on
+/// Canary's `NtDuplicateObject_entry`:
+///   * r3 = source handle. Pseudo-handles such as `(HANDLE)-2` are common;
+///     Canary notes the call "seems to be used to get the current thread
+///     handle", so the source is resolved before lookup.
+///   * r4 = new_handle_ptr (may be null, in which case nothing is written).
+///   * r5 = options (bit 0 = DUPLICATE_CLOSE_SOURCE).
+///
+/// Canary hands out a fresh handle id refcounting the same `XObject`. This
+/// implementation aliases instead: the *source* handle is written back as
+/// the "new" handle, so both ids the game holds resolve to the same
+/// `KernelObject` entry.
+///
+/// An unknown source writes 0 to `*new_handle_ptr` and returns
+/// `STATUS_INVALID_HANDLE`.
+///
+/// History: an earlier `stub_success` left `*new_handle_ptr` uninitialized.
+/// Sylpheed's thread-dispatch prologue duplicates an event, passes the
+/// duplicate to a worker, and the worker calls `NtSetEvent` on it; with
+/// stack garbage as the handle the set-event lookup failed silently and the
+/// main thread blocked forever on the source event.
+fn nt_duplicate_object(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
+    const DUPLICATE_CLOSE_SOURCE: u32 = 0x0000_0001;
+
+    let source = resolve_pseudo_handle(state, ctx.gpr[3] as u32);
+    let out_ptr = ctx.gpr[4] as u32;
+    let options = ctx.gpr[5] as u32;
+
+    let source_exists = state.objects.contains_key(&source);
+    if out_ptr != 0 {
+        mem.write_u32(out_ptr, if source_exists { source } else { 0 });
+    }
+    if !source_exists {
+        ctx.gpr[3] = STATUS_INVALID_HANDLE;
+        return;
+    }
+
+    // Because the duplicate aliases the source id, a plain duplicate must
+    // add one reference so the later pair of `NtClose` calls (source + dup)
+    // does not destroy the object while one side still uses it. With
+    // `DUPLICATE_CLOSE_SOURCE` Canary closes the source
+    // (xboxkrnl_ob.cc:389); in the aliased model that close cancels the
+    // duplicate's gain, so the count is left alone. Handles without a
+    // refcount entry are skipped.
+    let keeps_source = (options & DUPLICATE_CLOSE_SOURCE) == 0;
+    if keeps_source {
+        if let Some(refs) = state.handle_refcount.get_mut(&source) {
+            *refs += 1;
+        }
+    }
+    ctx.gpr[3] = STATUS_SUCCESS;
+}
+
+/// `ObReferenceObjectByHandle` — translates a handle into the "object
+/// pointer" later Ke* calls expect.
+///
+/// r3 = handle (pseudo-handles are resolved), r4 = object_type (not read),
+/// r5 = out_object_ptr (may be null). A zero or unknown handle yields
+/// `STATUS_INVALID_HANDLE` and a zeroed output.
+fn ob_reference_object_by_handle(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
+    let handle = resolve_pseudo_handle(state, ctx.gpr[3] as u32);
+    let out_ptr = ctx.gpr[5] as u32;
+
+    // No real KTHREAD/KEVENT structs are kept in guest memory, so on
+    // success the handle itself is handed back as a stable cookie.
+    // Downstream Ke* calls that take a "thread pointer" (e.g.
+    // KeSetAffinityThread) look the same value up again, which gives
+    // Canary-like semantics without a guest-visible object struct.
+    let known = handle != 0 && state.objects.contains_key(&handle);
+    let (status, cookie) = if known {
+        (STATUS_SUCCESS, handle)
+    } else {
+        (STATUS_INVALID_HANDLE, 0)
+    };
+
+    if out_ptr != 0 {
+        mem.write_u32(out_ptr, cookie);
+    }
+    ctx.gpr[3] = status;
+}
+
 // ===== Helpers =====
 
 fn read_cstring(mem: &GuestMemory, addr: u32) -> String {
@@ -761,3 +3587,1525 @@ fn read_cstring(mem: &GuestMemory, addr: u32) -> String {
     }
     s
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::sync::Arc;
+    use xenia_memory::page_table::MemoryProtect;
+
+    /// Base address of the single 0x1000-byte guest page that `fresh()`
+    /// commits read/write for the tests. Tests carve it up by offset for
+    /// IOSBs, data buffers, info structures and fake dispatcher headers.
+    const SCRATCH_BASE: u32 = 0x4000_0000;
+
+    /// Builds the fixture every test starts from: a default `PpcContext`,
+    /// guest memory with the scratch page at `SCRATCH_BASE` committed
+    /// read/write, and a `KernelState` that already has an initial thread
+    /// installed with slot 0 marked as the one being visited.
+    ///
+    /// Panics if guest memory cannot be created or the scratch page cannot
+    /// be committed.
+    fn fresh() -> (PpcContext, GuestMemory, KernelState) {
+        let mut mem = GuestMemory::new().expect("memory init");
+        mem.alloc(SCRATCH_BASE, 0x1000, MemoryProtect::READ | MemoryProtect::WRITE)
+            .expect("scratch page must commit");
+        let mut state = KernelState::new();
+        // Under per-slot runqueues, most kernel exports reach through
+        // `scheduler.current` — tests that exercise those paths need a
+        // live thread installed on slot 0 first. Older tests (file I/O
+        // etc.) don't touch it and are unaffected.
+        // NOTE(review): the positional arguments are not self-describing;
+        // two of them point into the upper half of the scratch page —
+        // confirm their meaning against `install_initial_thread`.
+        state.install_initial_thread(
+            PpcContext::default(),
+            0x7000_0000,
+            0x10_0000,
+            SCRATCH_BASE + 0x800,
+            SCRATCH_BASE + 0xC00,
+            0x1000,
+            &mut mem,
+        );
+        state.scheduler.begin_slot_visit(0);
+        // The returned context is a fresh default, separate from the one
+        // given to the initial thread.
+        (PpcContext::default(), mem, state)
+    }
+
+    /// Registers an in-memory file object named `test.bin` that holds
+    /// `bytes` (cursor at 0, no directory-enumeration state) and returns
+    /// the handle allocated for it.
+    fn make_file(state: &mut KernelState, bytes: Vec<u8>) -> u32 {
+        let file = KernelObject::File {
+            path: String::from("test.bin"),
+            // Read the length before `bytes` is moved into the `Arc` below.
+            size: bytes.len() as u64,
+            position: 0,
+            data: Arc::new(bytes),
+            dir_enum_pos: None,
+        };
+        state.alloc_handle_for(file)
+    }
+
+    /// Allocates a handle for a manual-reset event that starts unsignaled
+    /// and has no waiters.
+    fn make_event(state: &mut KernelState) -> u32 {
+        let unsignaled = KernelObject::Event {
+            waiters: Vec::new(),
+            signaled: false,
+            manual_reset: true,
+        };
+        state.alloc_handle_for(unsignaled)
+    }
+
+    /// Returns the signaled flag of the event behind handle `h`; panics if
+    /// `h` does not name an `Event` object.
+    fn event_signaled(state: &KernelState, h: u32) -> bool {
+        let Some(KernelObject::Event { signaled, .. }) = state.objects.get(&h) else {
+            panic!("expected Event at handle {:#x}", h);
+        };
+        *signaled
+    }
+
+    /// Axis 4: `KeSetAffinityThread` actually migrates between slots
+    /// now. Spawn a secondary thread with affinity 0x02 (slot 1 only),
+    /// then call the export with mask 0x20 to move it to slot 5.
+    #[test]
+    fn ke_set_affinity_thread_migrates_and_returns_old() {
+        let (mut ctx, mut mem, mut state) = fresh();
+        // `fresh()` already set up the main thread on slot 0. The worker is
+        // injected directly through `scheduler.spawn` instead of going
+        // through `ex_create_thread`; that keeps the setup small while the
+        // thread still gets a handle (0x2000) and a PCR in the scratch page.
+        use xenia_cpu::scheduler::SpawnParams;
+        let pcr_base = SCRATCH_BASE + 0x500;
+        mem.write_u32(pcr_base + 0x2C, 0xDEAD_BEEF); // sentinel
+        let params = SpawnParams {
+            entry: 0x8200_0000,
+            start_context: 0,
+            stack_base: 0x7200_0000,
+            stack_size: 0x10000,
+            pcr_base,
+            tls_base: 0,
+            thread_handle: 0x2000,
+            guest_tid: 42,
+            create_suspended: false,
+            is_initial: false,
+            tls_slot_count: 0,
+            affinity_mask: 0b0000_0010,
+            priority: 0,
+            ideal_processor: None,
+        };
+        state
+            .scheduler
+            .spawn(params, &mut crate::state::GuestMemoryPcr(&mut mem))
+            .unwrap();
+        // Sanity: the spawn overwrote the sentinel at PCR+0x2C with 1, the
+        // slot the thread landed on.
+        assert_eq!(mem.read_u32(pcr_base + 0x2C), 1);
+
+        // Now call KeSetAffinityThread(handle=0x2000, new_mask=0x20).
+        ctx.gpr[3] = 0x2000;
+        ctx.gpr[4] = 0x20; // slot 5 only
+        ke_set_affinity_thread(&mut ctx, &mut mem, &mut state);
+        // Return value = previous mask = 0x02.
+        assert_eq!(ctx.gpr[3], 0x02);
+        // PCR rewritten to 5.
+        assert_eq!(mem.read_u32(pcr_base + 0x2C), 5);
+        // Thread now on slot 5.
+        let r = state.scheduler.find_by_handle(0x2000).expect("still alive");
+        assert_eq!(r.hw_id, 5);
+    }
+
+    /// Axis 5: `KeSetIdealProcessor` records a hint on the thread and
+    /// returns the previous one; `KeQueryIdealProcessor` reads the stored
+    /// hint back.
+    #[test]
+    fn ke_set_ideal_processor_round_trips() {
+        // The main thread installed by `fresh()` is addressed by 0x1000.
+        const MAIN_THREAD: u64 = 0x1000;
+        const IDEAL_CPU: u64 = 3;
+        let (mut ctx, mut mem, mut state) = fresh();
+
+        ctx.gpr[4] = IDEAL_CPU;
+        ctx.gpr[3] = MAIN_THREAD;
+        ke_set_ideal_processor(&mut ctx, &mut mem, &mut state);
+        // Nothing was set before, so the "previous" value is the 0xFF
+        // unset sentinel.
+        assert_eq!(ctx.gpr[3], 0xFF);
+
+        ctx.gpr[3] = MAIN_THREAD;
+        ke_query_ideal_processor(&mut ctx, &mut mem, &mut state);
+        assert_eq!(ctx.gpr[3], IDEAL_CPU);
+    }
+
+    /// Axis 5: `NtSetInformationThread` with class `ThreadAffinityMask`
+    /// routes through `KernelState::set_affinity` and really migrates the
+    /// thread to the slot named by the new mask.
+    #[test]
+    fn nt_set_information_thread_affinity_migrates() {
+        const MAIN_THREAD: u64 = 0x1000;
+        const THREAD_AFFINITY_MASK_CLASS: u64 = 3;
+        let (mut ctx, mut mem, mut state) = fresh();
+
+        // The new mask is passed by pointer; keep it in the scratch page.
+        // 0x08 has only bit 3 set, i.e. slot 3.
+        let mask_buf = SCRATCH_BASE + 0x40;
+        mem.write_u32(mask_buf, 0x08);
+
+        ctx.gpr[3] = MAIN_THREAD;
+        ctx.gpr[4] = THREAD_AFFINITY_MASK_CLASS;
+        ctx.gpr[5] = u64::from(mask_buf);
+        ctx.gpr[6] = 4; // info_len
+        nt_set_information_thread(&mut ctx, &mut mem, &mut state);
+        assert_eq!(ctx.gpr[3], STATUS_SUCCESS);
+
+        // The main thread must now live on slot 3.
+        let main = state.scheduler.find_by_handle(0x1000).expect("still alive");
+        assert_eq!(main.hw_id, 3);
+    }
+
+    /// Priority wiring: `KeSetBasePriorityThread` stores the value on the
+    /// thread and returns the old one; `KeQueryBasePriorityThread` reads
+    /// the stored value back.
+    #[test]
+    fn ke_set_base_priority_round_trips() {
+        // The main thread installed by `fresh()` is addressed by 0x1000.
+        const MAIN_THREAD: u64 = 0x1000;
+        let (mut ctx, mut mem, mut state) = fresh();
+
+        // A freshly installed thread starts at priority 0.
+        ctx.gpr[3] = MAIN_THREAD;
+        ke_query_base_priority_thread(&mut ctx, &mut mem, &mut state);
+        assert_eq!(ctx.gpr[3], 0);
+
+        // Raise it to 7; the call hands back the previous value.
+        ctx.gpr[4] = 7;
+        ctx.gpr[3] = MAIN_THREAD;
+        ke_set_base_priority_thread(&mut ctx, &mut mem, &mut state);
+        assert_eq!(ctx.gpr[3], 0, "old priority was 0");
+
+        // The new value is visible to a subsequent query.
+        ctx.gpr[3] = MAIN_THREAD;
+        ke_query_base_priority_thread(&mut ctx, &mut mem, &mut state);
+        assert_eq!(ctx.gpr[3], 7);
+    }
+
+    /// Regression guard: Sylpheed parks a thread on the event it passes to
+    /// `NtReadFile`. The HLE used to ignore r4 and never signaled that
+    /// event, so the wait never released. Completion must signal the event
+    /// whether or not the read succeeds; this case covers the success path.
+    #[test]
+    fn nt_read_file_signals_completion_event_on_success() {
+        let (mut ctx, mut mem, mut state) = fresh();
+        let file = make_file(&mut state, vec![0x11, 0x22, 0x33, 0x44]);
+        let completion = make_event(&mut state);
+        let iosb = SCRATCH_BASE;
+        let buffer = SCRATCH_BASE + 0x100;
+
+        ctx.gpr[3] = u64::from(file);
+        ctx.gpr[4] = u64::from(completion);
+        ctx.gpr[7] = u64::from(iosb);
+        ctx.gpr[8] = u64::from(buffer);
+        ctx.gpr[9] = 4; // length
+        ctx.gpr[10] = 0; // 0 → read from the file cursor
+        nt_read_file(&mut ctx, &mut mem, &mut state);
+
+        assert_eq!(ctx.gpr[3], 0, "STATUS_SUCCESS expected");
+        assert!(event_signaled(&state, completion), "event must be signaled on success");
+    }
+
+    /// A read that starts at end-of-file must still signal the completion
+    /// event.
+    #[test]
+    fn nt_read_file_signals_event_on_eof() {
+        let (mut ctx, mut mem, mut state) = fresh();
+        let file = make_file(&mut state, vec![0x01, 0x02]);
+        // Put the cursor at the end of the two-byte file by poking the
+        // object directly (no preliminary read is issued).
+        if let Some(KernelObject::File { position, .. }) = state.objects.get_mut(&file) {
+            *position = 2;
+        }
+        let completion = make_event(&mut state);
+
+        ctx.gpr[3] = u64::from(file);
+        ctx.gpr[4] = u64::from(completion);
+        ctx.gpr[7] = u64::from(SCRATCH_BASE); // iosb
+        ctx.gpr[8] = u64::from(SCRATCH_BASE + 0x100); // buffer
+        ctx.gpr[9] = 4;
+        ctx.gpr[10] = 0;
+        nt_read_file(&mut ctx, &mut mem, &mut state);
+
+        assert!(event_signaled(&state, completion), "EOF path must still signal");
+    }
+
+    /// A read against a file handle that does not exist must still signal
+    /// the completion event.
+    #[test]
+    fn nt_read_file_signals_event_on_invalid_handle() {
+        const BOGUS_FILE: u64 = 0xDEAD_BEEF;
+        let (mut ctx, mut mem, mut state) = fresh();
+        let completion = make_event(&mut state);
+
+        ctx.gpr[3] = BOGUS_FILE;
+        ctx.gpr[4] = u64::from(completion);
+        ctx.gpr[7] = u64::from(SCRATCH_BASE); // iosb
+        ctx.gpr[8] = u64::from(SCRATCH_BASE + 0x100); // buffer
+        ctx.gpr[9] = 4;
+        ctx.gpr[10] = 0;
+        nt_read_file(&mut ctx, &mut mem, &mut state);
+
+        assert!(event_signaled(&state, completion), "invalid-handle path must still signal");
+    }
+
+    /// Plenty of callers pass r4 = 0 (synchronous-wait style). The signal
+    /// helper has to treat that as "no event" instead of panicking or
+    /// corrupting the handle table.
+    #[test]
+    fn nt_read_file_accepts_null_event_handle() {
+        let (mut ctx, mut mem, mut state) = fresh();
+        let file = make_file(&mut state, vec![0xAA; 8]);
+
+        ctx.gpr[3] = u64::from(file);
+        ctx.gpr[4] = 0; // no completion event
+        ctx.gpr[7] = u64::from(SCRATCH_BASE); // iosb
+        ctx.gpr[8] = u64::from(SCRATCH_BASE + 0x100); // buffer
+        ctx.gpr[9] = 8;
+        ctx.gpr[10] = 0;
+        nt_read_file(&mut ctx, &mut mem, &mut state);
+
+        assert_eq!(ctx.gpr[3], 0, "STATUS_SUCCESS expected with null event");
+    }
+
+    /// `NtWriteFile` must signal the completion event in r4 as well.
+    #[test]
+    fn nt_write_file_signals_completion_event() {
+        let (mut ctx, mut mem, mut state) = fresh();
+        let completion = make_event(&mut state);
+
+        // The file handle is arbitrary: it is not consulted on the discard
+        // path.
+        ctx.gpr[3] = 0x1234;
+        ctx.gpr[4] = u64::from(completion);
+        ctx.gpr[7] = u64::from(SCRATCH_BASE); // iosb
+        ctx.gpr[9] = 16;
+        nt_write_file(&mut ctx, &mut mem, &mut state);
+
+        assert!(event_signaled(&state, completion), "write must signal too");
+    }
+
+    /// `FileStandardInformation` must report `Directory=1` for synthesized
+    /// empty-path (device-root) file handles. Sylpheed opens `game:\` with
+    /// `NtCreateFile` and then queries the handle as a disc-validation
+    /// probe; `Directory=0` sends it down its `XamShowDirtyDiscErrorUI`
+    /// path.
+    #[test]
+    fn nt_query_information_file_reports_directory_for_root_synth() {
+        const FILE_STANDARD_INFORMATION: u64 = 5;
+        let (mut ctx, mut mem, mut state) = fresh();
+
+        // Same shape `open_vfs_file` produces when stripping the device
+        // prefix leaves an empty path.
+        let root = KernelObject::File {
+            path: String::new(),
+            size: 0,
+            position: 0,
+            data: Arc::new(Vec::new()),
+            dir_enum_pos: None,
+        };
+        let handle = state.alloc_handle_for(root);
+        let info_buf = SCRATCH_BASE + 0x600;
+
+        ctx.gpr[3] = u64::from(handle);
+        ctx.gpr[4] = u64::from(SCRATCH_BASE); // iosb
+        ctx.gpr[5] = u64::from(info_buf); // file_info
+        ctx.gpr[6] = 24; // length
+        ctx.gpr[7] = FILE_STANDARD_INFORMATION;
+        nt_query_information_file(&mut ctx, &mut mem, &mut state);
+
+        assert_eq!(ctx.gpr[3], 0, "STATUS_SUCCESS expected");
+        let directory_flag = mem.read_u8(info_buf + 21);
+        assert_eq!(
+            directory_flag, 1,
+            "Directory byte must be 1 for root-of-device synth"
+        );
+    }
+
+    /// `NtQueryDirectoryFile` receives an optional completion event in r4
+    /// (Canary `xboxkrnl_io.cc:516`) and the IOSB pointer in r7. The
+    /// handler must signal the event so waiters wake, and must write the
+    /// IOSB through r7 — the earlier stub used r4 for it and clobbered low
+    /// guest memory. With no VFS mounted there are no children, so the
+    /// result is `STATUS_NO_MORE_FILES`; the event still has to fire.
+    #[test]
+    fn nt_query_directory_file_signals_completion_event_and_uses_correct_iosb_reg() {
+        let (mut ctx, mut mem, mut state) = fresh();
+        let completion = make_event(&mut state);
+        // Root-shaped synthesized directory: what `NtCreateFile("game:\\")`
+        // yields when stripping the prefix leaves an empty path.
+        let root = KernelObject::File {
+            path: String::new(),
+            size: 0,
+            position: 0,
+            data: Arc::new(Vec::new()),
+            dir_enum_pos: None,
+        };
+        let dir = state.alloc_handle_for(root);
+        let iosb = SCRATCH_BASE;
+        let out_buf = SCRATCH_BASE + 0x100;
+
+        ctx.gpr[3] = u64::from(dir);
+        ctx.gpr[4] = u64::from(completion);
+        ctx.gpr[7] = u64::from(iosb); // the status must land here
+        ctx.gpr[8] = u64::from(out_buf);
+        ctx.gpr[9] = 128; // length >= 72 (Canary minimum)
+        ctx.gpr[10] = 0;
+        nt_query_directory_file(&mut ctx, &mut mem, &mut state);
+
+        assert_eq!(ctx.gpr[3], STATUS_NO_MORE_FILES);
+        assert_eq!(mem.read_u32(iosb), STATUS_NO_MORE_FILES as u32);
+        assert!(event_signaled(&state, completion), "completion event must be signaled");
+    }
+
+    /// A buffer shorter than Canary's 72-byte minimum must be rejected
+    /// with `STATUS_INFO_LENGTH_MISMATCH`.
+    #[test]
+    fn nt_query_directory_file_rejects_short_buffer() {
+        let (mut ctx, mut mem, mut state) = fresh();
+        let root = KernelObject::File {
+            path: String::new(),
+            size: 0,
+            position: 0,
+            data: Arc::new(Vec::new()),
+            dir_enum_pos: None,
+        };
+        let dir = state.alloc_handle_for(root);
+
+        ctx.gpr[3] = u64::from(dir);
+        ctx.gpr[4] = 0; // no completion event
+        ctx.gpr[7] = 0; // no IOSB
+        ctx.gpr[8] = u64::from(SCRATCH_BASE);
+        ctx.gpr[9] = 16; // below 72
+        ctx.gpr[10] = 0;
+        nt_query_directory_file(&mut ctx, &mut mem, &mut state);
+
+        assert_eq!(ctx.gpr[3], STATUS_INFO_LENGTH_MISMATCH);
+    }
+
+    /// Bare-bones `VfsDevice` backed by a fixed entry list, so
+    /// `NtQueryDirectoryFile` can be driven through a real enumeration
+    /// path without a disc image on disk. Only `list_root` succeeds; file
+    /// reads and stats always fail with `NotFound("stub")`.
+    struct StubVfs {
+        entries: Vec<xenia_vfs::VfsEntry>,
+    }
+
+    impl StubVfs {
+        /// The error every lookup-style method of the stub returns.
+        fn not_found() -> xenia_vfs::VfsError {
+            xenia_vfs::VfsError::NotFound("stub".into())
+        }
+    }
+
+    impl xenia_vfs::VfsDevice for StubVfs {
+        fn name(&self) -> &str {
+            "stub"
+        }
+
+        fn list_root(&self) -> Result<Vec<xenia_vfs::VfsEntry>, xenia_vfs::VfsError> {
+            Ok(self.entries.to_vec())
+        }
+
+        fn read_file(&self, _path: &str) -> Result<Vec<u8>, xenia_vfs::VfsError> {
+            Err(Self::not_found())
+        }
+
+        fn stat(&self, _path: &str) -> Result<xenia_vfs::VfsEntry, xenia_vfs::VfsError> {
+            Err(Self::not_found())
+        }
+    }
+
+    /// Enumerates the root directory for real. The stub VFS lists two
+    /// top-level entries plus one nested entry; `NtQueryDirectoryFile`
+    /// must return the two top-level names and leave the grandchild out.
+    #[test]
+    fn nt_query_directory_file_enumerates_root_children() {
+        let (mut ctx, mut mem, mut state) = fresh();
+        let entry = |name: &str, is_directory: bool, size| xenia_vfs::VfsEntry {
+            name: name.into(),
+            is_directory,
+            size,
+            offset: 0,
+        };
+        state.vfs = Some(Box::new(StubVfs {
+            entries: vec![
+                entry("default.xex", false, 0x1000),
+                entry("dat", true, 0),
+                // A grandchild — must NOT appear in root enumeration.
+                entry("dat/tables.pak", false, 0x2000),
+            ],
+        }));
+        let root = KernelObject::File {
+            path: String::new(),
+            size: 0,
+            position: 0,
+            data: Arc::new(Vec::new()),
+            dir_enum_pos: None,
+        };
+        let dir = state.alloc_handle_for(root);
+        let iosb = SCRATCH_BASE;
+        let out_buf = SCRATCH_BASE + 0x100;
+
+        ctx.gpr[3] = u64::from(dir);
+        ctx.gpr[4] = 0;
+        ctx.gpr[7] = u64::from(iosb);
+        ctx.gpr[8] = u64::from(out_buf);
+        ctx.gpr[9] = 512;
+        ctx.gpr[10] = 0;
+        nt_query_directory_file(&mut ctx, &mut mem, &mut state);
+        assert_eq!(ctx.gpr[3], STATUS_SUCCESS);
+
+        // Walk the returned records. Each record starts with
+        // NextEntryOffset (+0x00, 0 on the last record), carries the name
+        // length at +0x3C and the name bytes from +0x40 (attributes sit at
+        // +0x38 and are not checked here).
+        let mut found: Vec<String> = Vec::new();
+        let mut record = out_buf;
+        loop {
+            let name_len = mem.read_u32(record + 0x3C);
+            let raw_name: Vec<u8> = (0..name_len)
+                .map(|i| mem.read_u8(record + 0x40 + i))
+                .collect();
+            found.push(String::from_utf8(raw_name).unwrap());
+            match mem.read_u32(record) {
+                0 => break,
+                step => record += step,
+            }
+        }
+        assert_eq!(found, vec!["default.xex", "dat"]);
+
+        // The handle's enumeration cursor is now past the end, so asking
+        // again on the same handle must report NO_MORE_FILES.
+        ctx.gpr[3] = u64::from(dir);
+        ctx.gpr[4] = 0;
+        ctx.gpr[7] = u64::from(iosb);
+        ctx.gpr[8] = u64::from(out_buf);
+        ctx.gpr[9] = 512;
+        nt_query_directory_file(&mut ctx, &mut mem, &mut state);
+        assert_eq!(ctx.gpr[3], STATUS_NO_MORE_FILES);
+    }
+
+    /// An unknown directory handle returns `STATUS_INVALID_HANDLE`, the
+    /// same status is stored in the IOSB, and the completion event still
+    /// fires so callers do not hang.
+    #[test]
+    fn nt_query_directory_file_invalid_handle_still_signals() {
+        const BOGUS_DIR: u64 = 0xDEAD_BEEF;
+        let (mut ctx, mut mem, mut state) = fresh();
+        let completion = make_event(&mut state);
+        let iosb = SCRATCH_BASE;
+
+        ctx.gpr[3] = BOGUS_DIR;
+        ctx.gpr[4] = u64::from(completion);
+        ctx.gpr[7] = u64::from(iosb);
+        ctx.gpr[8] = u64::from(SCRATCH_BASE) + 0x100;
+        ctx.gpr[9] = 128;
+        ctx.gpr[10] = 0;
+        nt_query_directory_file(&mut ctx, &mut mem, &mut state);
+
+        assert_eq!(ctx.gpr[3], STATUS_INVALID_HANDLE);
+        assert_eq!(mem.read_u32(iosb), STATUS_INVALID_HANDLE as u32);
+        assert!(event_signaled(&state, completion));
+    }
+
+    /// `NtSignalAndWaitForSingleObjectEx` signals handle A and then does a
+    /// single wait on handle B. Signaling A wakes anything parked on it;
+    /// the caller then parks on B, or returns success straight away when B
+    /// is already signaled (the case exercised here). Canary reference:
+    /// `xboxkrnl_threading.cc:1103` — `XObject::SignalAndWait`.
+    #[test]
+    fn nt_signal_and_wait_signals_first_then_waits() {
+        let (mut ctx, mut mem, mut state) = fresh();
+        // The wait target is signaled up front so the call completes with
+        // success instead of parking.
+        let already_set = KernelObject::Event {
+            manual_reset: true,
+            signaled: true,
+            waiters: Vec::new(),
+        };
+        let wait_on = state.alloc_handle_for(already_set);
+        let to_signal = make_event(&mut state); // starts unsignaled
+
+        ctx.gpr[3] = u64::from(to_signal);
+        ctx.gpr[4] = u64::from(wait_on);
+        ctx.gpr[7] = 0; // null timeout_ptr → infinite; moot, target is set
+        nt_signal_and_wait_for_single_object_ex(&mut ctx, &mut mem, &mut state);
+
+        assert_eq!(ctx.gpr[3], 0, "wait returns STATUS_SUCCESS");
+        assert!(event_signaled(&state, to_signal), "signal handle set");
+    }
+
+    /// A signal handle that names no object must produce
+    /// `STATUS_INVALID_HANDLE` instead of falling through to the wait —
+    /// the same early-return guard Canary has.
+    #[test]
+    fn nt_signal_and_wait_rejects_unknown_signal_handle() {
+        const BOGUS_SIGNAL: u64 = 0xDEAD_BEEF;
+        const BOGUS_WAIT: u64 = 0x1234_5678;
+        let (mut ctx, mut mem, mut state) = fresh();
+
+        ctx.gpr[3] = BOGUS_SIGNAL;
+        ctx.gpr[4] = BOGUS_WAIT;
+        ctx.gpr[7] = 0;
+        nt_signal_and_wait_for_single_object_ex(&mut ctx, &mut mem, &mut state);
+
+        assert_eq!(ctx.gpr[3], STATUS_INVALID_HANDLE);
+    }
+
+    /// `FileNetworkOpenInformation` (class 34) is 56 bytes, and the
+    /// `FileAttributes` field at +48 must contain
+    /// `FILE_ATTRIBUTE_DIRECTORY` (0x10) for root synths. Sylpheed's async
+    /// worker requests this class and branches on the attribute bits; a
+    /// zeroed buffer left `Directory` clear and forced the dirty-disc path.
+    #[test]
+    fn nt_query_information_file_network_open_sets_dir_attribute() {
+        const FILE_NETWORK_OPEN_INFORMATION: u64 = 34;
+        let (mut ctx, mut mem, mut state) = fresh();
+        let root = KernelObject::File {
+            path: String::new(),
+            size: 0,
+            position: 0,
+            data: Arc::new(Vec::new()),
+            dir_enum_pos: None,
+        };
+        let handle = state.alloc_handle_for(root);
+        let info_buf = SCRATCH_BASE + 0x200;
+
+        ctx.gpr[3] = u64::from(handle);
+        ctx.gpr[4] = u64::from(SCRATCH_BASE); // iosb
+        ctx.gpr[5] = u64::from(info_buf);
+        ctx.gpr[6] = 56; // length
+        ctx.gpr[7] = FILE_NETWORK_OPEN_INFORMATION;
+        nt_query_information_file(&mut ctx, &mut mem, &mut state);
+
+        assert_eq!(ctx.gpr[3], 0);
+        let file_attributes = mem.read_u32(info_buf + 48);
+        assert_eq!(file_attributes, 0x10, "FILE_ATTRIBUTE_DIRECTORY expected for root synth");
+    }
+
+    /// Ordinary file paths must keep reporting `Directory=0`, so games
+    /// that read real files (`dat/tables.pak`, `config.ini`) do not see
+    /// them as directories.
+    #[test]
+    fn nt_query_information_file_reports_file_for_normal_path() {
+        const FILE_STANDARD_INFORMATION: u64 = 5;
+        let (mut ctx, mut mem, mut state) = fresh();
+        let regular = KernelObject::File {
+            path: String::from("dat/tables.pak"),
+            size: 964,
+            position: 0,
+            data: Arc::new(vec![0; 964]),
+            dir_enum_pos: None,
+        };
+        let handle = state.alloc_handle_for(regular);
+        let info_buf = SCRATCH_BASE + 0x700;
+
+        ctx.gpr[3] = u64::from(handle);
+        ctx.gpr[4] = u64::from(SCRATCH_BASE); // iosb
+        ctx.gpr[5] = u64::from(info_buf);
+        ctx.gpr[6] = 24; // length
+        ctx.gpr[7] = FILE_STANDARD_INFORMATION;
+        nt_query_information_file(&mut ctx, &mut mem, &mut state);
+
+        let directory_flag = mem.read_u8(info_buf + 21);
+        assert_eq!(directory_flag, 0, "normal file not directory");
+    }
+
+    // ===== PKEVENT shim =====
+
+    /// Lays down a 16-byte DISPATCHER_HEADER at guest address `ptr`.
+    ///   ty: 0 = Notification (manual-reset), 1 = Synchronization
+    ///       (auto-reset), 5 = Semaphore.
+    ///   signal_state: stored as a u32 at +4.
+    fn write_dispatcher_header(mem: &GuestMemory, ptr: u32, ty: u8, signal_state: u32) {
+        // First four bytes: Type, Absolute, Size (in u32 words — four is a
+        // plausible value), Inserted.
+        let leading_bytes = [ty, 0, 4, 0];
+        for (offset, byte) in (0u32..).zip(leading_bytes) {
+            mem.write_u8(ptr + offset, byte);
+        }
+        mem.write_u32(ptr + 4, signal_state);
+        // WaitListHead (8 bytes): zeroed, because the host-side shadow owns
+        // the waiter list.
+        for offset in [8, 12] {
+            mem.write_u32(ptr + offset, 0);
+        }
+    }
+
+    #[test]
+    fn ke_set_event_shadows_pkevent_pointer() {
+        let (mut ctx, mut mem, mut state) = fresh();
+        // Guest-resident synchronization (type=1) event, initially unsignaled.
+        let event_va = SCRATCH_BASE + 0x100;
+        write_dispatcher_header(&mut mem, event_va, 1, 0);
+        ctx.gpr[3] = event_va as u64;
+        ke_set_event(&mut ctx, &mut mem, &mut state);
+        // The call must leave a signaled auto-reset shadow keyed by the pointer.
+        let (is_manual, is_set) = match state.objects.get(&event_va) {
+            Some(KernelObject::Event { manual_reset, signaled, .. }) => (*manual_reset, *signaled),
+            other => panic!("expected Event shadow at pkevent_ptr, got {:?}", other),
+        };
+        assert!(!is_manual, "type=1 must be auto-reset");
+        assert!(is_set, "ke_set_event must signal the shadow");
+    }
+
+    #[test]
+    fn ke_reset_event_shadows_pkevent_pointer() {
+        let (mut ctx, mut mem, mut state) = fresh();
+        // Notification (type=0) event whose guest header already reads
+        // SignalState = 1, so the shadow is minted in the signaled state.
+        let event_va = SCRATCH_BASE + 0x200;
+        write_dispatcher_header(&mut mem, event_va, 0, 1);
+        ctx.gpr[3] = event_va as u64;
+        ke_reset_event(&mut ctx, &mut mem, &mut state);
+        // r3 carries the pre-reset state; the shadow itself must now be clear.
+        assert_eq!(ctx.gpr[3], 1, "previous state must be reported");
+        let (is_manual, is_set) = match state.objects.get(&event_va) {
+            Some(KernelObject::Event { manual_reset, signaled, .. }) => (*manual_reset, *signaled),
+            other => panic!("expected Event shadow, got {:?}", other),
+        };
+        assert!(is_manual, "type=0 must be manual-reset");
+        assert!(!is_set, "ke_reset_event must clear the shadow");
+    }
+
+    /// Full set-then-wait round trip through one guest PKEVENT pointer — the
+    /// contract Sylpheed depends on. Without the shim, KeWait parks on a
+    /// nonexistent handle and KeSet no-ops, so the wait never resolves.
+    #[test]
+    fn ke_set_then_wait_on_pkevent_returns_success() {
+        let (mut ctx, mut mem, mut state) = fresh();
+        let event_va = SCRATCH_BASE + 0x300;
+        write_dispatcher_header(&mut mem, event_va, 1, 0); // synchronization
+        // Step 1: signal through the pointer form.
+        ctx.gpr[3] = event_va as u64;
+        ke_set_event(&mut ctx, &mut mem, &mut state);
+        // Step 2: poll with a zeroed LARGE_INTEGER timeout. The event is already
+        // signaled, so the wait must succeed and consume the auto-reset signal.
+        let zero_timeout_va = SCRATCH_BASE + 0x800;
+        for half in [0, 4] {
+            mem.write_u32(zero_timeout_va + half, 0);
+        }
+        ctx.gpr[7] = zero_timeout_va as u64;
+        ctx.gpr[3] = event_va as u64;
+        ke_wait_for_single_object(&mut ctx, &mut mem, &mut state);
+        assert_eq!(ctx.gpr[3], 0, "STATUS_SUCCESS expected on signaled wait");
+        // Auto-reset: the successful wait must have cleared the shadow again.
+        match state.objects.get(&event_va) {
+            Some(KernelObject::Event { signaled, .. }) => assert!(!*signaled),
+            other => panic!("expected Event shadow, got {:?}", other),
+        }
+    }
+
+    /// Pointer-form semaphore: a guest header of type 5 with `Limit` stored at
+    /// +0x10 must be shadowed, and a release of 1 must bump the count 2 → 3.
+    #[test]
+    fn ke_release_semaphore_shadows_pksemaphore_pointer() {
+        let (mut ctx, mut mem, mut state) = fresh();
+        let sem_va = SCRATCH_BASE + 0x400;
+        write_dispatcher_header(&mut mem, sem_va, 5, 2); // initial count 2
+        mem.write_u32(sem_va + 0x10, 10); // Limit
+        ctx.gpr[4] = 1; // adjust = +1
+        ctx.gpr[3] = sem_va as u64;
+        ke_release_semaphore(&mut ctx, &mut mem, &mut state);
+        let (count_after, limit) = match state.objects.get(&sem_va) {
+            Some(KernelObject::Semaphore { count, max, .. }) => (*count, *max),
+            other => panic!("expected Semaphore shadow, got {:?}", other),
+        };
+        assert_eq!(count_after, 3, "count was 2, +1 → 3");
+        assert_eq!(limit, 10);
+    }
+
+    /// Regression guard for real Nt handles: they live in the low range
+    /// (0x1000 + 4·N), which the shim's lower-bound check (`ptr < 0x1_0000`)
+    /// skips, so `ke_set_event` must keep working on them unchanged.
+    #[test]
+    fn ke_set_event_leaves_nt_handles_intact() {
+        let (mut ctx, mut mem, mut state) = fresh();
+        let handle = state.alloc_handle_for(KernelObject::Event {
+            signaled: false,
+            manual_reset: true,
+            waiters: Vec::new(),
+        });
+        assert!(handle < 0x1_0000, "handle must be in low range");
+        ctx.gpr[3] = handle as u64;
+        ke_set_event(&mut ctx, &mut mem, &mut state);
+        // No new shadow is expected; the pre-existing Event flips to signaled.
+        if let Some(KernelObject::Event { signaled, .. }) = state.objects.get(&handle) {
+            assert!(*signaled);
+        } else {
+            panic!("handle lookup broken");
+        }
+    }
+
+    /// A header whose type byte the shim does not handle (e.g., Mutant=2,
+    /// Timer=8) must not conjure a wrong-typed shadow in the object table.
+    #[test]
+    fn ensure_dispatcher_object_ignores_unknown_type() {
+        let (_ctx, mut mem, mut state) = fresh();
+        let mutant_va = SCRATCH_BASE + 0x500;
+        write_dispatcher_header(&mut mem, mutant_va, 2, 0); // Mutant — unsupported
+        ensure_dispatcher_object(&mut state, &mem, mutant_va);
+        assert!(!state.objects.contains_key(&mutant_va), "no shadow for unknown type");
+    }
+
+    /// Pulsing a manual-reset event through `KePulseEvent` must release every
+    /// parked waiter yet leave the event unsignaled afterwards. This is the
+    /// transient-signal idiom that `NtSetEvent`+`NtClearEvent` cannot express
+    /// atomically.
+    #[test]
+    fn ke_pulse_event_manual_reset_wakes_all_and_leaves_unsignaled() {
+        let (mut ctx, mut mem, mut state) = fresh();
+        let event_va = SCRATCH_BASE + 0x600;
+        write_dispatcher_header(&mut mem, event_va, 0, 0); // manual-reset, unsignaled
+        // A reset call is used here purely to get the shadow object minted.
+        ctx.gpr[3] = event_va as u64;
+        ke_reset_event(&mut ctx, &mut mem, &mut state);
+        // Park two fake waiters. wake_ref silently no-ops on out-of-bounds
+        // refs, so the test only observes the waiter list being drained.
+        if let Some(KernelObject::Event { waiters, .. }) = state.objects.get_mut(&event_va) {
+            for hw_id in [2, 3] {
+                waiters.push(ThreadRef { hw_id, idx: 0, generation: 0 });
+            }
+        } else {
+            panic!("shadow not minted");
+        }
+        // Pulse through the same guest pointer.
+        ctx.gpr[3] = event_va as u64;
+        ke_pulse_event(&mut ctx, &mut mem, &mut state);
+        // r3 reports the pre-pulse state, which was 0 (unsignaled).
+        assert_eq!(ctx.gpr[3], 0);
+        // Afterwards the event must be clear with nobody left on its list.
+        let (is_set, drained) = match state.objects.get(&event_va) {
+            Some(KernelObject::Event { signaled, waiters, .. }) => (*signaled, waiters.is_empty()),
+            _ => panic!("shadow vanished"),
+        };
+        assert!(!is_set, "pulse leaves event non-signaled");
+        assert!(drained, "all manual-reset waiters must be woken");
+    }
+
+    /// Pulsing an auto-reset event wakes exactly one waiter (the head of the
+    /// FIFO) and consumes the transient signal, matching `NtSetEvent` on an
+    /// auto-reset event with no linger.
+    #[test]
+    fn ke_pulse_event_auto_reset_wakes_one() {
+        let (mut ctx, mut mem, mut state) = fresh();
+        let event_va = SCRATCH_BASE + 0x700;
+        write_dispatcher_header(&mut mem, event_va, 1, 0); // auto-reset, unsignaled
+        // Mint the shadow via a reset call before parking anything on it.
+        ctx.gpr[3] = event_va as u64;
+        ke_reset_event(&mut ctx, &mut mem, &mut state);
+        // Two fake waiters; wake_ref silently no-ops on out-of-bounds refs.
+        if let Some(KernelObject::Event { waiters, .. }) = state.objects.get_mut(&event_va) {
+            for hw_id in [2, 3] {
+                waiters.push(ThreadRef { hw_id, idx: 0, generation: 0 });
+            }
+        } else {
+            panic!("shadow not minted");
+        }
+        ctx.gpr[3] = event_va as u64;
+        ke_pulse_event(&mut ctx, &mut mem, &mut state);
+        // One of the two waiters must remain, and the event must be clear.
+        let (is_set, still_parked) = match state.objects.get(&event_va) {
+            Some(KernelObject::Event { signaled, waiters, .. }) => (*signaled, waiters.len()),
+            _ => panic!("shadow vanished"),
+        };
+        assert!(!is_set, "pulse leaves auto-reset event non-signaled");
+        assert_eq!(still_parked, 1, "auto-reset pulse wakes exactly one waiter");
+    }
+
+    /// On a valid handle `NtPulseEvent` must return `STATUS_SUCCESS`, store
+    /// the prior state through the optional `previous_state_ptr` (r4), and
+    /// leave the event cleared. The invalid-handle path has its own test.
+    #[test]
+    fn nt_pulse_event_writes_previous_state_and_clears() {
+        let (mut ctx, mut mem, mut state) = fresh();
+        let prior_state_va = SCRATCH_BASE + 0x10;
+        mem.write_u32(prior_state_va, 0xFFFF_FFFF); // sentinel
+        let handle = state.alloc_handle_for(KernelObject::Event {
+            signaled: true, // initially signaled → prior = 1
+            manual_reset: true,
+            waiters: Vec::new(),
+        });
+        ctx.gpr[4] = prior_state_va as u64;
+        ctx.gpr[3] = handle as u64;
+        nt_pulse_event(&mut ctx, &mut mem, &mut state);
+        assert_eq!(ctx.gpr[3], STATUS_SUCCESS);
+        assert_eq!(mem.read_u32(prior_state_va), 1, "previous state was signaled=1");
+        // The pulse is transient: the object must not stay signaled.
+        if let Some(KernelObject::Event { signaled, .. }) = state.objects.get(&handle) {
+            assert!(!*signaled, "nt_pulse_event must leave event cleared");
+        } else {
+            panic!("handle lost");
+        }
+    }
+
+    #[test]
+    fn nt_pulse_event_invalid_handle_returns_status() {
+        let (mut ctx, mut mem, mut state) = fresh();
+        ctx.gpr[4] = 0; // no previous-state out pointer
+        ctx.gpr[3] = 0xDEAD_BEEF; // not in object table
+        nt_pulse_event(&mut ctx, &mut mem, &mut state);
+        assert_eq!(ctx.gpr[3], STATUS_INVALID_HANDLE);
+    }
+
+    /// When the post-release count would exceed `Limit`, `NtReleaseSemaphore`
+    /// must return `STATUS_SEMAPHORE_LIMIT_EXCEEDED` (0xC000_0047) and must
+    /// *not* update the count. The earlier saturating-add behaviour silently
+    /// clamped to i32::MAX, masking overflow from games that key work-queue
+    /// logic on the status code.
+    #[test]
+    fn nt_release_semaphore_rejects_over_limit() {
+        let (mut ctx, mut mem, mut state) = fresh();
+        let prev_count_va = SCRATCH_BASE + 0x40;
+        mem.write_u32(prev_count_va, 0xFFFF_FFFF);
+        let handle = state.alloc_handle_for(KernelObject::Semaphore {
+            max: 5,
+            count: 3,
+            waiters: Vec::new(),
+        });
+        ctx.gpr[5] = prev_count_va as u64;
+        ctx.gpr[4] = 10; // 3 + 10 = 13 > max=5 → reject
+        ctx.gpr[3] = handle as u64;
+        nt_release_semaphore(&mut ctx, &mut mem, &mut state);
+        assert_eq!(ctx.gpr[3], STATUS_SEMAPHORE_LIMIT_EXCEEDED);
+        assert_eq!(mem.read_u32(prev_count_va), 3, "previous count written even on reject");
+        // The rejected release must leave the stored count where it was.
+        let count_after = match state.objects.get(&handle) {
+            Some(KernelObject::Semaphore { count, .. }) => *count,
+            _ => panic!("handle lost"),
+        };
+        assert_eq!(count_after, 3, "count must not change on reject");
+    }
+
+    /// An in-limit release must bump `count`, return `STATUS_SUCCESS`, and
+    /// write the previous count through the out pointer.
+    #[test]
+    fn nt_release_semaphore_normal_path_updates_count() {
+        let (mut ctx, mut mem, mut state) = fresh();
+        let prev_count_va = SCRATCH_BASE + 0x50;
+        let handle = state.alloc_handle_for(KernelObject::Semaphore {
+            max: 5,
+            count: 2,
+            waiters: Vec::new(),
+        });
+        ctx.gpr[5] = prev_count_va as u64;
+        ctx.gpr[4] = 2; // 2 + 2 = 4 <= 5 → ok
+        ctx.gpr[3] = handle as u64;
+        nt_release_semaphore(&mut ctx, &mut mem, &mut state);
+        assert_eq!(ctx.gpr[3], STATUS_SUCCESS);
+        assert_eq!(mem.read_u32(prev_count_va), 2);
+        match state.objects.get(&handle) {
+            Some(KernelObject::Semaphore { count: count_after, .. }) => assert_eq!(*count_after, 4),
+            _ => panic!("handle lost"),
+        }
+    }
+
+    /// Releasing through a handle that is not in the object table must
+    /// yield `STATUS_INVALID_HANDLE` (as Canary does) without touching state.
+    /// The earlier behaviour returned `STATUS_SUCCESS` with `previous = 0`,
+    /// which a game could not tell apart from a genuine release.
+    #[test]
+    fn nt_release_semaphore_invalid_handle_returns_status() {
+        let (mut ctx, mut mem, mut state) = fresh();
+        ctx.gpr[5] = 0; // no previous-count out pointer
+        ctx.gpr[4] = 1; // release count
+        ctx.gpr[3] = 0xDEAD_BEEF; // value with no object behind it
+        nt_release_semaphore(&mut ctx, &mut mem, &mut state);
+        assert_eq!(ctx.gpr[3], STATUS_INVALID_HANDLE);
+    }
+
+    /// `RtlInitializeCriticalSection` must produce the guest-visible
+    /// X_RTL_CRITICAL_SECTION layout of Canary `xboxkrnl_rtl.cc:536-553`:
+    /// dispatcher-header type=1 at +0x00, lock_count=-1 at +0x10,
+    /// recursion_count=0 at +0x14, owning_thread=0 at +0x18. Before this
+    /// fix xenia-rs put lock_count at +0x04 (inside the dispatcher
+    /// header's signal_state field) and owning_thread at +0x0C (inside
+    /// the WaitListHead). Any game reading a pre-initialized CS from its
+    /// `.data` segment — which Canary's comment at line 533-534 calls
+    /// common — would then see garbage.
+    #[test]
+    fn rtl_initialize_critical_section_lays_out_canary_struct() {
+        let (mut ctx, mut mem, mut state) = fresh();
+        let cs_va = SCRATCH_BASE + 0x100;
+        // Poison all seven words (28 bytes) so every byte we touch is visible.
+        for word in 0..7 {
+            mem.write_u32(cs_va + word * 4, 0xDEAD_BEEF);
+        }
+        ctx.gpr[3] = cs_va as u64;
+        rtl_initialize_critical_section(&mut ctx, &mut mem, &mut state);
+        assert_eq!(mem.read_u8(cs_va + CS_OFFS_TYPE), 1, "type = synchronization");
+        assert_eq!(mem.read_u32(cs_va + CS_OFFS_LOCK_COUNT), 0xFFFF_FFFF, "lock_count = -1");
+        assert_eq!(mem.read_u32(cs_va + CS_OFFS_RECURSION_COUNT), 0);
+        assert_eq!(mem.read_u32(cs_va + CS_OFFS_OWNING_THREAD), 0);
+    }
+
+    /// End-to-end: init → enter → nested enter → leave → leave. The
+    /// CS must roll back through `(lc=1,rc=2) → (lc=0,rc=1) → (lc=-1,
+    /// rc=0,owner=0)` with the correct field offsets, and owner must
+    /// land at +0x18 — not anywhere else.
+    #[test]
+    fn rtl_critical_section_nested_enter_leave_roundtrip() {
+        let (mut ctx, mut mem, mut state) = fresh();
+        // Install a live guest TID on the current HW slot so
+        // `rtl_enter_critical_section`'s `find_by_tid` sees us as a
+        // genuine owner. `find_by_tid` filters out `HwState::Idle` —
+        // the default placeholder state — so we also flip to Ready.
+        // Without both, `owner_is_live` stays false on self-recursion
+        // and the nested-enter branch is never taken.
+        let tid: u32 = 42;
+        // Update the live thread planted by `fresh()` on slot 0 so
+        // `find_by_tid(42)` resolves it.
+        state.scheduler.slots[0].runqueue[0].tid = tid;
+        state.scheduler.slots[0].runqueue[0].state = xenia_cpu::scheduler::HwState::Ready;
+        ctx.thread_id = tid;
+        let cs_ptr = SCRATCH_BASE + 0x200;
+        ctx.gpr[3] = cs_ptr as u64;
+        rtl_initialize_critical_section(&mut ctx, &mut mem, &mut state);
+        // First enter → owner = tid, LC = 0, RC = 1.
+        ctx.gpr[3] = cs_ptr as u64;
+        rtl_enter_critical_section(&mut ctx, &mut mem, &mut state);
+        assert_eq!(mem.read_u32(cs_ptr + CS_OFFS_OWNING_THREAD), tid);
+        assert_eq!(mem.read_u32(cs_ptr + CS_OFFS_LOCK_COUNT) as i32, 0);
+        assert_eq!(mem.read_u32(cs_ptr + CS_OFFS_RECURSION_COUNT), 1);
+        // Nested enter (same tid) → LC = 1, RC = 2.
+        ctx.gpr[3] = cs_ptr as u64;
+        rtl_enter_critical_section(&mut ctx, &mut mem, &mut state);
+        assert_eq!(mem.read_u32(cs_ptr + CS_OFFS_LOCK_COUNT) as i32, 1);
+        assert_eq!(mem.read_u32(cs_ptr + CS_OFFS_RECURSION_COUNT), 2);
+        // First leave → LC = 0, RC = 1, owner stays.
+        ctx.gpr[3] = cs_ptr as u64;
+        rtl_leave_critical_section(&mut ctx, &mut mem, &mut state);
+        assert_eq!(mem.read_u32(cs_ptr + CS_OFFS_OWNING_THREAD), tid);
+        assert_eq!(mem.read_u32(cs_ptr + CS_OFFS_LOCK_COUNT) as i32, 0);
+        assert_eq!(mem.read_u32(cs_ptr + CS_OFFS_RECURSION_COUNT), 1);
+        // Second leave → LC = -1, RC = 0, owner cleared.
+        ctx.gpr[3] = cs_ptr as u64;
+        rtl_leave_critical_section(&mut ctx, &mut mem, &mut state);
+        assert_eq!(mem.read_u32(cs_ptr + CS_OFFS_OWNING_THREAD), 0);
+        assert_eq!(mem.read_u32(cs_ptr + CS_OFFS_LOCK_COUNT) as i32, -1);
+        assert_eq!(mem.read_u32(cs_ptr + CS_OFFS_RECURSION_COUNT), 0);
+    }
+
+    /// `NtSetInformationFile` class 14 (`XFilePositionInformation`) must move
+    /// the file cursor and fill in the IO status block. The new position is
+    /// checked directly on the `KernelObject::File` entry.
+    #[test]
+    fn nt_set_information_file_position_updates_cursor() {
+        let (mut ctx, mut mem, mut state) = fresh();
+        let new_pos_va = SCRATCH_BASE + 0x20;
+        let status_block_va = SCRATCH_BASE + 0x40;
+        mem.write_u64(new_pos_va, 0x40);
+        let handle = make_file(&mut state, vec![0u8; 0x100]);
+        ctx.gpr[7] = 14; // XFilePositionInformation
+        ctx.gpr[6] = 8;
+        ctx.gpr[5] = new_pos_va as u64;
+        ctx.gpr[4] = status_block_va as u64;
+        ctx.gpr[3] = handle as u64;
+        nt_set_information_file(&mut ctx, &mut mem, &mut state);
+        assert_eq!(ctx.gpr[3], STATUS_SUCCESS);
+        assert_eq!(mem.read_u32(status_block_va), STATUS_SUCCESS as u32);
+        assert_eq!(mem.read_u32(status_block_va + 4), 8);
+        match state.objects.get(&handle) {
+            Some(KernelObject::File { position: cursor, .. }) => assert_eq!(*cursor, 0x40),
+            _ => panic!("file handle lost"),
+        }
+    }
+
+    /// The VFS is read-only, so an end-of-file change to a size other than
+    /// the current one must fail with `STATUS_UNSUCCESSFUL` — mirroring
+    /// Canary's error path when `file->SetLength(...)` cannot be honoured.
+    #[test]
+    fn nt_set_information_file_truncate_to_different_size_fails() {
+        let (mut ctx, mut mem, mut state) = fresh();
+        let new_eof_va = SCRATCH_BASE + 0x80;
+        mem.write_u64(new_eof_va, 0x200); // new EOF != current 0x100
+        let handle = make_file(&mut state, vec![0u8; 0x100]);
+        ctx.gpr[7] = 20; // XFileEndOfFileInformation
+        ctx.gpr[6] = 8;
+        ctx.gpr[5] = new_eof_va as u64;
+        ctx.gpr[4] = 0;
+        ctx.gpr[3] = handle as u64;
+        nt_set_information_file(&mut ctx, &mut mem, &mut state);
+        assert_eq!(ctx.gpr[3], STATUS_UNSUCCESSFUL);
+    }
+
+    #[test]
+    fn nt_set_information_file_invalid_class_returns_status() {
+        let (mut ctx, mut mem, mut state) = fresh();
+        let handle = make_file(&mut state, Vec::new());
+        ctx.gpr[3] = handle as u64;
+        for reg in 4..=6 {
+            ctx.gpr[reg] = 0;
+        }
+        ctx.gpr[7] = 999; // not a defined class
+        nt_set_information_file(&mut ctx, &mut mem, &mut state);
+        assert_eq!(ctx.gpr[3], STATUS_INVALID_INFO_CLASS);
+    }
+
+    #[test]
+    fn nt_set_information_file_short_buffer_returns_length_mismatch() {
+        let (mut ctx, mut mem, mut state) = fresh();
+        let handle = make_file(&mut state, Vec::new());
+        ctx.gpr[7] = 14;
+        ctx.gpr[6] = 4; // class 14 needs 8
+        ctx.gpr[5] = SCRATCH_BASE as u64;
+        ctx.gpr[4] = 0;
+        ctx.gpr[3] = handle as u64;
+        nt_set_information_file(&mut ctx, &mut mem, &mut state);
+        assert_eq!(ctx.gpr[3], STATUS_INFO_LENGTH_MISMATCH);
+    }
+
+    /// `KeReleaseSemaphore` never reports errors, yet the count must still
+    /// be capped at `Limit`: Canary's underlying primitive
+    /// `XSemaphore::ReleaseSemaphore` enforces the cap even though the Ke
+    /// wrapper discards the success bool.
+    #[test]
+    fn ke_release_semaphore_silently_caps_at_limit() {
+        let (mut ctx, mut mem, mut state) = fresh();
+        let sem_va = SCRATCH_BASE + 0x60;
+        // Dispatcher header: type=5 (Semaphore), signal_state/count=4, Limit=5.
+        write_dispatcher_header(&mut mem, sem_va, 5, 4);
+        mem.write_u32(sem_va + 0x10, 5); // Limit
+        ctx.gpr[4] = 10; // 4 + 10 > 5 → reject silently
+        ctx.gpr[3] = sem_va as u64;
+        ke_release_semaphore(&mut ctx, &mut mem, &mut state);
+        assert_eq!(ctx.gpr[3], 4, "Ke returns previous count even on cap");
+        // The shadow's count must be exactly what it was before the call.
+        let count_after = match state.objects.get(&sem_va) {
+            Some(KernelObject::Semaphore { count, .. }) => *count,
+            _ => panic!("shadow missing"),
+        };
+        assert_eq!(count_after, 4, "count must not exceed Limit even via Ke-form");
+    }
+
+    // ===== Timer subsystem =====
+
+    /// Test helper: store `raw` as a guest LARGE_INTEGER — high u32 at `ptr`,
+    /// low u32 at `ptr + 4` — the form `parse_timeout` / `nt_set_timer_ex` read.
+    fn write_large_integer(mem: &GuestMemory, ptr: u32, raw: i64) {
+        let (high, low) = ((raw >> 32) as u32, raw as u32);
+        mem.write_u32(ptr, high);
+        mem.write_u32(ptr + 4, low);
+    }
+
+    /// Creating a timer with type 1 (SynchronizationTimer) must return
+    /// `STATUS_SUCCESS`, write the new handle through r3's out pointer, and
+    /// mint an auto-reset `KernelObject::Timer` that is unsignaled, unarmed
+    /// and has no waiters.
+    #[test]
+    fn nt_create_timer_sync_type_creates_auto_reset() {
+        let (mut ctx, mut mem, mut state) = fresh();
+        let out_handle_va = SCRATCH_BASE + 0x20;
+        ctx.gpr[5] = 1; // SynchronizationTimer
+        ctx.gpr[4] = 0; // obj_attributes — ignored
+        ctx.gpr[3] = out_handle_va as u64;
+        nt_create_timer(&mut ctx, &mut mem, &mut state);
+        assert_eq!(ctx.gpr[3], STATUS_SUCCESS);
+        let handle = mem.read_u32(out_handle_va);
+        // Look the freshly written handle up and check every field a new
+        // timer is expected to start with.
+        match state.objects.get(&handle) {
+            Some(KernelObject::Timer { manual_reset, signaled, deadline, waiters, .. }) => {
+                assert!(!*manual_reset, "type=1 is SynchronizationTimer (auto-reset)");
+                assert!(!*signaled);
+                assert!(deadline.is_none());
+                assert!(waiters.is_empty());
+            }
+            other => panic!("expected Timer at handle {:#x}, got {:?}", handle, other),
+        }
+    }
+
+    #[test]
+    fn nt_create_timer_notification_type_creates_manual_reset() {
+        let (mut ctx, mut mem, mut state) = fresh();
+        let out_handle_va = SCRATCH_BASE + 0x20;
+        ctx.gpr[5] = 0; // NotificationTimer
+        ctx.gpr[3] = out_handle_va as u64;
+        nt_create_timer(&mut ctx, &mut mem, &mut state);
+        assert_eq!(ctx.gpr[3], STATUS_SUCCESS);
+        let minted_handle = mem.read_u32(out_handle_va);
+        match state.objects.get(&minted_handle) {
+            Some(KernelObject::Timer { manual_reset, .. }) => assert!(*manual_reset),
+            _ => panic!("expected Timer"),
+        }
+    }
+
+    #[test]
+    fn nt_create_timer_invalid_type_returns_invalid_parameter() {
+        let (mut ctx, mut mem, mut state) = fresh();
+        ctx.gpr[5] = 42; // invalid
+        ctx.gpr[3] = (SCRATCH_BASE + 0x20) as u64;
+        nt_create_timer(&mut ctx, &mut mem, &mut state);
+        assert_eq!(ctx.gpr[3], 0xC000_000D); // STATUS_INVALID_PARAMETER
+        let minted_timer = state
+            .objects
+            .values()
+            .any(|o| matches!(o, KernelObject::Timer { .. }));
+        assert!(
+            !minted_timer,
+            "no Timer object must be minted on invalid type"
+        );
+    }
+
+    /// Arming a timer through `nt_set_timer_ex` must queue exactly one
+    /// pending fire for that handle and record the same deadline on the
+    /// `KernelObject::Timer`, with the signaled flag cleared.
+    #[test]
+    fn nt_set_timer_ex_schedules_pending_fire() {
+        let (mut ctx, mut mem, mut state) = fresh();
+        // Create the timer first.
+        let out_handle_va = SCRATCH_BASE + 0x20;
+        ctx.gpr[5] = 1;
+        ctx.gpr[3] = out_handle_va as u64;
+        nt_create_timer(&mut ctx, &mut mem, &mut state);
+        let handle = mem.read_u32(out_handle_va);
+
+        // Arm with -1_000_000 (= 100ms) relative.
+        let due_time_va = SCRATCH_BASE + 0x40;
+        write_large_integer(&mut mem, due_time_va, -1_000_000);
+        ctx.gpr[9] = 0; // period_ms
+        ctx.gpr[8] = 0; // resume
+        ctx.gpr[7] = 0; // routine_arg
+        ctx.gpr[6] = 1; // mode
+        ctx.gpr[5] = 0; // routine
+        ctx.gpr[4] = due_time_va as u64;
+        ctx.gpr[3] = handle as u64;
+        nt_set_timer_ex(&mut ctx, &mut mem, &mut state);
+        assert_eq!(ctx.gpr[3], STATUS_SUCCESS);
+
+        // The queue entry and the object must agree on the handle and on
+        // the deadline that was computed for it.
+        assert_eq!(state.pending_timer_fires.len(), 1);
+        let (fire_at, queued_handle) = state.pending_timer_fires[0];
+        assert_eq!(queued_handle, handle);
+        assert!(fire_at > 0, "deadline must advance past now");
+        let (armed_at, is_set) = match state.objects.get(&handle) {
+            Some(KernelObject::Timer { deadline, signaled, .. }) => (*deadline, *signaled),
+            _ => panic!("Timer vanished"),
+        };
+        assert_eq!(armed_at, Some(fire_at));
+        assert!(!is_set, "arm clears any stale signaled flag");
+    }
+
+    /// Arming the same timer twice must leave a single entry in
+    /// `pending_timer_fires`: the second `nt_set_timer_ex` replaces the
+    /// first instead of queueing a duplicate.
+    #[test]
+    fn nt_set_timer_ex_rearm_replaces_entry() {
+        let (mut ctx, mut mem, mut state) = fresh();
+        let out_handle_va = SCRATCH_BASE + 0x20;
+        ctx.gpr[5] = 1;
+        ctx.gpr[3] = out_handle_va as u64;
+        nt_create_timer(&mut ctx, &mut mem, &mut state);
+        let handle = mem.read_u32(out_handle_va);
+        let due_time_va = SCRATCH_BASE + 0x40;
+        // First arm.
+        write_large_integer(&mut mem, due_time_va, -1_000_000);
+        ctx.gpr[9] = 0;
+        ctx.gpr[8] = 0;
+        ctx.gpr[7] = 0;
+        ctx.gpr[6] = 1;
+        ctx.gpr[5] = 0;
+        ctx.gpr[4] = due_time_va as u64;
+        ctx.gpr[3] = handle as u64;
+        nt_set_timer_ex(&mut ctx, &mut mem, &mut state);
+        // Second arm (later). Only r3 and r4 are reloaded here; the test
+        // leaves the remaining argument registers as they are.
+        write_large_integer(&mut mem, due_time_va, -5_000_000);
+        ctx.gpr[4] = due_time_va as u64;
+        ctx.gpr[3] = handle as u64;
+        nt_set_timer_ex(&mut ctx, &mut mem, &mut state);
+        assert_eq!(state.pending_timer_fires.len(), 1, "rearm must replace, not duplicate");
+    }
+
+    #[test]
+    fn nt_cancel_timer_disarms_and_writes_zero() {
+        let (mut ctx, mut mem, mut state) = fresh();
+        let out_handle_va = SCRATCH_BASE + 0x20;
+        ctx.gpr[5] = 1;
+        ctx.gpr[3] = out_handle_va as u64;
+        nt_create_timer(&mut ctx, &mut mem, &mut state);
+        let handle = mem.read_u32(out_handle_va);
+        let due_time_va = SCRATCH_BASE + 0x40;
+        write_large_integer(&mut mem, due_time_va, -1_000_000);
+        ctx.gpr[9] = 0;
+        ctx.gpr[8] = 0;
+        ctx.gpr[7] = 0;
+        ctx.gpr[6] = 1;
+        ctx.gpr[5] = 0;
+        ctx.gpr[4] = due_time_va as u64;
+        ctx.gpr[3] = handle as u64;
+        nt_set_timer_ex(&mut ctx, &mut mem, &mut state);
+        // Timer is armed; now cancel it and inspect what is left behind.
+        let prev_state_va = SCRATCH_BASE + 0x60;
+        mem.write_u32(prev_state_va, 0xDEAD_BEEF); // sentinel — must be overwritten to 0
+        ctx.gpr[4] = prev_state_va as u64;
+        ctx.gpr[3] = handle as u64;
+        nt_cancel_timer(&mut ctx, &mut mem, &mut state);
+        assert_eq!(ctx.gpr[3], STATUS_SUCCESS);
+        assert_eq!(mem.read_u32(prev_state_va), 0, "canary always writes 0");
+        assert!(state.pending_timer_fires.is_empty());
+        match state.objects.get(&handle) {
+            Some(KernelObject::Timer { deadline, .. }) => assert!(deadline.is_none()),
+            _ => panic!("Timer gone after cancel — must stay in table"),
+        }
+    }
+
+    #[test]
+    fn nt_cancel_timer_invalid_handle_returns_status() {
+        let (mut ctx, mut mem, mut state) = fresh();
+        ctx.gpr[4] = 0; // no out pointer
+        ctx.gpr[3] = 0xDEAD_BEEF; // value with no object behind it
+        nt_cancel_timer(&mut ctx, &mut mem, &mut state);
+        assert_eq!(ctx.gpr[3], STATUS_INVALID_HANDLE);
+    }
+
+    /// A thread blocked on an auto-reset timer must be moved back to Ready
+    /// once the timebase reaches the deadline and `fire_due_timers` runs.
+    /// The fire must also consume the signal, empty the waiter list and
+    /// remove the pending queue entry.
+    #[test]
+    fn timer_fire_wakes_auto_reset_waiter_and_consumes_signal() {
+        let (mut ctx, mut mem, mut state) = fresh();
+        // Arm an auto-reset timer with deadline slightly in the future.
+        let out_handle_va = SCRATCH_BASE + 0x20;
+        ctx.gpr[5] = 1;
+        ctx.gpr[3] = out_handle_va as u64;
+        nt_create_timer(&mut ctx, &mut mem, &mut state);
+        let handle = mem.read_u32(out_handle_va);
+        // Deadline: now + 1000 ticks. Directly set the state to avoid
+        // dependence on parse_timeout's divisor.
+        let fire_at = state.scheduler.ctx(0).timebase + 1000;
+        match state.objects.get_mut(&handle) {
+            Some(KernelObject::Timer { deadline: slot, .. }) => *slot = Some(fire_at),
+            _ => panic!("no timer"),
+        }
+        state.arm_timer(handle, fire_at);
+
+        // Park the current (initial) thread on a WaitForSingleObject of the
+        // timer handle. `do_wait_single` sees signaled=false, enqueues the
+        // current ref, and parks via `park_current`.
+        ctx.gpr[6] = 0; // NULL timeout → wait forever
+        ctx.gpr[3] = handle as u64;
+        nt_wait_for_single_object_ex(&mut ctx, &mut mem, &mut state);
+        let parked_ref = state.scheduler.current_ref();
+        match state.scheduler.thread(parked_ref).state {
+            xenia_cpu::scheduler::HwState::Blocked(_) => {}
+            ref other => panic!("expected Blocked after wait, got {:?}", other),
+        }
+
+        // Advance time past the deadline; fire_due_timers should signal and
+        // wake the waiter.
+        state.scheduler.advance_all_timebases_to(fire_at);
+        let fired = state.fire_due_timers();
+        assert!(fired);
+
+        // After fire on auto-reset: signaled cleared via handle_consume, no
+        // pending entry, waiter promoted to Ready.
+        match state.objects.get(&handle) {
+            Some(KernelObject::Timer { signaled, waiters, .. }) => {
+                assert!(!*signaled, "auto-reset consumed on single-waiter wake");
+                assert!(waiters.is_empty(), "waiter dequeued by wake_eligible_waiters");
+            }
+            _ => panic!("timer lost"),
+        }
+        assert!(state.pending_timer_fires.is_empty());
+        // The thread that was Blocked above must be runnable again.
+        match state.scheduler.thread(parked_ref).state {
+            xenia_cpu::scheduler::HwState::Ready => {}
+            ref other => panic!("expected Ready after fire, got {:?}", other),
+        }
+    }
+
+    /// Firing a manual-reset (NotificationTimer) timer must drain every
+    /// waiter from its list while the timer itself stays signaled
+    /// afterwards.
+    #[test]
+    fn timer_fire_manual_reset_wakes_all_and_stays_signaled() {
+        let (mut ctx, mut mem, mut state) = fresh();
+        let out_handle_va = SCRATCH_BASE + 0x20;
+        ctx.gpr[5] = 0; // NotificationTimer (manual-reset)
+        ctx.gpr[3] = out_handle_va as u64;
+        nt_create_timer(&mut ctx, &mut mem, &mut state);
+        let handle = mem.read_u32(out_handle_va);
+
+        // Deadline 1000 ticks ahead, written onto the object and then armed.
+        let fire_at = state.scheduler.ctx(0).timebase + 1000;
+        match state.objects.get_mut(&handle) {
+            Some(KernelObject::Timer { deadline: slot, .. }) => *slot = Some(fire_at),
+            _ => unreachable!(),
+        }
+        state.arm_timer(handle, fire_at);
+
+        // Park two synthetic waiters (out-of-bounds refs — `wake_ref`
+        // silently no-ops on them; we only care about the drain-all
+        // semantics of manual-reset.)
+        match state.objects.get_mut(&handle) {
+            Some(KernelObject::Timer { waiters, .. }) => {
+                for hw_id in [2, 3] {
+                    waiters.push(ThreadRef { hw_id, idx: 0, generation: 0 });
+                }
+            }
+            _ => unreachable!(),
+        }
+
+        state.scheduler.advance_all_timebases_to(fire_at);
+        assert!(state.fire_due_timers());
+        // Inspect the timer once the fire has been processed.
+        // Both the signaled flag and the waiter list are checked.
+        let (is_set, drained) = match state.objects.get(&handle) {
+            Some(KernelObject::Timer { signaled, waiters, .. }) => (*signaled, waiters.is_empty()),
+            _ => unreachable!(),
+        };
+        assert!(is_set, "manual-reset stays signaled after fire");
+        assert!(drained, "manual-reset drains all waiters");
+    }
+
+    #[test]
+    fn periodic_timer_rearms_after_fire() {
+        let (mut ctx, mut mem, mut state) = fresh();
+        let handle_ptr = SCRATCH_BASE + 0x20;
+        ctx.gpr[3] = handle_ptr as u64;
+        ctx.gpr[5] = 1; // timer type 1 — presumably SynchronizationTimer (auto-reset); confirm
+        nt_create_timer(&mut ctx, &mut mem, &mut state);
+        let handle = mem.read_u32(handle_ptr);
+
+        let now = state.scheduler.ctx(0).timebase;
+        let deadline = now + 1000;
+        let period_ticks = 500; // re-arm interval, in the same tick units as `deadline`
+        match state.objects.get_mut(&handle) {
+            Some(KernelObject::Timer {
+                deadline: obj_d,
+                period_ticks: obj_p,
+                ..
+            }) => {
+                *obj_d = Some(deadline);
+                *obj_p = period_ticks;
+            }
+            _ => unreachable!(),
+        }
+        state.arm_timer(handle, deadline);
+
+        state.scheduler.advance_all_timebases_to(deadline);
+        assert!(state.fire_due_timers());
+
+        // After the fire exactly one pending entry must remain, at deadline + period_ticks.
+        assert_eq!(state.pending_timer_fires.len(), 1);
+        let (new_deadline, h) = state.pending_timer_fires[0];
+        assert_eq!(h, handle);
+        assert_eq!(new_deadline, deadline + period_ticks);
+        match state.objects.get(&handle) {
+            Some(KernelObject::Timer { deadline: obj_d, .. }) => {
+                assert_eq!(*obj_d, Some(new_deadline)); // object and queue must agree
+            }
+            _ => unreachable!(),
+        }
+    }
+
+    #[test]
+    fn nt_close_scrubs_pending_timer_fires() {
+        let (mut ctx, mut mem, mut state) = fresh();
+        let handle_ptr = SCRATCH_BASE + 0x20;
+        ctx.gpr[3] = handle_ptr as u64;
+        ctx.gpr[5] = 1;
+        nt_create_timer(&mut ctx, &mut mem, &mut state);
+        let handle = mem.read_u32(handle_ptr);
+        // Arm through NtSetTimerEx: r3 = handle, r4 = pointer to the due time.
+        let due_time_ptr = SCRATCH_BASE + 0x40;
+        write_large_integer(&mut mem, due_time_ptr, -1_000_000); // negative — presumably relative (confirm)
+        ctx.gpr[3] = handle as u64;
+        ctx.gpr[4] = due_time_ptr as u64;
+        ctx.gpr[5] = 0;
+        ctx.gpr[6] = 1;
+        ctx.gpr[7] = 0;
+        ctx.gpr[8] = 0;
+        ctx.gpr[9] = 0;
+        nt_set_timer_ex(&mut ctx, &mut mem, &mut state);
+        assert_eq!(state.pending_timer_fires.len(), 1); // arming queued exactly one fire
+        // Close the handle while that fire is still pending.
+        ctx.gpr[3] = handle as u64;
+        nt_close(&mut ctx, &mut mem, &mut state);
+        assert!(
+            state.pending_timer_fires.is_empty(),
+            "nt_close must scrub pending timer entry"
+        );
+        assert!(!state.objects.contains_key(&handle));
+    }
+
+    #[test]
+    fn advance_to_next_wake_returns_ref_and_reason_for_timeout_path() {
+        let (mut ctx, mut mem, mut state) = fresh();
+        // Create an event (unsignaled), park current thread on it with a
+        // finite deadline via NtWaitForSingleObjectEx.
+        let ev = state.alloc_handle_for(KernelObject::Event {
+            manual_reset: false,
+            signaled: false,
+            waiters: Vec::new(),
+        });
+        let timeout_ptr = SCRATCH_BASE + 0x80;
+        write_large_integer(&mut mem, timeout_ptr, -1_000_000);
+        ctx.gpr[3] = ev as u64;
+        ctx.gpr[6] = timeout_ptr as u64; // non-NULL timeout pointer → finite wait
+        nt_wait_for_single_object_ex(&mut ctx, &mut mem, &mut state);
+        let initial_ref = state.scheduler.current_ref();
+
+        // The current thread's ref must now sit in the event's waiter list.
+        match state.objects.get(&ev) {
+            Some(KernelObject::Event { waiters, .. }) => {
+                assert!(waiters.contains(&initial_ref), "waiter enqueued");
+            }
+            _ => unreachable!(),
+        }
+
+        // Advance past the deadline. `advance_to_next_wake` returns the
+        // woken ref + its block reason; the main loop would then stamp
+        // STATUS_TIMEOUT and scrub waiter lists via `handle_timeout_wake`.
+        let (r, reason) = state
+            .scheduler
+            .advance_to_next_wake()
+            .expect("deadline exists");
+        assert_eq!(r, initial_ref);
+        state.handle_timeout_wake(r, reason);
+
+        // Post-wake: gpr[3] == STATUS_TIMEOUT (0x102) AND the waiter list
+        // scrubbed. Prior code returned 0 and left the waiter stranded.
+        assert_eq!(state.scheduler.ctx_mut_ref(r).gpr[3], 0x0000_0102);
+        match state.objects.get(&ev) {
+            Some(KernelObject::Event { waiters, .. }) => {
+                assert!(
+                    !waiters.contains(&initial_ref),
+                    "waiter scrubbed from handle list on timeout"
+                );
+            }
+            _ => unreachable!(),
+        }
+    }
+
+    /// Ordinal 0xFB must resolve to `NtSignalAndWaitForSingleObjectEx`,
+    /// matching canary's table. An earlier `NtSetInformationThread`
+    /// registration collided on this ordinal and was removed.
+    #[test]
+    fn ordinal_0xfb_maps_to_nt_signal_and_wait() {
+        let kernel = KernelState::new();
+        let xboxkrnl = crate::state::ModuleId::Xboxkrnl;
+        let resolved = kernel.export_name(xboxkrnl, 0xFB);
+        let name = resolved.expect("0xFB must be registered");
+        assert_eq!(name, "NtSignalAndWaitForSingleObjectEx");
+    }
+
+    /// `KeInitializeSemaphore` must seed the count and limit fields in
+    /// guest memory so that `ensure_dispatcher_object` later mints the
+    /// kernel-side shadow with the caller's parameters — not the
+    /// zero-fill default of `count=0, max=1`.
+    #[test]
+    fn ke_initialize_semaphore_seeds_count_and_limit() {
+        let (mut ctx, mem, mut state) = fresh();
+        let sem_ptr = SCRATCH_BASE + 0x500;
+        ctx.gpr[3] = sem_ptr as u64; // r3 = guest address of the semaphore object
+        ctx.gpr[4] = 3; // r4 = initial count
+        ctx.gpr[5] = 7; // r5 = limit
+        ke_initialize_semaphore(&mut ctx, &mem, &mut state);
+        assert_eq!(mem.read_u8(sem_ptr), 5, "type=5 (semaphore)");
+        assert_eq!(mem.read_u32(sem_ptr + 0x04), 3, "signal_state=count");
+        assert_eq!(mem.read_u32(sem_ptr + 0x10), 7, "limit");
+
+        // Round-trip: KeReleaseSemaphore mints the shadow via
+        // `ensure_dispatcher_object`, which reads the fields we just wrote.
+        ctx.gpr[3] = sem_ptr as u64;
+        ctx.gpr[4] = 1; // amount added to the count by the release
+        ke_release_semaphore(&mut ctx, &mem, &mut state);
+        match state.objects.get(&sem_ptr) {
+            Some(KernelObject::Semaphore { count, max, .. }) => {
+                assert_eq!(*count, 4, "3 + 1 = 4");
+                assert_eq!(*max, 7, "limit must propagate from r5, not default to 1");
+            }
+            other => panic!("expected Semaphore shadow, got {:?}", other),
+        }
+        assert_eq!(ctx.gpr[3], 3, "previous count must be 3 (post-init, pre-release)");
+    }
+
+    /// `XexGetProcedureAddress` must honor r3=hmodule, look up the
+    /// (module, ordinal) in the thunk reverse-map, and write the address
+    /// to *r5. Three branches: success, unknown ordinal, unknown hmodule.
+    #[test]
+    fn xex_get_procedure_address_resolves_registered_thunk() {
+        let (mut ctx, mem, mut state) = fresh();
+        state.register_thunk(crate::state::ModuleId::Xboxkrnl, 0x12, 0x8200_1234);
+        let out_ptr = SCRATCH_BASE + 0x600;
+
+        // Success path.
+        mem.write_u32(out_ptr, 0xDEAD_BEEF); // poison so a missing write is detectable
+        ctx.gpr[3] = crate::state::HMODULE_XBOXKRNL as u64;
+        ctx.gpr[4] = 0x12; // r4 = ordinal registered above
+        ctx.gpr[5] = out_ptr as u64; // r5 = out pointer; left in place for the later calls
+        xex_get_procedure_address(&mut ctx, &mem, &mut state);
+        assert_eq!(ctx.gpr[3], 0, "STATUS_SUCCESS");
+        assert_eq!(mem.read_u32(out_ptr), 0x8200_1234, "thunk address written");
+
+        // Unknown ordinal: STATUS_OBJECT_NAME_NOT_FOUND, *out cleared.
+        // Reset r3 because the prior call overwrote it with the status code.
+        mem.write_u32(out_ptr, 0xDEAD_BEEF);
+        ctx.gpr[3] = crate::state::HMODULE_XBOXKRNL as u64;
+        ctx.gpr[4] = 0x99;
+        xex_get_procedure_address(&mut ctx, &mem, &mut state);
+        assert_eq!(ctx.gpr[3], 0xC000_0034);
+        assert_eq!(mem.read_u32(out_ptr), 0);
+
+        // Unknown hmodule: STATUS_INVALID_HANDLE. Only r3 is checked here, not *out.
+        ctx.gpr[3] = 0xCAFE_BABE;
+        ctx.gpr[4] = 0x12;
+        xex_get_procedure_address(&mut ctx, &mem, &mut state);
+        assert_eq!(ctx.gpr[3], 0xC000_0008);
+    }
+
+    /// `XexGetModuleHandle` must write distinct handles for the main image,
+    /// xboxkrnl.exe and xam.xex to *r4 (not r3) and return a status in r3;
+    /// unknown names yield 0x48B with *r4 cleared. NOTE(review): Win32
+    /// `ERROR_NOT_FOUND` is 0x490 — confirm 0x48B is the intended miss code.
+    #[test]
+    fn xex_get_module_handle_distinguishes_modules() {
+        let (mut ctx, mem, mut state) = fresh();
+        state.image_base = 0x8200_0000;
+        let out_ptr = SCRATCH_BASE + 0x700;
+        let scratch_str = SCRATCH_BASE + 0x780;
+
+        let call = |name: Option<&str>,
+                    st: &mut KernelState,
+                    mem: &GuestMemory,
+                    ctx: &mut PpcContext|
+         -> (u64, u32) {
+            match name {
+                Some(s) => {
+                    for (i, b) in s.as_bytes().iter().enumerate() {
+                        mem.write_u8(scratch_str + i as u32, *b);
+                    }
+                    mem.write_u8(scratch_str + s.len() as u32, 0); // NUL terminator
+                    ctx.gpr[3] = scratch_str as u64;
+                }
+                None => ctx.gpr[3] = 0, // NULL name pointer (not exercised below)
+            }
+            ctx.gpr[4] = out_ptr as u64;
+            mem.write_u32(out_ptr, 0xDEAD_BEEF); // poison so a missing write is detectable
+            xex_get_module_handle(ctx, mem, st);
+            (ctx.gpr[3], mem.read_u32(out_ptr))
+        };
+
+        let (s_main, h_main) = call(Some(""), &mut state, &mem, &mut ctx);
+        let (s_krnl, h_krnl) = call(Some("xboxkrnl.exe"), &mut state, &mem, &mut ctx);
+        let (s_xam, h_xam) = call(Some("xam.xex"), &mut state, &mem, &mut ctx);
+        let (s_bad, h_bad) = call(Some("nope.xex"), &mut state, &mem, &mut ctx);
+
+        assert_eq!(s_main, 0);
+        assert_eq!(h_main, 0x8200_0000);
+        assert_eq!(s_krnl, 0);
+        assert_eq!(h_krnl, crate::state::HMODULE_XBOXKRNL);
+        assert_eq!(s_xam, 0);
+        assert_eq!(h_xam, crate::state::HMODULE_XAM);
+        assert_eq!(s_bad, 0x0000_048B);
+        assert_eq!(h_bad, 0, "out cleared on miss");
+        assert_ne!(h_main, h_krnl, "main module distinct from xboxkrnl");
+        assert_ne!(h_krnl, h_xam, "xboxkrnl distinct from xam");
+    }
+}
diff --git a/crates/xenia-kernel/src/interrupts.rs b/crates/xenia-kernel/src/interrupts.rs
new file mode 100644
index 0000000..f7d9477
--- /dev/null
+++ b/crates/xenia-kernel/src/interrupts.rs
@@ -0,0 +1,424 @@
+//! Graphics interrupt + synthetic v-sync bookkeeping (P6).
+//!
+//! The Xbox 360 graphics driver calls `VdSetGraphicsInterruptCallback` to
+//! register a single per-process callback that the OS invokes on:
+//!
+//! 1. **V-sync** — at 60 Hz; source code 0 (`INTERRUPT_SOURCE_VSYNC`).
+//! 2. **Command-processor interrupt** — when `PM4_INTERRUPT` fires from the
+//!    guest-issued command stream; source code 1 (`INTERRUPT_SOURCE_CP`).
+//!
+//! Canary's [xboxkrnl_video.cc:303-310](xenia-canary/src/xenia/kernel/xboxkrnl/xboxkrnl_video.cc#L303-L310)
+//! dispatches the callback on HW thread 0. We follow the same convention.
+//!
+//! The delivery model is cooperative: we inject the callback entry into HW
+//! thread 0 at the top of a scheduler round when it's safe (not mid-export,
+//! not already inside another interrupt). When the callback returns to
+//! [`LR_HALT_SENTINEL`] the main loop restores the saved [`PpcContext`]
+//! fields and the HW thread picks up where it left off.
+
+use std::collections::VecDeque;
+
+use xenia_cpu::context::{CrField, PpcContext};
+use xenia_cpu::ThreadRef;
+
+pub const INTERRUPT_SOURCE_VSYNC: u32 = 0; // source code for a v-sync tick
+pub const INTERRUPT_SOURCE_CP: u32 = 1; // source code for a command-processor (`PM4_INTERRUPT`) interrupt
+
+/// Guest-registered V-sync / graphics-interrupt callback (as recorded by
+/// `InterruptState::set_callback` for `VdSetGraphicsInterruptCallback`).
+#[derive(Debug, Clone, Copy)]
+pub struct GraphicsInterruptCallback {
+    pub callback_pc: u32, // guest address of the callback entry (the tests divert `pc` to it)
+    pub user_data: u32, // value registered alongside it; the tests hand it over in r4
+}
+
+/// Snapshot of the fields we mutate when diverting a HW thread into an
+/// interrupt callback. Restored when the callback returns to
+/// `LR_HALT_SENTINEL`.
+///
+/// We save the **volatile GPRs** (r0, r2–r12) plus `r1` (SP), `pc`, `lr`,
+/// `ctr`, and `cr`. Non-volatile GPRs (r13–r31) are left to the callback's
+/// own `__savegprlr_N` prologue/epilogue per the PPC ELF ABI. NOTE(review):
+/// XER and FP/vector registers are not captured here — confirm that is safe.
+///
+/// **SP (`gpr[1]`) is included because the injector decrements it by
+/// [`CALLBACK_STACK_PAD`] before the callback runs** — see that constant's
+/// docs for why. Without this, the callback's `__savegprlr_N` prologue
+/// overwrites the interrupted function's own stack-saved LR (which lives
+/// at `[r1 - 8]`), and when the interrupted function later tries to
+/// return, `bclr` jumps to `LR_HALT_SENTINEL` and the thread exits
+/// prematurely.
+#[derive(Debug, Clone, Copy)]
+pub struct SavedCallbackCtx {
+    pub pc: u32,
+    pub lr: u64,
+    pub ctr: u64,
+    /// r0 through r12 in index order: the volatile GPRs plus r1 (SP).
+    /// Index 0 = r0, 1 = r1, …, 12 = r12. r13–r31 have no slot here.
+    pub gprs: [u64; 13],
+    pub cr: [CrField; 8],
+    pub source: u32, // interrupt source passed to `capture`; `restore` does not write it back
+}
+
+/// Bytes the injector reserves below the interrupted thread's SP before
+/// running the ISR callback. Matches Canary's
+/// [`Processor::Execute`](../../../../xenia-canary/src/xenia/cpu/processor.cc#L383)
+/// which decrements `r[1]` by `64 + 112 = 176` before
+/// `function->Call(...)` and restores afterwards. The pad must be larger
+/// than any plausible sum of `__savegprlr_N`'s save-area (up to 64 B for
+/// r25-r31 + 8 B for LR) plus the callback's own `stwu r1,-N(r1)` frame
+/// (the Sylpheed vsync ISR uses 128 B).
+///
+/// Before this pad existed, the ISR's `__savegprlr_25` stored the
+/// callback's saved LR (= `LR_HALT_SENTINEL`, from injection) at `[r1 - 8]`,
+/// exactly where the interrupted thread's current `bl`-saved LR lived. The
+/// interrupted function's return site got stomped with `SENTINEL`, so
+/// `__restgprlr_N -> bclr` jumped to the halt sentinel and the thread
+/// exited through the wrong path. Manifested in Sylpheed as tid=5
+/// (producer for the render queue) terminating at cycle 7.5M, starving
+/// both `0x10fc` (main's completion wait) and the PKEVENT that tid=6
+/// polls — no second `VdSwap`, no first pixel.
+pub const CALLBACK_STACK_PAD: u32 = 64 + 112; // = 176 bytes
+
+impl SavedCallbackCtx {
+    /// Snapshot `pc`, `lr`, `ctr`, `cr` and r0..=r12 of `ctx`, tagged with `source`.
+    pub fn capture(ctx: &PpcContext, source: u32) -> Self {
+        Self {
+            source,
+            pc: ctx.pc,
+            lr: ctx.lr,
+            ctr: ctx.ctr,
+            cr: ctx.cr,
+            gprs: std::array::from_fn(|reg| ctx.gpr[reg]),
+        }
+    }
+
+    /// Write the captured registers back into `ctx`, consuming the snapshot.
+    /// `source` is bookkeeping only and is not written anywhere.
+    pub fn restore(self, ctx: &mut PpcContext) {
+        let Self { pc, lr, ctr, gprs, cr, source: _ } = self;
+        for (reg, value) in gprs.iter().copied().enumerate() {
+            ctx.gpr[reg] = value;
+        }
+        ctx.cr = cr;
+        ctx.ctr = ctr;
+        ctx.lr = lr;
+        ctx.pc = pc;
+    }
+}
+
+/// Maximum number of pending sources held in the FIFO. Once it is full, new
+/// sources are dropped and counted in `InterruptState::dropped`; queued
+/// entries are never evicted. Four absorbs a short burst (a few v-syncs
+/// arriving mid-callback) without letting runaway delivery swamp the guest.
+pub const INTERRUPT_QUEUE_CAP: usize = 4;
+
+/// M2.5 — per-slot pending-IRQ bitmask. Each `AtomicU8` holds one bit per
+/// interrupt source (currently 2 sources: VSYNC=bit 0, CP=bit 1) destined
+/// for that specific HW slot. Used by the M3 parallel path: T_main (or
+/// the GPU thread) sets a bit Release on the target slot's atomic; the
+/// target T_cpu_i checks the bit Acquire at its quantum boundary and
+/// self-injects without taking another thread's slot lock.
+///
+/// The 6-element fixed-size array mirrors `xenia_cpu::scheduler::HW_THREAD_COUNT`.
+pub type PendingLocalIrq = [std::sync::atomic::AtomicU8;
+    xenia_cpu::scheduler::HW_THREAD_COUNT];
+
+/// All interrupt bookkeeping — single field on `KernelState`.
+///
+/// **First-Pixels M2 (2026-04-20)** — changed from a single-slot
+/// `pending_source: Option<u32>` coalesce to a bounded FIFO so bursts
+/// don't drop silently, and dropped `VSYNC_INSTR_PERIOD` from 500k to
+/// 150k so cadence approximates 60 Hz at the current ~10 MIPS interpreter
+/// throughput. Combined with the `HwState::ServicingIrq` variant added to
+/// `xenia-cpu::scheduler`, interrupts can now be delivered even when HW 0
+/// is `Blocked(WaitAny)` — the injector stashes the block into the new
+/// variant and the restore path re-blocks when the callback returns,
+/// unless a `wake()` during the callback resolved the wait.
+#[derive(Debug, Default)]
+pub struct InterruptState {
+    /// Registered callback (set by `VdSetGraphicsInterruptCallback`).
+    pub callback: Option<GraphicsInterruptCallback>,
+    /// Bounded FIFO of pending interrupt sources awaiting injection.
+    /// Push-back on queue, pop-front on inject. Over-cap pushes drop.
+    pub pending: VecDeque<u32>,
+    /// When `Some`, some HW thread is currently running a callback; on
+    /// return-to-sentinel we restore this and clear the flag.
+    pub saved: Option<SavedCallbackCtx>,
+    /// Which guest thread the current callback was injected into.
+    /// Required because we no longer anchor delivery to HW 0 — any
+    /// non-Exited thread is a valid target. Meaningful only while
+    /// `saved.is_some()`. Stored as a `ThreadRef` so per-slot
+    /// runqueues don't get ambiguous addressing.
+    pub injected_ref: Option<ThreadRef>,
+    /// Monotonic count of delivered interrupts.
+    pub delivered: u64,
+    /// Dropped interrupts (callback unset, queue full, or thread
+    /// exited/idle at inject time).
+    pub dropped: u64,
+    /// Instruction-count accumulator for the synthetic v-sync ticker. At
+    /// `VSYNC_INSTR_PERIOD` the main loop pushes an `INTERRUPT_SOURCE_VSYNC`
+    /// onto `pending` and resets.
+    pub vsync_accumulator: u64,
+    /// Last observed instruction count — `tick_vsync` diffs against
+    /// this to advance `vsync_accumulator`.
+    pub last_instr_count: u64,
+    /// M2.5 — per-slot pending-IRQ bits. Set by the producer (M3's
+    /// IRQ-routing logic on `T_main`) with `Release`; consumed by the
+    /// target T_cpu_i with `Acquire` at quantum boundary. Unused under
+    /// the lockstep path (M2's single-host-thread model still uses
+    /// `pending` + `try_inject_graphics_interrupt`); the field is wired
+    /// here so M3's per-HW-thread path is a flag flip, not a refactor.
+    pub pending_local_irq: PendingLocalIrq,
+}
+
+/// How many guest instructions correspond to one synthetic v-sync.
+///
+/// Targets **~60 Hz at the post-Tier-3 interpreter throughput (~10 MIPS)**:
+/// 10e6 instr/s ÷ 60 Hz ≈ 167k; 150k is a small cushion (≈67 Hz at 10 MIPS).
+/// Before M2 this was 500k (~20 Hz), which was enough for games that
+/// don't gate anything on v-sync but not enough for titles like Sylpheed
+/// whose main loop waits on the v-sync callback to signal an event every
+/// frame.
+pub const VSYNC_INSTR_PERIOD: u64 = 150_000;
+
+impl InterruptState {
+    /// Install the guest callback, replacing any earlier registration.
+    pub fn set_callback(&mut self, callback_pc: u32, user_data: u32) {
+        let registration = GraphicsInterruptCallback {
+            callback_pc,
+            user_data,
+        };
+        self.callback = Some(registration);
+    }
+
+    /// Append `source` to the pending FIFO for a later injection.
+    ///
+    /// The request is counted in `dropped` instead when no callback is
+    /// registered or the FIFO already holds `INTERRUPT_QUEUE_CAP` entries.
+    pub fn queue_interrupt(&mut self, source: u32) {
+        let has_callback = self.callback.is_some();
+        let has_room = self.pending.len() < INTERRUPT_QUEUE_CAP;
+        if has_callback && has_room {
+            self.pending.push_back(source);
+        } else {
+            self.dropped += 1;
+        }
+    }
+
+    /// Oldest pending source, left in the queue.
+    pub fn peek_next(&self) -> Option<u32> {
+        self.pending.front().copied()
+    }
+
+    /// Remove and return the oldest pending source, if any.
+    pub fn take_next(&mut self) -> Option<u32> {
+        self.pending.pop_front()
+    }
+
+    /// Feed the running instruction count into the synthetic v-sync ticker.
+    ///
+    /// Returns `true` when at least one whole `VSYNC_INSTR_PERIOD` elapsed.
+    /// Each elapsed period goes through `queue_interrupt`, so the requests
+    /// can still end up in `dropped` rather than in `pending`.
+    pub fn tick_vsync(&mut self, current_instr_count: u64) -> bool {
+        let elapsed = current_instr_count.saturating_sub(self.last_instr_count);
+        self.last_instr_count = current_instr_count;
+        let total = self.vsync_accumulator.saturating_add(elapsed);
+        // Carry only the partial period forward; every whole period becomes
+        // one request, so a large jump (e.g. a long export) is not lost.
+        self.vsync_accumulator = total % VSYNC_INSTR_PERIOD;
+        let whole_periods = total / VSYNC_INSTR_PERIOD;
+        for _ in 0..whole_periods {
+            self.queue_interrupt(INTERRUPT_SOURCE_VSYNC);
+        }
+        whole_periods > 0
+    }
+
+    /// Is a callback in flight, i.e. is a saved context awaiting restore?
+    pub fn is_in_callback(&self) -> bool {
+        self.saved.is_some()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn queue_interrupt_drops_without_callback() {
+        // No callback registered: the request must be counted, not queued.
+        let mut irq = InterruptState::default();
+        irq.queue_interrupt(INTERRUPT_SOURCE_VSYNC);
+        assert_eq!((irq.dropped, irq.pending.len()), (1, 0));
+    }
+
+    #[test]
+    fn queue_interrupt_fifo_preserves_order() {
+        let mut irq = InterruptState::default();
+        irq.set_callback(0x1000, 0xAB);
+        // Three sources stay under the queue cap, so none may be dropped.
+        let pushed = [INTERRUPT_SOURCE_VSYNC, INTERRUPT_SOURCE_CP, INTERRUPT_SOURCE_VSYNC];
+        pushed.iter().for_each(|&source| irq.queue_interrupt(source));
+        assert_eq!(irq.dropped, 0);
+        // FIFO: sources come back out in the order they went in.
+        for &expected in &pushed {
+            assert_eq!(irq.take_next(), Some(expected));
+        }
+        assert_eq!(irq.take_next(), None);
+    }
+
+    #[test]
+    fn queue_interrupt_caps_at_queue_size() {
+        let mut irq = InterruptState::default();
+        irq.set_callback(0x1000, 0xAB);
+        // Two pushes beyond capacity: the newest requests are rejected
+        // (counted in `dropped`); queued entries are never evicted.
+        let overflow = 2;
+        for _ in 0..INTERRUPT_QUEUE_CAP + overflow {
+            irq.queue_interrupt(INTERRUPT_SOURCE_VSYNC);
+        }
+        assert_eq!(irq.pending.len(), INTERRUPT_QUEUE_CAP);
+        assert_eq!(irq.dropped, overflow as u64);
+    }
+
+    #[test]
+    fn tick_vsync_fires_at_new_150k_threshold() {
+        assert_eq!(VSYNC_INSTR_PERIOD, 150_000);
+        let mut irq = InterruptState::default();
+        irq.set_callback(0x1000, 0xAB);
+        // One instruction short of a full period: nothing may fire yet.
+        assert!(!irq.tick_vsync(VSYNC_INSTR_PERIOD - 1) && irq.pending.is_empty());
+        assert!(irq.tick_vsync(VSYNC_INSTR_PERIOD));
+        assert_eq!(irq.peek_next(), Some(INTERRUPT_SOURCE_VSYNC));
+    }
+
+    #[test]
+    fn tick_vsync_drains_multiple_periods_in_one_call() {
+        // A single large instruction delta (e.g. a long kernel export)
+        // spans several periods; each one must be queued, none lost.
+        let mut irq = InterruptState::default();
+        irq.set_callback(0x1000, 0xAB);
+        let fired = irq.tick_vsync(VSYNC_INSTR_PERIOD * 3 + 10);
+        assert_eq!((fired, irq.pending.len()), (true, 3));
+    }
+
+    /// Simulates what the main loop does around a callback: inject, let
+    /// the callback return to the sentinel, restore. No guest code is
+    /// executed — the callback's `blr` is modelled by copying `lr` into
+    /// `pc`, which lands on `LR_HALT_SENTINEL`, the value the main loop
+    /// watches for before triggering the restore.
+    #[test]
+    fn inject_restore_roundtrip_smoke() {
+        let mut ctx = PpcContext::new();
+        ctx.pc = 0x1000_0000;
+        ctx.lr = 0xCAFE_BABE;
+        ctx.gpr[3] = 0x1234;
+        ctx.gpr[4] = 0x5678;
+
+        let mut s = InterruptState::default();
+        s.set_callback(0x2000_0000, 0xDEAD);
+
+        // Simulate main loop inject: save ctx fields, divert pc/lr/r3/r4.
+        let saved = SavedCallbackCtx::capture(&ctx, INTERRUPT_SOURCE_VSYNC);
+        s.saved = Some(saved);
+        ctx.pc = 0x2000_0000; // registered callback_pc
+        ctx.lr = xenia_cpu::context::LR_HALT_SENTINEL;
+        ctx.gpr[3] = INTERRUPT_SOURCE_VSYNC as u64;
+        ctx.gpr[4] = 0xDEAD; // registered user_data
+        assert!(s.is_in_callback());
+
+        // Guest callback "runs" to the sentinel — simulate by writing
+        // pc = lr (what `blr` would do).
+        ctx.pc = ctx.lr as u32;
+
+        // Main loop detects pc == LR_HALT_SENTINEL while in_callback:
+        let saved = s.saved.take().unwrap();
+        saved.restore(&mut ctx);
+        s.delivered += 1;
+
+        assert_eq!(ctx.pc, 0x1000_0000);
+        assert_eq!(ctx.lr, 0xCAFE_BABE);
+        assert_eq!(ctx.gpr[3], 0x1234);
+        assert_eq!(ctx.gpr[4], 0x5678);
+        assert!(!s.is_in_callback()); // `take()` above cleared `saved`
+        assert_eq!(s.delivered, 1);
+    }
+
+    #[test]
+    fn saved_ctx_roundtrip() {
+        let (pc, lr, r3, r4) = (0x11223344, 0xDEADBEEF, 0xAAAA, 0xBBBB);
+        let mut ctx = PpcContext::new();
+        ctx.pc = pc;
+        ctx.lr = lr;
+        ctx.gpr[3] = r3;
+        ctx.gpr[4] = r4;
+        let snapshot = SavedCallbackCtx::capture(&ctx, INTERRUPT_SOURCE_VSYNC);
+        // Wipe everything the snapshot is expected to bring back.
+        ctx.pc = 0;
+        ctx.lr = 0;
+        ctx.gpr[3] = 0;
+        ctx.gpr[4] = 0;
+        snapshot.restore(&mut ctx);
+        assert_eq!((ctx.pc, ctx.lr), (pc, lr));
+        assert_eq!((ctx.gpr[3], ctx.gpr[4]), (r3, r4));
+    }
+
+    /// Full volatile-GPR + SP roundtrip. Regression test for the
+    /// 2026-04-24 IRQ-injection fix: the ISR callback's prologue clobbers
+    /// `[r1 - 8]` on the interrupted thread's stack unless the injector
+    /// pre-decrements SP by [`CALLBACK_STACK_PAD`] and the saved ctx puts
+    /// SP (and the rest of the PPC volatile set) back on return.
+    #[test]
+    fn saved_ctx_covers_sp_and_all_volatile_gprs() {
+        let mut ctx = PpcContext::new();
+        ctx.pc = 0xAAAA_BBBB;
+        ctx.lr = 0x1111_2222;
+        ctx.ctr = 0x3333_4444;
+        for i in 0..13 {
+            ctx.gpr[i] = 0x1000 + i as u64; // distinct marker per captured register
+        }
+        // r13..r31 are non-volatile and should survive the callback's own
+        // save/restore — the saved ctx deliberately does NOT cover them.
+        for i in 13..32 {
+            ctx.gpr[i] = 0xDEAD_0000 + i as u64;
+        }
+
+        let saved = SavedCallbackCtx::capture(&ctx, INTERRUPT_SOURCE_VSYNC);
+
+        // Simulate injector: flip pc/lr/r1/r3/r4 (what the real injector
+        // actually does — see try_inject_graphics_interrupt in main.rs).
+        ctx.pc = 0xCAFE;
+        ctx.lr = xenia_cpu::context::LR_HALT_SENTINEL;
+        ctx.gpr[1] = ctx.gpr[1].wrapping_sub(CALLBACK_STACK_PAD as u64); // reserve the stack pad
+        ctx.gpr[3] = INTERRUPT_SOURCE_VSYNC as u64;
+        ctx.gpr[4] = 0xBEEF;
+        // Simulate callback clobbering a few volatile regs that aren't
+        // part of the "obviously diverted" set.
+        ctx.gpr[0] = 0xFEED_FACE;
+        ctx.gpr[7] = 0x9999;
+        ctx.gpr[12] = 0xABCD;
+
+        saved.restore(&mut ctx);
+
+        // All volatile GPRs restored to pre-injection.
+        for i in 0..13 {
+            assert_eq!(
+                ctx.gpr[i],
+                0x1000 + i as u64,
+                "volatile r{} clobbered by callback was not restored",
+                i
+            );
+        }
+        // SP specifically back to the pre-pad value (0x1000 + 1 from the loop above).
+        assert_eq!(ctx.gpr[1], 0x1001, "SP must be restored to pre-injection");
+        // Non-volatile regs were never captured; they stay as the callback
+        // left them (here, untouched because we didn't modify 13..32).
+        for i in 13..32 {
+            assert_eq!(ctx.gpr[i], 0xDEAD_0000 + i as u64);
+        }
+        assert_eq!(ctx.pc, 0xAAAA_BBBB);
+        assert_eq!(ctx.lr, 0x1111_2222);
+        assert_eq!(ctx.ctr, 0x3333_4444);
+    }
+}
diff --git a/crates/xenia-kernel/src/lib.rs b/crates/xenia-kernel/src/lib.rs
index 457b815..9cdceb9 100644
--- a/crates/xenia-kernel/src/lib.rs
+++ b/crates/xenia-kernel/src/lib.rs
@@ -1,6 +1,17 @@
+pub mod audit;
 pub mod exports;
+pub mod interrupts;
 pub mod objects;
+pub mod path;
 pub mod state;
+pub mod thread;
+pub mod ui_bridge;
 pub mod xam;
 
+pub use interrupts::{
+    GraphicsInterruptCallback, InterruptState, SavedCallbackCtx, INTERRUPT_SOURCE_CP,
+    INTERRUPT_SOURCE_VSYNC, VSYNC_INSTR_PERIOD,
+};
 pub use state::{KernelState, ModuleId};
+pub use thread::{allocate_thread_image, ThreadImage};
+pub use ui_bridge::{SwapInfo, UiBridge};
diff --git a/crates/xenia-kernel/src/objects.rs b/crates/xenia-kernel/src/objects.rs
index 117399b..434ba34 100644
--- a/crates/xenia-kernel/src/objects.rs
+++ b/crates/xenia-kernel/src/objects.rs
@@ -1,12 +1,94 @@
 //! Kernel object tracking for HLE.
 
+use std::sync::Arc;
+
+use xenia_cpu::ThreadRef;
+
 /// Kernel object types tracked by handle.
+///
+/// Sync variants (`Event`, `Semaphore`, `Thread`, `Timer`, `Mutex`) carry an
+/// in-place waiter list so wait/set/release sites keep invariants local —
+/// dropping the object implicitly drops its waiters. Waiters are stored as
+/// `ThreadRef` (post-Axis-1) — a bare `hw_id: u8` would have been ambiguous
+/// under per-slot runqueues where multiple guest threads share one HW slot.
 #[derive(Debug)]
 pub enum KernelObject {
-    Event { manual_reset: bool, signaled: bool },
-    Semaphore { count: i32, max: i32 },
-    File { path: String },
-    Thread { id: u32 },
-    Timer,
-    Mutex,
+    Event {
+        manual_reset: bool,
+        signaled: bool,
+        /// Guest threads parked on this event.
+        waiters: Vec<ThreadRef>,
+    },
+    Semaphore {
+        count: i32,
+        max: i32,
+        waiters: Vec<ThreadRef>,
+    },
+    File {
+        /// Normalized VFS path (e.g. "default.xex", "media/shared/foo.pkg").
+        path: String,
+        /// Full file size in bytes.
+        size: u64,
+        /// Current read/write cursor.
+        position: u64,
+        /// Whole-file buffer — VFS reads the entire file up front so
+        /// subsequent NtReadFile calls are O(1) slice copies.
+        /// `Arc<Vec<u8>>` so duplicate handles could share backing storage.
+        data: Arc<Vec<u8>>,
+        /// Directory-enumeration cursor consumed by `NtQueryDirectoryFile`.
+        /// `None` before the first call; `Some(N)` = next VFS entry index
+        /// to emit. Reset to `Some(0)` when the guest passes
+        /// `restart_scan=1`. Unused on non-directory files.
+        dir_enum_pos: Option<usize>,
+    },
+    Thread {
+        id: u32,
+        /// HW thread slot currently running this guest thread (None once exited
+        /// — `exit_code` becomes Some).
+        hw_id: Option<u8>,
+        /// None while the thread is running; populated on ExTerminateThread
+        /// or halt-sentinel return.
+        exit_code: Option<u32>,
+        /// Guest threads parked in KeWaitForSingleObject on this thread handle.
+        waiters: Vec<ThreadRef>,
+    },
+    Timer {
+        /// Xbox 360 timer_type 0 = NotificationTimer (manual-reset),
+        /// 1 = SynchronizationTimer (auto-reset). Same shape as Event.
+        manual_reset: bool,
+        signaled: bool,
+        /// Absolute tick-space deadline; None when disarmed.
+        deadline: Option<u64>,
+        /// Period in ticks (same units as `deadline`); 0 = one-shot.
+        period_ticks: u64,
+        /// Original ms value (canary's SetTimer keeps it for diagnostics).
+        period_ms: u32,
+        /// APC routine (deferred — see `timer_apc` warn in nt_set_timer_ex).
+        callback_routine: u32,
+        callback_arg: u32,
+        waiters: Vec<ThreadRef>,
+    },
+    Mutex {
+        /// HW thread id of the holder; None when free. NOTE(review): a slot id, not a `ThreadRef`.
+        owner: Option<u8>,
+        recursion: u32,
+        waiters: Vec<ThreadRef>,
+    },
+}
+
+impl KernelObject {
+    /// Mutable access to this object's waiter list: `Some` for the five sync
+    /// variants (Event, Semaphore, Thread, Timer, Mutex), `None` for `File`.
+    /// Used by deadline-expiry scrub in `KernelState::handle_timeout_wake` so
+    /// a timed-out waiter isn't left stranded in a handle's waiters list.
+    pub fn waiters_mut(&mut self) -> Option<&mut Vec<ThreadRef>> {
+        match self {
+            Self::File { .. } => None,
+            Self::Event { waiters, .. }
+            | Self::Semaphore { waiters, .. }
+            | Self::Thread { waiters, .. }
+            | Self::Timer { waiters, .. }
+            | Self::Mutex { waiters, .. } => Some(waiters),
+        }
+    }
 }
diff --git a/crates/xenia-kernel/src/path.rs b/crates/xenia-kernel/src/path.rs
new file mode 100644
index 0000000..edb7e7d
--- /dev/null
+++ b/crates/xenia-kernel/src/path.rs
@@ -0,0 +1,139 @@
+//! Path normalization for kernel file I/O.
+//!
+//! Guests pass file paths inside an `OBJECT_ATTRIBUTES` struct that points at
+//! an `ANSI_STRING` descriptor. Those paths come in several Xbox-flavored
+//! forms — NT device paths (`\Device\Cdrom0\...`), drive letters (`D:\...`,
+//! `d:\...`), or symbolic link prefixes (`game:\...`). We strip whichever
+//! prefix applies and return a plain slash-separated path relative to the
+//! mounted VFS root, so `VfsDevice::read_file` can look it up directly.
+
+use xenia_memory::{GuestMemory, MemoryAccess};
+
+/// Read an Xbox `ANSI_STRING` (`u16 Length`, `u16 MaximumLength`, `u32 Buffer`
+/// guest pointer). `None` for a null `ptr`; an empty string for a null buffer
+/// or zero length. Stops at the first NUL; bytes map 1:1 to code points.
+fn read_ansi_string(mem: &GuestMemory, ptr: u32) -> Option<String> {
+    if ptr == 0 {
+        return None;
+    }
+    let length = mem.read_u16(ptr) as u32;
+    // Guest-supplied addresses: wrap rather than trip the debug-build
+    // overflow panic on pointers near the top of the 32-bit space.
+    let buffer = mem.read_u32(ptr.wrapping_add(4));
+    if buffer == 0 || length == 0 {
+        return Some(String::new());
+    }
+    let mut out = String::with_capacity(length as usize);
+    for i in 0..length {
+        match mem.read_u8(buffer.wrapping_add(i)) {
+            0 => break,
+            c => out.push(char::from(c)),
+        }
+    }
+    Some(out)
+}
+
+/// Read the name out of an Xbox `OBJECT_ATTRIBUTES` (`u32 RootDirectory`
+/// handle, `u32 Name` pointer to an `ANSI_STRING`, `u32 Attributes`).
+/// Returns `None` when `obj_attrs_ptr` or the `Name` pointer is null.
+/// The `+4` offset wraps so a wild guest pointer cannot overflow-panic.
+fn read_object_attributes_name(mem: &GuestMemory, obj_attrs_ptr: u32) -> Option<String> {
+    if obj_attrs_ptr == 0 {
+        return None;
+    }
+    let name_ptr = mem.read_u32(obj_attrs_ptr.wrapping_add(4));
+    read_ansi_string(mem, name_ptr)
+}
+
+/// Xbox device prefixes stripped before a VFS lookup; mirrors the symbolic
+/// links xenia-canary sets up at boot (see `xboxkrnl_io.cc`). Matched
+/// case-insensitively, first hit wins — keep longer prefixes before shorter.
+const DEVICE_PREFIXES: &[&str] = &[
+    "\\Device\\Cdrom0\\",
+    "\\Device\\Harddisk0\\Partition1\\",
+    "\\Device\\Harddisk0\\Partition0\\",
+    "\\Device\\Harddisk0\\",
+    "\\Device\\Mu0\\",
+    "\\Device\\Mu1\\",
+    "\\Device\\Mass0\\",
+    "\\Device\\Mass1\\",
+    "\\Device\\Mass2\\",
+    "\\SystemRoot\\",
+    "\\??\\",
+    "game:\\",
+    "d:\\",
+    // No separate "D:\\" entry: matching is case-insensitive.
+];
+
+/// Strip any Xbox device prefix and normalize backslashes to forward slashes.
+/// Returns the path relative to the VFS root.
+///
+/// At most one prefix (the first match in [`DEVICE_PREFIXES`]) is removed;
+/// surrounding whitespace and leading separators are dropped.
+pub fn normalize_path(raw: &str) -> String {
+    let trimmed = raw.trim();
+
+    // Case-insensitive prefix strip, compared bytewise so no lowercase copies
+    // are allocated. Prefixes are ASCII, so a match ends on a char boundary.
+    let rest = DEVICE_PREFIXES
+        .iter()
+        .find(|p| {
+            let head = trimmed.as_bytes().get(..p.len());
+            head.map_or(false, |h| h.eq_ignore_ascii_case(p.as_bytes()))
+        })
+        .map_or(trimmed, |p| &trimmed[p.len()..]);
+
+    // Drop any leading slash/backslash that survived prefix stripping, then
+    // convert to the canonical forward-slash form.
+    let rest = rest.trim_start_matches(|c: char| c == '\\' || c == '/');
+    rest.replace('\\', "/")
+}
+
+/// Resolve the guest `OBJECT_ATTRIBUTES` at `obj_attrs_ptr` to a normalized
+/// VFS path.
+///
+/// Yields `None` when the struct pointer or its inner name pointer is null,
+/// and also when the name reads back as an empty string.
+pub fn object_attributes_to_vfs_path(mem: &GuestMemory, obj_attrs_ptr: u32) -> Option<String> {
+    read_object_attributes_name(mem, obj_attrs_ptr)
+        .filter(|name| !name.is_empty())
+        .map(|name| normalize_path(&name))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn strips_device_cdrom() {
+        assert_eq!(normalize_path("\\Device\\Cdrom0\\default.xex"), "default.xex");
+    }
+
+    #[test]
+    fn strips_drive_letter_lowercase() {
+        assert_eq!(normalize_path("d:\\media\\shared\\foo.pkg"), "media/shared/foo.pkg");
+    }
+
+    #[test]
+    fn strips_drive_letter_uppercase() {
+        assert_eq!(normalize_path("D:\\default.xex"), "default.xex");
+    }
+
+    #[test]
+    fn strips_game_prefix() {
+        assert_eq!(normalize_path("game:\\data\\whatever.bin"), "data/whatever.bin");
+    }
+
+    #[test]
+    fn preserves_relative_path() {
+        assert_eq!(normalize_path("scripts/init.lua"), "scripts/init.lua");
+    }
+
+    #[test]
+    fn handles_partition1() {
+        assert_eq!(
+            normalize_path("\\Device\\Harddisk0\\Partition1\\content\\abc.sav"),
+            "content/abc.sav"
+        );
+    }
+}
diff --git a/crates/xenia-kernel/src/state.rs b/crates/xenia-kernel/src/state.rs
index cc47e4e..c09dc63 100644
--- a/crates/xenia-kernel/src/state.rs
+++ b/crates/xenia-kernel/src/state.rs
@@ -1,11 +1,35 @@
 use std::collections::HashMap;
-use xenia_cpu::PpcContext;
-use xenia_memory::GuestMemory;
+use xenia_cpu::scheduler::{PcrWriter, Scheduler};
+use xenia_cpu::{PpcContext, ThreadRef};
+use xenia_memory::{GuestMemory, MemoryAccess};
+use xenia_vfs::VfsDevice;
 
+use crate::audit::{HandleAudit, HandleAuditEntry};
 use crate::objects::KernelObject;
+use crate::ui_bridge::UiBridge;
+
+/// [`PcrWriter`] adapter over guest memory: stores the HW thread id at
+/// `pcr_base + 0x2C`. Lets `Scheduler::spawn` and Axis 4's migration call
+/// through without `xenia-cpu` depending on the memory crate.
+pub struct GuestMemoryPcr<'a>(pub &'a GuestMemory);
+impl PcrWriter for GuestMemoryPcr<'_> {
+    fn write_pcr_id(&mut self, pcr_base: u32, hw_id: u8) {
+        // `GuestMemory::write_u32` takes `&self` (post-M2 trait flip), so `&'a` suffices.
+        let Self(mem) = self;
+        mem.write_u32(pcr_base + 0x2C, u32::from(hw_id));
+    }
+}
 
 /// Function signature for HLE kernel exports.
-pub type KernelExportFn = fn(&mut PpcContext, &mut GuestMemory, &mut KernelState);
+///
+/// The first argument is the **currently running** HW thread's `PpcContext`,
+/// which the caller has temporarily moved out of the scheduler slot to avoid
+/// aliasing. Exports that only touch register/GPR state use `ctx` directly;
+/// exports that need scheduler state (spawn/park/wake/tls/etc.) reach
+/// through `state.scheduler` — note that `state.scheduler.hw_threads[current]`
+/// holds a placeholder `PpcContext` for the duration of the call, not the
+/// live one passed as `ctx`.
+pub type KernelExportFn = fn(&mut PpcContext, &GuestMemory, &mut KernelState);
 
 /// Module identifier for kernel exports.
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
@@ -15,45 +39,174 @@ pub enum ModuleId {
     Xbdm,
 }
 
+/// Pseudo-`HMODULE` values returned by `XexGetModuleHandle` and accepted by
+/// `XexGetProcedureAddress`. Distinct from real loaded-image bases
+/// (>=0x82000000) and from kernel handles (0x1000+, allocated by
+/// `alloc_handle`). The 0xFFFE_xxxx prefix is unused by both guest segments
+/// and our handle allocator.
+pub const HMODULE_XBOXKRNL: u32 = 0xFFFE_0001;
+pub const HMODULE_XAM: u32 = 0xFFFE_0002;
+
 /// Central kernel state tracking all guest OS state.
 pub struct KernelState {
     exports: HashMap<(ModuleId, u32), (&'static str, KernelExportFn)>,
-    next_handle: u32,
-    pub tls_slots: HashMap<u32, u64>,
-    next_tls_index: u32,
+    /// M2.4: bump allocator for kernel handles. `AtomicU32` so concurrent
+    /// HLE calls under M3 can `fetch_add` without a lock. `Relaxed` is
+    /// fine — the allocated value is a fresh ID with no prior payload to
+    /// publish; observers (the kernel object table) are guarded by
+    /// their own synchronization.
+    next_handle: std::sync::atomic::AtomicU32,
+    /// Scheduler managing all emulated HW threads + their per-slot
+    /// runqueues. Starts empty — the app installs the initial guest thread
+    /// on slot 0 via `KernelState::install_initial_thread` once it has the
+    /// entry address.
+    pub scheduler: Scheduler,
+    /// TLS slot allocator — index counter only. Per-thread *values* live on
+    /// `GuestThread::tls_values` (see scheduler). M2.4: `AtomicU32`.
+    pub next_tls_index: std::sync::atomic::AtomicU32,
+    /// Critical-section waiter map: guest `cs_ptr` → guest threads parked
+    /// on it. Critical sections are in guest memory (not kernel objects),
+    /// so their waiter list lives here rather than on an object.
+    pub cs_waiters: HashMap<u32, Vec<ThreadRef>>,
     /// Kernel object table: handle → object
     pub objects: HashMap<u32, KernelObject>,
-    /// Bump allocator for guest heap (NtAllocateVirtualMemory etc.)
-    pub heap_cursor: u32,
-    /// Stack allocator cursor for MmCreateKernelStack
-    pub stack_cursor: u32,
+    /// Bump allocator for guest heap (NtAllocateVirtualMemory etc.).
+    /// M2.4: `AtomicU32` for lock-free concurrent allocation.
+    pub heap_cursor: std::sync::atomic::AtomicU32,
+    /// Stack allocator cursor for MmCreateKernelStack. M2.4: atomic.
+    pub stack_cursor: std::sync::atomic::AtomicU32,
     /// GPU command buffer address (set by VdGetSystemCommandBuffer)
     pub gpu_command_buffer: u32,
+    /// GPU backend. M1.4: was `xenia_gpu::GpuSystem` directly, now a
+    /// [`xenia_gpu::GpuBackend`] enum so the kernel can hold either an
+    /// inline `GpuSystem` (synchronous, default) or a `GpuHandle` proxy
+    /// pointing at a worker thread (`--gpu-thread`). Forwarding methods
+    /// on the enum keep call sites in [`crate::exports`] terse.
+    pub gpu: xenia_gpu::GpuBackend,
+    /// Monotonic packet number returned by `XamInputGetState`. Games detect
+    /// input changes by watching this increment.
+    pub input_packet_number: u32,
+    /// Previous gamepad snapshot; `input_packet_number` only advances when
+    /// the state bytes actually change, matching host XInput semantics.
+    pub last_input_bytes: u128,
     /// Image base of the loaded XEX (for XexExecutableModuleHandle etc.)
     pub image_base: u32,
-    /// Next thread ID
-    pub next_thread_id: u32,
+    /// Next thread ID. M2.4: atomic.
+    pub next_thread_id: std::sync::atomic::AtomicU32,
+    /// Virtual file system for NtCreateFile/NtReadFile/etc. The app mounts
+    /// the disc image or host directory into this slot; file I/O handlers
+    /// route all reads through it.
+    pub vfs: Option<Box<dyn VfsDevice>>,
+    /// Bridge to the host UI. `None` when running headless. Installed by
+    /// `cmd_exec` when the user passes `--ui`.
+    pub ui: Option<UiBridge>,
+    /// P6 — graphics interrupt + synthetic v-sync bookkeeping. Registers
+    /// the callback set by `VdSetGraphicsInterruptCallback` and tracks
+    /// the paused-context snapshot while HW thread 0 is running it.
+    pub interrupts: crate::interrupts::InterruptState,
+    /// Per-handle refcount. Since `NtDuplicateObject` aliases (returns the
+    /// source handle value as the "new" handle rather than minting a fresh
+    /// id), a single handle commonly has multiple logical references. This
+    /// map tracks that count so a stray `NtClose` on one reference doesn't
+    /// destroy the object while another reference is still live. Canary's
+    /// `ObjectTable::ReleaseHandle` (object_table.cc:189) is the parity
+    /// reference. Initialized to 1 in `alloc_handle_for`; incremented in
+    /// `nt_duplicate_object` when `DUPLICATE_CLOSE_SOURCE` is absent;
+    /// decremented in `nt_close` which drops the underlying object only
+    /// when the count reaches zero.
+    pub handle_refcount: HashMap<u32, u32>,
+    /// Pending timer expirations — `(deadline, handle)` sorted ascending by
+    /// deadline. Pushed by `arm_timer`, popped by `fire_due_timers`. Kept in
+    /// lockstep with the per-`Timer` object's `deadline` field via the
+    /// `arm_timer`/`disarm_timer` helpers. See the plan's step 3/6 for the
+    /// design rationale — timer deadlines coexist with
+    /// `Scheduler::timed_waits` but track a different class (signaled object
+    /// fires, not thread wake-ups).
+    pub pending_timer_fires: Vec<(u64, u32)>,
+    /// Per-handle signal/wait/wake audit trail. Default `enabled=false` →
+    /// every record method is a no-op. Flip via `--trace-handles`/
+    /// `XENIA_TRACE_HANDLES` to diagnose missing-signal deadlocks (handles
+    /// 0x10FC / 0x1014 / 0x1104 / 0x10DC / 0x10F0 specifically). See
+    /// [`crate::audit`] for layout.
+    pub audit: HandleAudit,
+    /// M2.2 — banked reservation table for `lwarx`/`stwcx.` under M3's
+    /// per-HW-thread parallelism. Always allocated. Consulted by the
+    /// interpreter when `reservations.is_enabled()` is true; otherwise
+    /// the legacy per-`PpcContext` fields drive observable behavior.
+    /// Settable via `--reservations-table` / `XENIA_RESERVATIONS_TABLE=1`
+    /// for golden verification, or implicitly under `--parallel`.
+    /// See [`xenia_cpu::ReservationTable`] for the concurrency model.
+    pub reservations: std::sync::Arc<xenia_cpu::ReservationTable>,
+    /// Map from `(module, ordinal)` to the guest-side import-thunk address
+    /// resolved at load time. Reverse of `xenia-app/src/main.rs`'s
+    /// `thunk_map`. Populated from xenia-app's Phase 1 (record_type==1
+    /// only). Used by `xex_get_procedure_address` to resolve ordinals back
+    /// to callable thunks.
+    thunks_by_ordinal: HashMap<(ModuleId, u16), u32>,
+    /// First-Pixels diagnostic latch. Set the first time
+    /// `RtlRaiseException` fires with code `0xE06D7363` (MSVC C++ throw)
+    /// so the deep stack-walk + `runtime_error` decode in
+    /// `rtl_raise_exception` only emits once per run, regardless of how
+    /// many subsequent throws fire. Reset on each fresh process start.
+    pub cxx_throw_logged: bool,
 }
 
 impl KernelState {
-    pub fn new() -> Self {
+    /// Construct a kernel with the supplied GPU backend.
+    ///
+    /// The caller (typically `cmd_exec_inner`) decides whether to install
+    /// an inline backend (default) or a threaded one (`--gpu-thread`).
+    /// Most existing call sites build via [`Self::new`], which defaults to
+    /// an inline backend; the threaded constructor lives at
+    /// [`Self::with_gpu`].
+    pub fn with_gpu(gpu: xenia_gpu::GpuBackend) -> Self {
+        // Scheduler starts empty; the app installs the initial thread on
+        // slot 0 via `install_initial_thread` right after construction.
+        let mut scheduler = Scheduler::new();
+        use std::sync::atomic::AtomicU32;
+        let reservations = std::sync::Arc::new(xenia_cpu::ReservationTable::new());
+        // M3.7 — wire the reservation table to the scheduler so
+        // `spawn`/`install_initial_thread` populate every PpcContext's
+        // `reservation_table` clone. The table is `disabled` by
+        // default; `--reservations-table` / `XENIA_RESERVATIONS_TABLE`
+        // / M3 spawn flip it on.
+        scheduler.set_reservation_table(Some(reservations.clone()));
         let mut state = Self {
             exports: HashMap::new(),
-            next_handle: 0x1000,
-            tls_slots: HashMap::new(),
-            next_tls_index: 0,
+            next_handle: AtomicU32::new(0x1000),
+            scheduler,
+            next_tls_index: AtomicU32::new(0),
+            cs_waiters: HashMap::new(),
             objects: HashMap::new(),
-            heap_cursor: 0x4000_0000, // Start of user heap region
-            stack_cursor: 0x7100_0000, // Above main stack
+            heap_cursor: AtomicU32::new(0x4000_0000), // Start of user heap region
+            stack_cursor: AtomicU32::new(0x7100_0000), // Above main stack
             gpu_command_buffer: 0,
+            gpu,
+            input_packet_number: 0,
+            last_input_bytes: 0,
             image_base: 0,
-            next_thread_id: 1,
+            next_thread_id: AtomicU32::new(1),
+            vfs: None,
+            ui: None,
+            interrupts: crate::interrupts::InterruptState::default(),
+            handle_refcount: HashMap::new(),
+            pending_timer_fires: Vec::new(),
+            audit: HandleAudit::default(),
+            reservations,
+            thunks_by_ordinal: HashMap::new(),
+            cxx_throw_logged: false,
         };
         crate::exports::register_exports(&mut state);
         crate::xam::register_exports(&mut state);
         state
     }
 
+    /// Default constructor — installs an inline `GpuSystem`. Kept for
+    /// callers that don't (yet) thread a `GpuBackend` choice through.
+    pub fn new() -> Self {
+        Self::with_gpu(xenia_gpu::GpuBackend::Inline(xenia_gpu::GpuSystem::new()))
+    }
+
     pub fn register_export(
         &mut self,
         module: ModuleId,
@@ -64,31 +217,159 @@ impl KernelState {
         self.exports.insert((module, ordinal), (name, func));
     }
 
+    /// Record an import-thunk address resolved at load time. Called once
+    /// per `record_type==1` import in xenia-app's Phase 1. Idempotent: a
+    /// duplicate ordinal overwrites (later wins; in practice the loader
+    /// emits each ordinal once per module).
+    pub fn register_thunk(&mut self, module: ModuleId, ordinal: u16, address: u32) {
+        self.thunks_by_ordinal.insert((module, ordinal), address);
+    }
+
+    /// Resolve a `(module, ordinal)` to its registered thunk address.
+    pub fn resolve_thunk(&self, module: ModuleId, ordinal: u16) -> Option<u32> {
+        self.thunks_by_ordinal.get(&(module, ordinal)).copied()
+    }
+
+    /// Map a pseudo-`HMODULE` (as returned by `XexGetModuleHandle`) back
+    /// to its `ModuleId`. Returns `None` for unknown handles, including
+    /// the loaded XEX's `image_base` (which is *not* a kernel module).
+    pub fn module_id_from_hmodule(&self, handle: u32) -> Option<ModuleId> {
+        match handle {
+            HMODULE_XBOXKRNL => Some(ModuleId::Xboxkrnl),
+            HMODULE_XAM => Some(ModuleId::Xam),
+            _ => None,
+        }
+    }
+
+    /// Dispatch a kernel export on the current HW thread. Uses `mem::replace`
+    /// to temporarily move the active `PpcContext` out of its scheduler slot,
+    /// so the export function can receive `&mut ctx` while also getting
+    /// `&mut self` (which contains the scheduler). Without this, the export
+    /// signature would have to avoid aliasing via a bundle struct — see the
+    /// approved plan's ExportCtx section for the alternative we rejected.
+    ///
+    /// While the export runs, `scheduler.hw_threads[current_hw_id].ctx` holds
+    /// a freshly-constructed placeholder. Exports that reach through
+    /// `state.scheduler` must not touch the current slot's `ctx` field.
+    ///
+    /// Returns `true` when `(module, ordinal)` is registered and its handler ran; otherwise sets
+    /// `r3` to 0, logs a `warn!`, and returns `false`. Panics if the scheduler has no current thread.
+    ///
+    /// **Perf note (First-Pixels M1):** this function fires ~250K/s on Sylpheed (1 import per 40
+    /// guest instructions). A former `#[tracing::instrument]` attribute + two `tracing::info!`
+    /// call sites made up ~28% of `run_execution` wall time on a post-Tier-3 profile — most of
+    /// it in `tracing::span::Span::new` + `Layered::new_span` + `ErrorLayer::on_new_span`. The
+    /// span was at `level = "debug"` but the span **construction** happened unconditionally;
+    /// only the emit was level-gated. Removing the attribute + the two `info!` lines recovers
+    /// the overhead without losing any observability — the `metrics::counter!("kernel.calls",
+    /// "name" => name)` below still tracks per-export counts, and unimplemented lookups still
+    /// emit a `warn!`.
     pub fn call_export(
         &mut self,
         module: ModuleId,
         ordinal: u32,
-        ctx: &mut PpcContext,
-        mem: &mut GuestMemory,
+        mem: &GuestMemory,
     ) -> bool {
-        if let Some(&(name, func)) = self.exports.get(&(module, ordinal)) {
-            tracing::info!(
-                "Kernel call: {:?}:{:#x} ({}) args=[{:#x}, {:#x}, {:#x}, {:#x}]",
-                module, ordinal, name,
-                ctx.gpr[3], ctx.gpr[4], ctx.gpr[5], ctx.gpr[6]
-            );
-            func(ctx, mem, self);
-            tracing::info!("  -> returned {:#x}", ctx.gpr[3]);
+        // The thread whose ctx we're swapping out must be addressed by
+        // `ThreadRef`, not `hw_id` — under per-slot runqueues a bare
+        // `hw_id` alone can't distinguish multiple threads on the same
+        // slot, and Axis 4 migration can change the slot underneath us.
+        let r = self
+            .scheduler
+            .current
+            .expect("call_export: no current thread");
+        let mut ctx = std::mem::replace(
+            self.scheduler.ctx_mut_ref(r),
+            PpcContext::new(),
+        );
+
+        let result = if let Some(&(name, func)) = self.exports.get(&(module, ordinal)) {
+            metrics::counter!("kernel.calls", "name" => name).increment(1);
+            tracing::trace!(target: "probe_calls", "hw={} call={} r3={:#x} r4={:#x} r5={:#x} lr={:#x}",
+                r.hw_id, name, ctx.gpr[3], ctx.gpr[4], ctx.gpr[5], ctx.lr);
+            func(&mut ctx, mem, self);
             true
         } else {
+            metrics::counter!("kernel.unimplemented").increment(1);
             tracing::warn!(
-                "Unimplemented kernel export: {:?}:{:#x}",
-                module, ordinal
+                module = ?module,
+                ordinal = format_args!("{:#x}", ordinal),
+                "unimplemented kernel export"
             );
-            // Return 0 (STATUS_SUCCESS) by default for unimplemented calls
             ctx.gpr[3] = 0;
             false
+        };
+
+        // Restore the (possibly mutated) ctx by ThreadRef. Axis 4
+        // self-migration (KeSetAffinityThread(NtCurrentThread, ...))
+        // updates `scheduler.current` in place; re-read here so we
+        // restore onto the thread's new slot, not its old one.
+        let final_ref = self.scheduler.current.unwrap_or(r);
+        *self.scheduler.ctx_mut_ref(final_ref) = ctx;
+        result
+    }
+
+    /// Axis 4: `KeSetAffinityThread` orchestration. Drives the scheduler's
+    /// migration and fixes up every `ThreadRef` held outside the
+    /// scheduler (kernel object waiter lists, critical-section waiters,
+    /// `interrupts.injected_ref`). Returns the previous mask, or 0 when
+    /// `handle` does not resolve to a scheduler thread.
+    pub fn set_affinity(&mut self, handle: u32, new_mask: u8, mem: &GuestMemory) -> u8 {
+        let Some(r) = self.scheduler.find_by_handle(handle) else {
+            return 0;
+        };
+        let (old_mask, _new_ref, fixup) = self.scheduler.set_affinity_ref(
+            r,
+            new_mask,
+            &mut GuestMemoryPcr(mem),
+        );
+        if let Some(fx) = fixup {
+            // Kernel-object waiter lists. `waiters_mut` yields the list for
+            // every variant that parks threads (`Timer` included), so this
+            // loop cannot drift out of sync with the enum the way a
+            // hand-listed variant match can.
+            for obj in self.objects.values_mut() {
+                if let Some(waiters) = obj.waiters_mut() {
+                    for w in waiters.iter_mut() {
+                        fx.apply(w);
+                    }
+                }
+            }
+            // Critical-section waiters live outside the object table.
+            for list in self.cs_waiters.values_mut() {
+                for w in list.iter_mut() {
+                    fx.apply(w);
+                }
+            }
+            // Interrupt bookkeeping keeps its own `ThreadRef` (`injected_ref`).
+            if let Some(ref mut ir) = self.interrupts.injected_ref {
+                fx.apply(ir);
+            }
+        }
+        old_mask
+    }
+
+    /// Install the initial (main) guest thread on HW slot 0. Called once at
+    /// startup after the app allocates the main stack/PCR/TLS blocks.
+    pub fn install_initial_thread(
+        &mut self,
+        ctx: PpcContext,
+        stack_base: u32,
+        stack_size: u32,
+        pcr_base: u32,
+        tls_base: u32,
+        thread_handle: u32,
+        mem: &GuestMemory,
+    ) {
+        self.scheduler.install_initial_thread(
+            ctx,
+            stack_base,
+            stack_size,
+            pcr_base,
+            tls_base,
+            thread_handle,
+            &mut GuestMemoryPcr(mem),
+        );
     }
 
     pub fn export_name(&self, module: ModuleId, ordinal: u32) -> Option<&'static str> {
@@ -96,60 +377,261 @@ impl KernelState {
     }
 
     pub fn alloc_handle(&mut self) -> u32 {
-        let h = self.next_handle;
-        self.next_handle += 4;
-        h
+        // M2.4: lock-free fetch_add. Relaxed is sufficient — IDs are
+        // opaque tokens; no payload is sequenced against the counter.
+        self.next_handle
+            .fetch_add(4, std::sync::atomic::Ordering::Relaxed)
     }
 
     pub fn alloc_handle_for(&mut self, obj: KernelObject) -> u32 {
         let h = self.alloc_handle();
         self.objects.insert(h, obj);
+        // Each fresh handle starts with one logical reference (the creator).
+        // `NtDuplicateObject` bumps this; `NtClose` decrements; the object is
+        // only dropped when the count reaches zero. See `nt_close` for the
+        // aliased-handle rationale.
+        self.handle_refcount.insert(h, 1);
         h
     }
 
+    // ===== Handle audit hooks =====
+    //
+    // These are no-ops when `audit.enabled == false`, so call sites can
+    // unconditionally invoke them without a hot-path branch in release builds
+    // (the `inline` `if !enabled return` short-circuits before any work).
+
+    /// Build a [`HandleAuditEntry`] for the *current* call-site. `cycle` is the
+    /// timebase of the current HW thread (slot 0 when none is current); `tid` is
+    /// that slot's thread id, or 0 if unknown. `lr`/`source`/`aux` are stored as passed.
+    fn audit_entry(&self, lr: u32, source: &'static str, aux: u64) -> HandleAuditEntry {
+        let hw_id = self.scheduler.current_hw_id().unwrap_or(0);
+        let cycle = self.scheduler.ctx(hw_id).timebase;
+        let tid = self.scheduler.tid(hw_id).unwrap_or(0);
+        HandleAuditEntry { cycle, tid, lr, source, aux }
+    }
+
+    /// Log that `handle` was just created. `kind` should be one of the stable
+    /// labels documented on [`crate::audit::HandleAuditTrail::kind`].
+    pub fn audit_create(&mut self, handle: u32, kind: &'static str, lr: u32, source: &'static str) {
+        // Auditing off: do nothing, not even build the entry.
+        if self.audit.enabled {
+            let created = self.audit_entry(lr, source, 0);
+            self.audit.record_create(handle, kind, created);
+        }
+    }
+
+    /// Log a Set/Pulse/Release/etc. call against `handle`. `aux` carries the
+    /// previous signal state (or per-export-specific data).
+    pub fn audit_signal(&mut self, handle: u32, lr: u32, source: &'static str, aux: u64) {
+        // Auditing off: do nothing, not even build the entry.
+        if self.audit.enabled {
+            let signaled = self.audit_entry(lr, source, aux);
+            self.audit.record_signal(handle, signaled);
+        }
+    }
+
+    /// Log a `Wait*` call against `handle`. `aux` packs `(alertable as u64)
+    /// | (timeout_kind << 8)` etc. — the schema is informal and the dump
+    /// simply prints the raw value.
+    pub fn audit_wait(&mut self, handle: u32, lr: u32, source: &'static str, aux: u64) {
+        // Auditing off: do nothing, not even build the entry.
+        if self.audit.enabled {
+            let waited = self.audit_entry(lr, source, aux);
+            self.audit.record_wait(handle, waited);
+        }
+    }
+
+    /// Log a wake event (called from `wake_eligible_waiters`). `aux` is the
+    /// status code stamped into the woken thread's `gpr[3]`.
+    pub fn audit_wake(&mut self, handle: u32, lr: u32, source: &'static str, aux: u64) {
+        // Auditing off: do nothing, not even build the entry.
+        if self.audit.enabled {
+            let woken = self.audit_entry(lr, source, aux);
+            self.audit.record_wake(handle, woken);
+        }
+    }
+
+    /// Read a TLS slot for the currently running HW thread.
     pub fn tls_get(&self, index: u32) -> u64 {
-        self.tls_slots.get(&index).copied().unwrap_or(0)
+        self.scheduler.tls_get(index)
     }
 
+    /// Write a TLS slot for the currently running HW thread.
     pub fn tls_set(&mut self, index: u32, value: u64) {
-        self.tls_slots.insert(index, value);
+        self.scheduler.tls_set(index, value);
     }
 
+    /// Allocate a new global TLS slot index. Grows every HW thread's
+    /// `tls_values` array to match.
     pub fn tls_alloc(&mut self) -> u32 {
-        let idx = self.next_tls_index;
-        self.next_tls_index += 1;
+        use std::sync::atomic::Ordering;
+        // M2.4: atomic bump. The Scheduler::tls_grow_to call still needs
+        // a coherent post-bump value, so we read the new size from the
+        // fetch_add return.
+        let idx = self.next_tls_index.fetch_add(1, Ordering::Relaxed);
+        let new_size = idx + 1;
+        self.scheduler.tls_grow_to(new_size as usize);
         idx
     }
 
     /// Allocate guest memory from the heap bump allocator.
     /// Returns the base address of the allocated region.
-    pub fn heap_alloc(&mut self, size: u32, mem: &mut GuestMemory) -> Option<u32> {
-        let aligned_size = (size + 0xFFF) & !0xFFF; // Page-align
-        let base = self.heap_cursor;
-        if base.checked_add(aligned_size).is_none() || base + aligned_size > 0x6FFF_FFFF {
-            return None;
-        }
+    pub fn heap_alloc(&mut self, size: u32, mem: &GuestMemory) -> Option<u32> {
+        use std::sync::atomic::Ordering;
+        // Page-align; a size within 0xFFF of `u32::MAX` cannot be rounded up.
+        let aligned_size = size.checked_add(0xFFF)? & !0xFFF;
+        // M2.4: atomic bump. The cursor only moves when the whole range fits
+        // below the heap-region ceiling; a plain `fetch_add` would wrap on an
+        // oversized request and hand later callers addresses below the heap.
+        // A `mem.alloc` failure still leaves the cursor advanced.
+        let base = self
+            .heap_cursor
+            .fetch_update(Ordering::Relaxed, Ordering::Relaxed, |cur| {
+                cur.checked_add(aligned_size).filter(|&top| top <= 0x6FFF_FFFF)
+            })
+            .ok()?;
         let protect = xenia_memory::page_table::MemoryProtect::READ
             | xenia_memory::page_table::MemoryProtect::WRITE;
-        if mem.alloc(base, aligned_size, protect).is_err() {
-            return None;
-        }
-        self.heap_cursor += aligned_size;
+        mem.alloc(base, aligned_size, protect).ok()?;
         Some(base)
     }
 
     /// Allocate a kernel stack.
-    pub fn stack_alloc(&mut self, size: u32, mem: &mut GuestMemory) -> Option<u32> {
+    pub fn stack_alloc(&mut self, size: u32, mem: &GuestMemory) -> Option<u32> {
+        use std::sync::atomic::Ordering;
         let aligned_size = (size + 0xFFF) & !0xFFF;
-        let base = self.stack_cursor;
+        let base = self.stack_cursor.fetch_add(aligned_size, Ordering::Relaxed);
         let protect = xenia_memory::page_table::MemoryProtect::READ
             | xenia_memory::page_table::MemoryProtect::WRITE;
-        if mem.alloc(base, aligned_size, protect).is_err() {
-            return None;
-        }
-        self.stack_cursor += aligned_size;
+        mem.alloc(base, aligned_size, protect).ok()?;
         Some(base + aligned_size) // Return top of stack
     }
+
+    // ===== Timer subsystem =====
+
+    /// Arm (or re-arm) `handle` to fire at `deadline`. Any existing entry for
+    /// the handle is dropped first, so calling this repeatedly is safe; the
+    /// new `(deadline, handle)` pair is appended and the list re-sorted by
+    /// ascending deadline. Only the central pending-fires list is touched —
+    /// the `Timer` object's own `deadline` field is the caller's job (see
+    /// `NtSetTimerEx` in exports.rs).
+    pub fn arm_timer(&mut self, handle: u32, deadline: u64) {
+        self.disarm_timer(handle);
+        self.pending_timer_fires.push((deadline, handle));
+        self.pending_timer_fires.sort_by_key(|entry| entry.0);
+    }
+
+    /// Drop every pending-fire entry that belongs to `handle`. A no-op when
+    /// the handle was never armed, so `NtClose`, `NtCancelTimer`, and the
+    /// periodic-rearm guard can all call it unconditionally.
+    pub fn disarm_timer(&mut self, handle: u32) {
+        self.pending_timer_fires.retain(|entry| entry.1 != handle);
+    }
+
+    /// Deadline of the first entry in the pending-fires list, or `None` when
+    /// nothing is armed (`arm_timer` keeps the list sorted ascending). The
+    /// main loop pairs it with `Scheduler::earliest_wait_deadline`; the
+    /// earlier of the two drives `advance_all_timebases_to`.
+    pub fn earliest_timer_deadline(&self) -> Option<u64> {
+        self.pending_timer_fires.first().map(|entry| entry.0)
+    }
+
+    /// Fire every timer whose deadline is `<= now` (derived from slot 0's
+    /// timebase, matching `parse_timeout`'s "current thread" fallback).
+    /// For each fire: mark the timer `signaled=true`, then rearm it at
+    /// `now + period_ticks` if periodic or clear its `deadline` if one-shot,
+    /// and wake eligible waiters via `exports::wake_eligible_waiters`.
+    /// Periodic rearm is measured from `now`, not from the deadline that
+    /// just expired.
+    ///
+    /// A due entry whose handle no longer resolves to a `Timer` (closed
+    /// without `disarm_timer`) is dropped: nobody is woken and it does not
+    /// count as a fire.
+    ///
+    /// Returns `true` iff any timer fired — the caller uses this to decide
+    /// whether the scheduler round needs a follow-up
+    /// `advance_to_next_wake_if_due` step.
+    pub fn fire_due_timers(&mut self) -> bool {
+        let now = self.scheduler.ctx(0).timebase;
+        let mut fired = false;
+        // `arm_timer` keeps the list sorted ascending, so only the head can
+        // be due.
+        while let Some(&(deadline, handle)) = self.pending_timer_fires.first() {
+            if deadline > now {
+                break;
+            }
+            // Pop the entry before anything else; a periodic timer is put
+            // back by `arm_timer` below.
+            self.pending_timer_fires.remove(0);
+            let Some(KernelObject::Timer {
+                signaled,
+                deadline: obj_deadline,
+                period_ticks,
+                ..
+            }) = self.objects.get_mut(&handle)
+            else {
+                // Stale entry: nothing to signal and nothing to wake, so
+                // `fired` is left alone.
+                continue;
+            };
+            *signaled = true;
+            // `Some(next deadline)` for a periodic timer, `None` for a
+            // one-shot; stamped on the object in a single borrow.
+            let period = *period_ticks;
+            let next = (period > 0).then(|| now + period);
+            *obj_deadline = next;
+            // The object borrow has ended, so `self` is free for the rearm
+            // and the wake.
+            if let Some(next) = next {
+                self.arm_timer(handle, next);
+            }
+            crate::exports::wake_eligible_waiters(self, handle);
+            fired = true;
+        }
+        fired
+    }
+
+    /// Finish a wait whose deadline expired. The main loop calls this right
+    /// after `Scheduler::advance_to_next_wake` hands back `Some((ref, reason))`.
+    ///
+    /// Handle waits (`WaitAny` / `WaitAll`) and critical-section waits get
+    /// `STATUS_TIMEOUT` written to the woken thread's `gpr[3]`, and the
+    /// thread's `ThreadRef` is removed from the waiter lists it was parked
+    /// on, so a later signal can't consume an auto-reset slot into a stale
+    /// waiter. `DelayUntil` is a pure sleep whose `gpr[3]` was already
+    /// pre-populated with `STATUS_SUCCESS` by `ke_delay_execution_thread`,
+    /// so it — like `Suspended` — is left untouched.
+    pub fn handle_timeout_wake(
+        &mut self,
+        r: ThreadRef,
+        reason: xenia_cpu::scheduler::BlockReason,
+    ) {
+        use xenia_cpu::scheduler::BlockReason;
+        const STATUS_TIMEOUT: u64 = 0x0000_0102;
+        match reason {
+            // Nothing to stamp and no waiter list to scrub.
+            BlockReason::DelayUntil(_) | BlockReason::Suspended => {}
+            BlockReason::CriticalSection(cs_ptr) => {
+                self.scheduler.ctx_mut_ref(r).gpr[3] = STATUS_TIMEOUT;
+                if let Some(parked) = self.cs_waiters.get_mut(&cs_ptr) {
+                    parked.retain(|&other| other != r);
+                }
+            }
+            BlockReason::WaitAny { handles, .. } | BlockReason::WaitAll { handles, .. } => {
+                self.scheduler.ctx_mut_ref(r).gpr[3] = STATUS_TIMEOUT;
+                // Handles that no longer resolve to an object, or whose
+                // object has no waiter list, are skipped.
+                for h in handles {
+                    let Some(obj) = self.objects.get_mut(&h) else {
+                        continue;
+                    };
+                    if let Some(parked) = obj.waiters_mut() {
+                        parked.retain(|&other| other != r);
+                    }
+                }
+            }
+        }
+    }
 }
 
 impl Default for KernelState {
@@ -157,3 +639,89 @@ impl Default for KernelState {
         Self::new()
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use xenia_memory::GuestMemory;
+
+    /// Ten consecutive `heap_alloc(0x14)` calls must return distinct
+    /// page-aligned addresses. A previous bug had kernel exports passing 0 as
+    /// `size`, causing the bump allocator to return the same address every
+    /// time — 10 "allocations" that all aliased 0x40105000 and silently
+    /// corrupted the guest's static-constructor state.
+    #[test]
+    fn heap_alloc_advances_for_nonzero_size() {
+        let mem = GuestMemory::new().expect("memory init");
+        let mut state = KernelState::new();
+        let mut seen = Vec::new();
+        for _ in 0..10 {
+            let addr = state
+                .heap_alloc(0x14, &mem)
+                .expect("heap must have room for 0x14 bytes");
+            assert_eq!(addr & 0xFFF, 0, "heap returns page-aligned addresses");
+            assert!(!seen.contains(&addr), "heap returned duplicate address {addr:#x}");
+            seen.push(addr);
+        }
+    }
+
+    /// `heap_alloc(0)` must not advance the cursor (it has nothing to do).
+    /// The kernel exports that previously hit this path did so because they
+    /// read the wrong argument register; guarded at the export boundary now.
+    #[test]
+    fn heap_alloc_zero_is_noop_in_cursor() {
+        use std::sync::atomic::Ordering;
+        let mem = GuestMemory::new().expect("memory init");
+        let mut state = KernelState::new();
+        let before = state.heap_cursor.load(Ordering::Relaxed);
+        let _ = state.heap_alloc(0, &mem);
+        let after = state.heap_cursor.load(Ordering::Relaxed);
+        assert_eq!(before, after, "zero-size alloc must not advance heap cursor");
+    }
+
+    /// M2.4: concurrent handle allocations must produce distinct values.
+    /// Ten threads each allocate 100 handles via `alloc_handle`; the union
+    /// must contain exactly 1000 distinct values, and the maximum equals
+    /// `0x1000 + 4 * (1000 - 1)` (ascending step is 4 per the kernel
+    /// allocator's policy).
+    #[test]
+    fn concurrent_alloc_handle_distinct() {
+        use std::collections::HashSet;
+        use std::sync::Mutex;
+        use std::sync::atomic::{AtomicU32, Ordering};
+
+        // Use a free-standing AtomicU32 mirroring `next_handle`'s semantics;
+        // we can't easily share `&mut KernelState` across threads. The
+        // production code uses the same `fetch_add(4, Relaxed)` recipe.
+        let counter = std::sync::Arc::new(AtomicU32::new(0x1000));
+        let collected: std::sync::Arc<Mutex<HashSet<u32>>> =
+            std::sync::Arc::new(Mutex::new(HashSet::new()));
+
+        let mut handles = Vec::new();
+        for _ in 0..10 {
+            let c = counter.clone();
+            let s = collected.clone();
+            handles.push(std::thread::spawn(move || {
+                let mut local = Vec::with_capacity(100);
+                for _ in 0..100 {
+                    local.push(c.fetch_add(4, Ordering::Relaxed));
+                }
+                let mut g = s.lock().unwrap();
+                for v in local {
+                    g.insert(v);
+                }
+            }));
+        }
+        for h in handles {
+            h.join().unwrap();
+        }
+        let set = collected.lock().unwrap();
+        assert_eq!(set.len(), 1000, "expected 1000 distinct handles, got {}", set.len());
+        assert!(set.iter().all(|h| (h - 0x1000) % 4 == 0));
+        // 1000 distinct step-4 values handed out from 0x1000 must be exactly
+        // 0x1000, 0x1004, …, so both ends of the range are pinned — this is
+        // the maximum the doc comment above promises.
+        assert_eq!(set.iter().copied().min(), Some(0x1000));
+        assert_eq!(set.iter().copied().max(), Some(0x1000 + 4 * (1000 - 1)));
+    }
+}
diff --git a/crates/xenia-kernel/src/thread.rs b/crates/xenia-kernel/src/thread.rs
new file mode 100644
index 0000000..9f4bc53
--- /dev/null
+++ b/crates/xenia-kernel/src/thread.rs
@@ -0,0 +1,68 @@
+//! Guest-thread image allocation — shared by the initial thread setup in
+//! `xenia-app/src/main.rs` and `ExCreateThread`. Stack, PCR, and TLS blocks
+//! all come from the existing kernel bump allocators so layout is consistent.
+
+use xenia_memory::{GuestMemory, MemoryAccess};
+
+use crate::state::KernelState;
+
+/// Addresses the caller passes to `Scheduler::spawn` / the initial-thread
+/// setup. Matches xenia-canary's per-thread allocations: a stack, a PCR, and
+/// a TLS block.
+#[derive(Debug, Clone, Copy)]
+pub struct ThreadImage {
+    pub stack_base: u32,
+    pub stack_size: u32,
+    pub pcr_base: u32,
+    pub tls_base: u32,
+}
+
+/// Allocate stack + PCR + TLS for one guest thread and initialize the PCR
+/// fields that games read in their thread prolog. Returns `None` if the
+/// stack size cannot be page-rounded or any of the three allocations fails.
+///
+/// - Stack comes from `KernelState::stack_alloc` (bump allocator at
+///   0x7100_0000 upward). `ThreadImage::stack_base` is the *bottom*;
+///   callers compute SP as `base + size`.
+/// - PCR and TLS are fixed 4 KiB pages allocated via `heap_alloc` so they
+///   land in the user heap region together with other kernel metadata.
+/// - `hw_thread_id` is written at PCR+0x2C so `KeGetCurrentProcessorNumber`-
+///   style reads from r13 resolve correctly even without that export.
+pub fn allocate_thread_image(
+    kernel: &mut KernelState,
+    mem: &GuestMemory,
+    stack_size: u32,
+    hw_thread_id: u8,
+) -> Option<ThreadImage> {
+    // Round stack size up to a page. A request of 0 (common for ExCreateThread
+    // when the caller lets the kernel pick) gets 0x10_0000 = 1 MiB.
+    // NOTE(review): this comment used to say 16 MiB (0x100_0000) — confirm.
+    let stack_size = if stack_size == 0 {
+        0x10_0000
+    } else {
+        stack_size.checked_add(0xFFF)? & !0xFFF
+    };
+    // stack_alloc returns top-of-stack; we need the base.
+    let stack_top = kernel.stack_alloc(stack_size, mem)?;
+    let stack_base = stack_top - stack_size;
+
+    let pcr_base = kernel.heap_alloc(0x1000, mem)?;
+    let tls_base = kernel.heap_alloc(0x1000, mem)?;
+
+    // PCR layout (canary xboxkrnl/xboxkrnl_module.cc, simplified):
+    //   +0x000  tls_ptr               → TLS block base
+    //   +0x02C  current_processor_id  → HW thread id (0..5)
+    //   +0x100  current_thread        → placeholder non-zero tag
+    //   +0x150  dpc_active            → 0 (no DPC queued)
+    mem.write_u32(pcr_base, tls_base);
+    mem.write_u32(pcr_base + 0x2C, hw_thread_id as u32);
+    mem.write_u32(pcr_base + 0x100, 0x1000);
+    mem.write_u32(pcr_base + 0x150, 0);
+
+    Some(ThreadImage {
+        stack_base,
+        stack_size,
+        pcr_base,
+        tls_base,
+    })
+}
diff --git a/crates/xenia-kernel/src/ui_bridge.rs b/crates/xenia-kernel/src/ui_bridge.rs
new file mode 100644
index 0000000..a4b2289
--- /dev/null
+++ b/crates/xenia-kernel/src/ui_bridge.rs
@@ -0,0 +1,185 @@
+//! Bridge between the kernel (CPU-thread side) and a host UI (main-thread side).
+//!
+//! The kernel side needs to:
+//!   - snapshot the latest host gamepad each time a guest calls
+//!     `XamInputGetState`, and
+//!   - signal the UI when the guest calls `VdSwap` so the UI can upload the
+//!     guest's frontbuffer to a wgpu texture and present it.
+//!
+//! Both directions are expressed as trait-object closures so that `xenia-kernel`
+//! does not have to depend on winit/wgpu/gilrs. The [`UiBridge`] is installed
+//! on [`KernelState::ui`] by `cmd_exec` when `--ui` is passed.
+
+use std::collections::HashMap;
+use std::sync::Arc;
+use std::sync::atomic::{AtomicBool, AtomicU64};
+
+use xenia_gpu::texture_cache::TextureKey;
+use xenia_gpu::xenos_constants::XenosConstantsBlock;
+use xenia_hid::GamepadState;
+use xenia_memory::MemoryAccess;
+
+/// Information surfaced to the UI each time the guest presents a frame.
+///
+/// The leading fields mirror the "interesting" arguments to `VdSwap` in
+/// `xenia-canary/src/xenia/kernel/xboxkrnl/xboxkrnl_video.cc`: the raw
+/// frontbuffer pointer, its dimensions, and the format/color-space enum values
+/// the guest passed through. The rest are kernel/GPU progress counters for the HUD.
+#[derive(Clone, Copy, Debug)]
+pub struct SwapInfo {
+    /// Guest physical/virtual address of the frontbuffer to present.
+    pub frontbuffer_addr: u32,
+    /// Width in pixels as reported by the guest.
+    pub width: u32,
+    /// Height in pixels as reported by the guest.
+    pub height: u32,
+    /// Xenos texture format enum (the guest passes a pointer; we dereference
+    /// it here). 0 means "unknown / guest passed a null pointer".
+    pub texture_format: u32,
+    /// Color-space enum (sRGB / BT.709 / …).
+    pub color_space: u32,
+    /// Monotonically increasing frame counter maintained by the kernel; useful
+    /// for HUD display and deduping.
+    pub frame_index: u64,
+    /// Total PM4 `DRAW_INDX*` packets the GPU has captured since boot.
+    /// Surfaced so the UI HUD can show progress even before the full
+    /// uber-shader pipeline is wired in.
+    pub draws_total: u64,
+    /// Total PM4 packets executed, across all opcodes — useful signal for
+    /// "is the GPU actually getting anything at all to consume?".
+    pub packets_total: u64,
+    /// Most-recent draw's Xenos primitive-type code (0 = none yet).
+    pub last_draw_prim: u32,
+    /// Most-recent draw's vertex count.
+    pub last_draw_vertex_count: u32,
+    /// Indirect-buffer jumps so far (useful "is the game driving the ring
+    /// buffer through IBs?" signal).
+    pub indirect_buffer_jumps: u64,
+    /// WAIT_REG_MEM stalls observed on the GPU slot.
+    pub wait_reg_mem_blocks: u64,
+    /// Summed CPU instruction count across all 6 HW threads. Mirrors the
+    /// `cycle_count` field each `PpcContext` maintains; gives the HUD a live
+    /// "how far has the guest run?" readout.
+    pub instructions_total: u64,
+    /// Active VS shader blob key at the most recent DRAW_INDX* (0 = none).
+    /// P3b: the UI uses this to index into `handles.shader_blobs` so the
+    /// Xenos uber-shader interpreter can upload the matching microcode.
+    pub vs_blob_key: u32,
+    /// Active PS shader blob key at the most recent DRAW_INDX*.
+    pub ps_blob_key: u32,
+    /// P4: total EDRAM→memory resolves fired since boot (TILE_FLUSH
+    /// events). Non-zero means the game is committing pixels.
+    pub resolves_total: u64,
+    /// Subset of `resolves_total` whose byte-copy path succeeded and wrote
+    /// at least one sample into guest memory.
+    pub resolves_copied_total: u64,
+    /// Subset of `resolves_total` that were skipped by the byte-copy path
+    /// due to an unsupported format / MSAA mode / 3D destination.
+    pub resolves_skipped_total: u64,
+    /// P4: unique RT keys seen (from the GPU's internal render-target
+    /// cache). Grows as the game exercises new RT footprints.
+    pub unique_render_targets: u64,
+    /// P6: total graphics-interrupt callbacks delivered (v-sync + CP).
+    /// Non-zero means `VdSetGraphicsInterruptCallback` has been wired end
+    /// to end and callbacks are actually running.
+    pub interrupts_delivered: u64,
+    /// P6: graphics-interrupts queued but dropped (callback unset,
+    /// thread 0 blocked, or already inside another callback).
+    pub interrupts_dropped: u64,
+}
+
+/// Handles the kernel uses to talk to a running host UI.
+///
+/// None of the closures are allowed to block for long — they are called from
+/// the CPU interpreter thread on the hot path.
+#[derive(Clone)]
+pub struct UiBridge {
+    /// Snapshot the host gamepad. Called from `XamInputGetState`.
+    pub gamepad: Arc<dyn Fn() -> GamepadState + Send + Sync>,
+    /// Report that the guest completed a frame. The closure gets the swap
+    /// metadata plus a borrow of guest memory so it can copy the frontbuffer
+    /// bytes into a UI-owned staging buffer before returning. Called from
+    /// `VdSwap` on the CPU thread.
+    pub post_swap: Arc<dyn Fn(SwapInfo, &dyn MemoryAccess) + Send + Sync>,
+    /// Indicates the UI wants the CPU loop to stop. Checked periodically by
+    /// the interpreter loop.
+    pub shutdown: Arc<AtomicBool>,
+    /// Set to `true` when a gamepad is present. `XamInputGetState` returns
+    /// `ERROR_DEVICE_NOT_CONNECTED` when this is `false`.
+    pub gamepad_connected: Arc<AtomicBool>,
+    /// Live CPU instruction counter mirror. The app's run loop publishes
+    /// the sum of `ctx.cycle_count` across HW threads here every ~8k
+    /// instructions so the HUD can report progress between VdSwap events.
+    pub instructions_counter: Arc<AtomicU64>,
+    /// P3b asset publish: `vd_swap` snapshots the GPU's `shader_blobs` and
+    /// constants register region and feeds them to the UI so the Xenos
+    /// uber-shader interpreter has the microcode + constants needed to
+    /// execute the guest draw. Split from `post_swap` so the asset wire
+    /// stays optional — if the UI doesn't need them (headless mode) the
+    /// closure is a no-op.
+    pub publish_xenos_assets:
+        Arc<dyn Fn(HashMap<u32, Vec<u32>>, XenosConstantsBlock) + Send + Sync>,
+    /// P4 frontbuffer publish: at each `VdSwap`, the kernel CPU-side
+    /// detiles the guest frontbuffer (k_8_8_8_8 Tiled2D) into a linear
+    /// RGBA8 buffer and hands it to the UI. The closure receives
+    /// `(width, height, bytes)` — the UI uploads it as a texture.
+    pub publish_frontbuffer:
+        Arc<dyn Fn(u32, u32, Vec<u8>) + Send + Sync>,
+    /// P5 primary texture publish: at each `VdSwap`, the kernel thread
+    /// decodes the PS shader's primary-texture fetch constant (slot 0
+    /// for now) and hands the decoded linear bytes + key to the UI so
+    /// the xenos pipeline can bind a real texture at `@group(1)`.
+    /// Receives `(TextureKey, bytes)`; when `None` is sent the UI
+    /// reverts to its magenta stub.
+    pub publish_texture:
+        Arc<dyn Fn(Option<(TextureKey, Vec<u8>)>) + Send + Sync>,
+}
+
+impl UiBridge {
+    /// Snapshot input state (user 0 only; higher indices are unconnected).
+    pub fn snapshot_gamepad(&self) -> GamepadState {
+        (self.gamepad)()
+    }
+
+    /// True iff `user_index` is 0 and the host reports a gamepad as connected.
+    pub fn is_connected(&self, user_index: u32) -> bool {
+        user_index == 0
+            && self
+                .gamepad_connected
+                .load(std::sync::atomic::Ordering::Relaxed)
+    }
+
+    /// Push a swap event to the UI thread.
+    pub fn notify_swap(&self, info: SwapInfo, mem: &dyn MemoryAccess) {
+        (self.post_swap)(info, mem);
+    }
+
+    /// Snapshot current shader blobs + constants and hand them to the UI.
+    /// Call from `vd_swap` so the UI has the matching assets for every
+    /// draw captured in this frame.
+    pub fn publish_assets(
+        &self,
+        blobs: HashMap<u32, Vec<u32>>,
+        constants: XenosConstantsBlock,
+    ) {
+        (self.publish_xenos_assets)(blobs, constants);
+    }
+
+    /// True iff the UI asked for shutdown.
+    pub fn should_shutdown(&self) -> bool {
+        self.shutdown.load(std::sync::atomic::Ordering::Relaxed)
+    }
+
+    /// Hand a detiled frontbuffer frame to the UI. Called at most once per
+    /// `VdSwap`. `bytes` must be `width * height * 4` bytes in
+    /// `Rgba8Unorm` order (the UI pipeline's expected layout).
+    pub fn publish_frontbuffer(&self, width: u32, height: u32, bytes: Vec<u8>) {
+        (self.publish_frontbuffer)(width, height, bytes);
+    }
+
+    /// Hand one decoded guest texture to the UI. `Some` = update the bound
+    /// slot; `None` = revert to the magenta stub.
+    pub fn publish_texture(&self, tex: Option<(TextureKey, Vec<u8>)>) {
+        (self.publish_texture)(tex);
+    }
+}
diff --git a/crates/xenia-kernel/src/xam.rs b/crates/xenia-kernel/src/xam.rs
index 29d210b..0f98adc 100644
--- a/crates/xenia-kernel/src/xam.rs
+++ b/crates/xenia-kernel/src/xam.rs
@@ -12,10 +12,10 @@ pub fn register_exports(state: &mut KernelState) {
     state.register_export(Xam, 0x02, "NetDll_WSACleanup", stub_success);
 
     // Input
-    state.register_export(Xam, 0x0190, "XamInputGetCapabilities", xam_input_not_connected);
-    state.register_export(Xam, 0x0191, "XamInputGetState", xam_input_not_connected);
-    state.register_export(Xam, 0x0192, "XamInputSetState", xam_input_not_connected);
-    state.register_export(Xam, 0x0198, "XamInputGetKeystrokeEx", xam_input_not_connected);
+    state.register_export(Xam, 0x0190, "XamInputGetCapabilities", xam_input_get_capabilities);
+    state.register_export(Xam, 0x0191, "XamInputGetState", xam_input_get_state);
+    state.register_export(Xam, 0x0192, "XamInputSetState", xam_input_set_state);
+    state.register_export(Xam, 0x0198, "XamInputGetKeystrokeEx", xam_input_get_keystroke);
 
     // Inactivity
     state.register_export(Xam, 0x01A0, "XamEnableInactivityProcessing", stub_success);
@@ -94,39 +94,114 @@ pub fn register_exports(state: &mut KernelState) {
 
 // ===== Generic stubs =====
 
-fn stub_success(ctx: &mut PpcContext, _mem: &mut GuestMemory, _state: &mut KernelState) {
+fn stub_success(ctx: &mut PpcContext, _mem: &GuestMemory, _state: &mut KernelState) {
     ctx.gpr[3] = 0;
 }
 
-fn stub_return_zero(ctx: &mut PpcContext, _mem: &mut GuestMemory, _state: &mut KernelState) {
+fn stub_return_zero(ctx: &mut PpcContext, _mem: &GuestMemory, _state: &mut KernelState) {
     ctx.gpr[3] = 0;
 }
 
-fn stub_error_no_more_files(ctx: &mut PpcContext, _mem: &mut GuestMemory, _state: &mut KernelState) {
+fn stub_error_no_more_files(ctx: &mut PpcContext, _mem: &GuestMemory, _state: &mut KernelState) {
     ctx.gpr[3] = 0x12; // ERROR_NO_MORE_FILES
 }
 
 // ===== Input =====
 
-fn xam_input_not_connected(ctx: &mut PpcContext, _mem: &mut GuestMemory, _state: &mut KernelState) {
-    ctx.gpr[3] = 0x48F; // ERROR_DEVICE_NOT_CONNECTED
+/// Helper: pack the 12 bytes of a `GamepadState` into a `u128` key used to
+/// detect input changes (low 4 bytes stay zero). Cheap to compare across frames.
+fn gamepad_key(state: &xenia_hid::GamepadState) -> u128 {
+    let mut packed = [0u8; 16];
+    // Big-endian layout: buttons, LT, RT, LX, LY, RX, RY.
+    packed[10..12].copy_from_slice(&state.right_stick_y.to_be_bytes());
+    packed[8..10].copy_from_slice(&state.right_stick_x.to_be_bytes());
+    packed[6..8].copy_from_slice(&state.left_stick_y.to_be_bytes());
+    packed[4..6].copy_from_slice(&state.left_stick_x.to_be_bytes());
+    packed[2..4].copy_from_slice(&[state.left_trigger, state.right_trigger]);
+    packed[0..2].copy_from_slice(&state.buttons.to_be_bytes());
+    u128::from_be_bytes(packed)
+}
+
+/// `XamInputGetCapabilities`: fill the guest's `X_INPUT_CAPABILITIES` via
+/// `xenia_hid::write_input_capabilities` when the UI bridge reports a pad for
+/// the requested user; otherwise return `DEVICE_NOT_CONNECTED` and write
+/// nothing.
+fn xam_input_get_capabilities(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
+    // r3 = user_index, r4 = flags, r5 = out X_INPUT_CAPABILITIES*
+    let user = ctx.gpr[3] as u32;
+    let out_ptr = ctx.gpr[5] as u32;
+    ctx.gpr[3] = match state.ui.as_ref() {
+        Some(ui) if ui.is_connected(user) => {
+            xenia_hid::write_input_capabilities(mem, out_ptr);
+            xenia_hid::errors::SUCCESS as u64
+        }
+        _ => xenia_hid::errors::DEVICE_NOT_CONNECTED as u64,
+    };
+}
+
+/// `XamInputGetState`: write the host gamepad snapshot to the guest, bumping
+/// the packet number only when the packed input differs from the last call.
+fn xam_input_get_state(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
+    // r3 = user_index, r4 = flags, r5 = out X_INPUT_STATE*
+    let user = ctx.gpr[3] as u32;
+    let out_ptr = ctx.gpr[5] as u32;
+    let gamepad = match state.ui.as_ref() {
+        Some(ui) if ui.is_connected(user) => ui.snapshot_gamepad(),
+        _ => {
+            ctx.gpr[3] = xenia_hid::errors::DEVICE_NOT_CONNECTED as u64;
+            return;
+        }
+    };
+    let key = gamepad_key(&gamepad);
+    if state.last_input_bytes != key {
+        state.last_input_bytes = key;
+        state.input_packet_number = state.input_packet_number.wrapping_add(1);
+    }
+    xenia_hid::write_input_state(mem, out_ptr, state.input_packet_number, &gamepad);
+    ctx.gpr[3] = xenia_hid::errors::SUCCESS as u64;
+}
+
+/// `XamInputSetState`: accept a vibration request without acting on it.
+fn xam_input_set_state(ctx: &mut PpcContext, _mem: &GuestMemory, state: &mut KernelState) {
+    // r3 = user_index, r4 = flags, r5 = X_INPUT_VIBRATION*
+    // Rumble is out of scope for Phase 1: the vibration payload is never
+    // read and nothing shakes. A connected pad still gets success so games
+    // don't retry in a tight loop; without a UI bridge, or for a user the
+    // bridge reports as absent, the answer is DEVICE_NOT_CONNECTED.
+    let user = ctx.gpr[3] as u32;
+    ctx.gpr[3] = match state.ui.as_ref() {
+        Some(ui) if ui.is_connected(user) => xenia_hid::errors::SUCCESS as u64,
+        _ => xenia_hid::errors::DEVICE_NOT_CONNECTED as u64,
+    };
+}
+
+fn xam_input_get_keystroke(
+    ctx: &mut PpcContext,
+    _mem: &GuestMemory,
+    _state: &mut KernelState,
+) {
+    // No keyboard input in Phase 1 — always "queue empty". Games that only
+    // use the gamepad ignore this return code; those that drive text entry
+    // through the keystroke queue simply get a permanently empty queue, which
+    // manifests as no virtual-keyboard input — acceptable for minimal UI.
+    ctx.gpr[3] = xenia_hid::errors::EMPTY as u64;
 }
 
 // ===== Loader =====
 
-fn xam_loader_launch_title(ctx: &mut PpcContext, _mem: &mut GuestMemory, _state: &mut KernelState) {
+fn xam_loader_launch_title(ctx: &mut PpcContext, _mem: &GuestMemory, _state: &mut KernelState) {
     tracing::warn!("XamLoaderLaunchTitle called");
     ctx.gpr[3] = 0;
 }
 
-fn xam_loader_terminate_title(ctx: &mut PpcContext, _mem: &mut GuestMemory, _state: &mut KernelState) {
+fn xam_loader_terminate_title(ctx: &mut PpcContext, _mem: &GuestMemory, _state: &mut KernelState) {
     tracing::warn!("XamLoaderTerminateTitle called");
     ctx.gpr[3] = 0;
 }
 
 // ===== Task =====
 
-fn xam_task_schedule(ctx: &mut PpcContext, _mem: &mut GuestMemory, state: &mut KernelState) {
+fn xam_task_schedule(ctx: &mut PpcContext, _mem: &GuestMemory, state: &mut KernelState) {
     let handle = state.alloc_handle();
     tracing::info!("XamTaskSchedule: handle={:#x}", handle);
     ctx.gpr[3] = 0;
@@ -134,7 +209,7 @@ fn xam_task_schedule(ctx: &mut PpcContext, _mem: &mut GuestMemory, state: &mut K
 
 // ===== Alloc =====
 
-fn xam_alloc(ctx: &mut PpcContext, mem: &mut GuestMemory, state: &mut KernelState) {
+fn xam_alloc(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
     // r3 = flags, r4 = size, r5 = out_ptr_ptr
     let size = ctx.gpr[4] as u32;
     let out_ptr = ctx.gpr[5] as u32;
@@ -154,7 +229,7 @@ fn xam_alloc(ctx: &mut PpcContext, mem: &mut GuestMemory, state: &mut KernelStat
 
 // ===== User =====
 
-fn xam_user_get_xuid(ctx: &mut PpcContext, mem: &mut GuestMemory, _state: &mut KernelState) {
+fn xam_user_get_xuid(ctx: &mut PpcContext, mem: &GuestMemory, _state: &mut KernelState) {
     // r3 = user_index, r4 = xuid_ptr
     let xuid_ptr = ctx.gpr[4] as u32;
     if xuid_ptr != 0 {
@@ -163,7 +238,7 @@ fn xam_user_get_xuid(ctx: &mut PpcContext, mem: &mut GuestMemory, _state: &mut K
     ctx.gpr[3] = 0;
 }
 
-fn xam_user_get_name(ctx: &mut PpcContext, mem: &mut GuestMemory, _state: &mut KernelState) {
+fn xam_user_get_name(ctx: &mut PpcContext, mem: &GuestMemory, _state: &mut KernelState) {
     // r3 = user_index, r4 = buffer, r5 = buffer_size
     let buffer = ctx.gpr[4] as u32;
     if buffer != 0 {
@@ -172,14 +247,14 @@ fn xam_user_get_name(ctx: &mut PpcContext, mem: &mut GuestMemory, _state: &mut K
     ctx.gpr[3] = 0;
 }
 
-fn xam_user_read_profile_settings(ctx: &mut PpcContext, _mem: &mut GuestMemory, _state: &mut KernelState) {
+fn xam_user_read_profile_settings(ctx: &mut PpcContext, _mem: &GuestMemory, _state: &mut KernelState) {
     // Return error — no profile
     ctx.gpr[3] = 0x0000_048B; // ERROR_NOT_FOUND
 }
 
 // ===== System =====
 
-fn xam_get_execution_id(ctx: &mut PpcContext, mem: &mut GuestMemory, state: &mut KernelState) {
+fn xam_get_execution_id(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
     // r3 = execution_id_ptr_ptr — write pointer to execution info
     let ptr_ptr = ctx.gpr[3] as u32;
     if ptr_ptr != 0 {
@@ -197,25 +272,25 @@ fn xam_get_execution_id(ctx: &mut PpcContext, mem: &mut GuestMemory, state: &mut
     ctx.gpr[3] = 0;
 }
 
-fn xam_get_system_version(ctx: &mut PpcContext, _mem: &mut GuestMemory, _state: &mut KernelState) {
+fn xam_get_system_version(ctx: &mut PpcContext, _mem: &GuestMemory, _state: &mut KernelState) {
     ctx.gpr[3] = 0x2000_0000; // System version
 }
 
 // ===== Notify =====
 
-fn xam_notify_create_listener(ctx: &mut PpcContext, _mem: &mut GuestMemory, state: &mut KernelState) {
+fn xam_notify_create_listener(ctx: &mut PpcContext, _mem: &GuestMemory, state: &mut KernelState) {
     let handle = state.alloc_handle();
     ctx.gpr[3] = handle as u64;
 }
 
-fn xnotify_get_next(ctx: &mut PpcContext, _mem: &mut GuestMemory, _state: &mut KernelState) {
+fn xnotify_get_next(ctx: &mut PpcContext, _mem: &GuestMemory, _state: &mut KernelState) {
     // r3 = handle, r4 = id_ptr, r5 = param_ptr
     ctx.gpr[3] = 0; // FALSE (no notifications)
 }
 
 // ===== Session =====
 
-fn xam_session_create_handle(ctx: &mut PpcContext, mem: &mut GuestMemory, state: &mut KernelState) {
+fn xam_session_create_handle(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
     // r3 = handle_ptr
     let handle_ptr = ctx.gpr[3] as u32;
     let handle = state.alloc_handle();
@@ -227,19 +302,19 @@ fn xam_session_create_handle(ctx: &mut PpcContext, mem: &mut GuestMemory, state:
 
 // ===== Locale =====
 
-fn xget_avpack(ctx: &mut PpcContext, _mem: &mut GuestMemory, _state: &mut KernelState) {
+fn xget_avpack(ctx: &mut PpcContext, _mem: &GuestMemory, _state: &mut KernelState) {
     ctx.gpr[3] = 0x16; // HDMI
 }
 
-fn xget_game_region(ctx: &mut PpcContext, _mem: &mut GuestMemory, _state: &mut KernelState) {
+fn xget_game_region(ctx: &mut PpcContext, _mem: &GuestMemory, _state: &mut KernelState) {
     ctx.gpr[3] = 0xFF; // All regions
 }
 
-fn xget_language(ctx: &mut PpcContext, _mem: &mut GuestMemory, _state: &mut KernelState) {
+fn xget_language(ctx: &mut PpcContext, _mem: &GuestMemory, _state: &mut KernelState) {
     ctx.gpr[3] = 1; // English
 }
 
-fn xget_video_mode(ctx: &mut PpcContext, mem: &mut GuestMemory, _state: &mut KernelState) {
+fn xget_video_mode(ctx: &mut PpcContext, mem: &GuestMemory, _state: &mut KernelState) {
     // r3 = video_mode_ptr
     let ptr = ctx.gpr[3] as u32;
     if ptr != 0 {