xenia-kernel: HLE expansion, scheduler integration, audit + UI bridge

Major HLE buildout in exports.rs: KeInitializeSemaphore now seeds count/limit, XexGetModuleHandle/XexGetProcedureAddress use distinct HMODULE_XBOXKRNL/HMODULE_XAM pseudo-handles with a reverse (ModuleId,ordinal)→thunk_addr map, plus sweeping additions across sync primitives, file I/O, semaphores, events, threads, and allocator paths needed to advance Sylpheed past VdSwap=2.

New modules:
- thread.rs — ThreadRef + per-thread suspension/wake plumbing
- interrupts.rs — IRQ delivery, pending-IRQ slots, IPI helpers
- path.rs — guest path normalization (D:\\, game:\\, etc.)
- audit.rs — --trace-handles harness backing the handle audit
- ui_bridge.rs — kernel-side endpoint of the xenia-ui bridge (input snapshots, framebuffer publish handles)

state.rs grows to own the HW-slot scheduler state, the new audit / UI bridge handles, and the per-handle reverse maps. xam.rs and objects.rs follow suit for the HLE additions.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-01 16:29:00 +02:00
parent f1fadb5398
commit 5f0d6487ea
11 changed files with 6369 additions and 270 deletions
--- a/crates/xenia-kernel/src/state.rs
+++ b/crates/xenia-kernel/src/state.rs
@@ -1,11 +1,35 @@
 use std::collections::HashMap;
-use xenia_cpu::PpcContext;
-use xenia_memory::GuestMemory;
+use xenia_cpu::scheduler::{PcrWriter, Scheduler};
+use xenia_cpu::{PpcContext, ThreadRef};
+use xenia_memory::{GuestMemory, MemoryAccess};
+use xenia_vfs::VfsDevice;

+use crate::audit::{HandleAudit, HandleAuditEntry};
 use crate::objects::KernelObject;
+use crate::ui_bridge::UiBridge;
+
+/// Adapter: write PCR+0x2C on guest memory. Lets `Scheduler::spawn` and
+/// Axis 4's migration call through without `xenia-cpu` depending on the
+/// memory crate.
+pub struct GuestMemoryPcr<'a>(pub &'a GuestMemory);
+impl PcrWriter for GuestMemoryPcr<'_> {
+    fn write_pcr_id(&mut self, pcr_base: u32, hw_id: u8) {
+        // `GuestMemory::write_u32` takes `&self` post-M2 trait flip; the
+        // wrapping `&'a GuestMemory` is sufficient.
+        self.0.write_u32(pcr_base + 0x2C, hw_id as u32);
+    }
+}

 /// Function signature for HLE kernel exports.
-pub type KernelExportFn = fn(&mut PpcContext, &mut GuestMemory, &mut KernelState);
+///
+/// The first argument is the **currently running** HW thread's `PpcContext`,
+/// which the caller has temporarily moved out of the scheduler slot to avoid
+/// aliasing. Exports that only touch register/GPR state use `ctx` directly;
+/// exports that need scheduler state (spawn/park/wake/tls/etc.) reach
+/// through `state.scheduler` — note that `state.scheduler.hw_threads[current]`
+/// holds a placeholder `PpcContext` for the duration of the call, not the
+/// live one passed as `ctx`.
+pub type KernelExportFn = fn(&mut PpcContext, &GuestMemory, &mut KernelState);

 /// Module identifier for kernel exports.
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
@@ -15,45 +39,174 @@ pub enum ModuleId {
    Xbdm,
 }

+/// Pseudo-`HMODULE` values returned by `XexGetModuleHandle` and accepted by
+/// `XexGetProcedureAddress`. Distinct from real loaded-image bases
+/// (>=0x82000000) and from kernel handles (0x1000+, allocated by
+/// `alloc_handle`). The 0xFFFE_xxxx prefix is unused by both guest segments
+/// and our handle allocator.
+pub const HMODULE_XBOXKRNL: u32 = 0xFFFE_0001;
+pub const HMODULE_XAM: u32 = 0xFFFE_0002;
+
 /// Central kernel state tracking all guest OS state.
 pub struct KernelState {
    exports: HashMap<(ModuleId, u32), (&'static str, KernelExportFn)>,
-    next_handle: u32,
-    pub tls_slots: HashMap<u32, u64>,
-    next_tls_index: u32,
+    /// M2.4: bump allocator for kernel handles. `AtomicU32` so concurrent
+    /// HLE calls under M3 can `fetch_add` without a lock. `Relaxed` is
+    /// fine — the allocated value is a fresh ID with no prior payload to
+    /// publish; observers (the kernel object table) are guarded by
+    /// their own synchronization.
+    next_handle: std::sync::atomic::AtomicU32,
+    /// Scheduler managing all emulated HW threads + their per-slot
+    /// runqueues. Starts empty — the app installs the initial guest thread
+    /// on slot 0 via `KernelState::install_initial_thread` once it has the
+    /// entry address.
+    pub scheduler: Scheduler,
+    /// TLS slot allocator — index counter only. Per-thread *values* live on
+    /// `GuestThread::tls_values` (see scheduler). M2.4: `AtomicU32`.
+    pub next_tls_index: std::sync::atomic::AtomicU32,
+    /// Critical-section waiter map: guest `cs_ptr` → guest threads parked
+    /// on it. Critical sections are in guest memory (not kernel objects),
+    /// so their waiter list lives here rather than on an object.
+    pub cs_waiters: HashMap<u32, Vec<ThreadRef>>,
    /// Kernel object table: handle → object
    pub objects: HashMap<u32, KernelObject>,
-    /// Bump allocator for guest heap (NtAllocateVirtualMemory etc.)
-    pub heap_cursor: u32,
-    /// Stack allocator cursor for MmCreateKernelStack
-    pub stack_cursor: u32,
+    /// Bump allocator for guest heap (NtAllocateVirtualMemory etc.).
+    /// M2.4: `AtomicU32` for lock-free concurrent allocation.
+    pub heap_cursor: std::sync::atomic::AtomicU32,
+    /// Stack allocator cursor for MmCreateKernelStack. M2.4: atomic.
+    pub stack_cursor: std::sync::atomic::AtomicU32,
    /// GPU command buffer address (set by VdGetSystemCommandBuffer)
    pub gpu_command_buffer: u32,
+    /// GPU backend. M1.4: was `xenia_gpu::GpuSystem` directly, now a
+    /// [`xenia_gpu::GpuBackend`] enum so the kernel can hold either an
+    /// inline `GpuSystem` (synchronous, default) or a `GpuHandle` proxy
+    /// pointing at a worker thread (`--gpu-thread`). Forwarding methods
+    /// on the enum keep call sites in [`crate::exports`] terse.
+    pub gpu: xenia_gpu::GpuBackend,
+    /// Monotonic packet number returned by `XamInputGetState`. Games detect
+    /// input changes by watching this increment.
+    pub input_packet_number: u32,
+    /// Previous gamepad snapshot; `input_packet_number` only advances when
+    /// the state bytes actually change, matching host XInput semantics.
+    pub last_input_bytes: u128,
    /// Image base of the loaded XEX (for XexExecutableModuleHandle etc.)
    pub image_base: u32,
-    /// Next thread ID
-    pub next_thread_id: u32,
+    /// Next thread ID. M2.4: atomic.
+    pub next_thread_id: std::sync::atomic::AtomicU32,
+    /// Virtual file system for NtCreateFile/NtReadFile/etc. The app mounts
+    /// the disc image or host directory into this slot; file I/O handlers
+    /// route all reads through it.
+    pub vfs: Option<Box<dyn VfsDevice>>,
+    /// Bridge to the host UI. `None` when running headless. Installed by
+    /// `cmd_exec` when the user passes `--ui`.
+    pub ui: Option<UiBridge>,
+    /// P6 — graphics interrupt + synthetic v-sync bookkeeping. Registers
+    /// the callback set by `VdSetGraphicsInterruptCallback` and tracks
+    /// the paused-context snapshot while HW thread 0 is running it.
+    pub interrupts: crate::interrupts::InterruptState,
+    /// Per-handle refcount. Since `NtDuplicateObject` aliases (returns the
+    /// source handle value as the "new" handle rather than minting a fresh
+    /// id), a single handle commonly has multiple logical references. This
+    /// map tracks that count so a stray `NtClose` on one reference doesn't
+    /// destroy the object while another reference is still live. Canary's
+    /// `ObjectTable::ReleaseHandle` (object_table.cc:189) is the parity
+    /// reference. Initialized to 1 in `alloc_handle_for`; incremented in
+    /// `nt_duplicate_object` when `DUPLICATE_CLOSE_SOURCE` is absent;
+    /// decremented in `nt_close` which drops the underlying object only
+    /// when the count reaches zero.
+    pub handle_refcount: HashMap<u32, u32>,
+    /// Pending timer expirations — `(deadline, handle)` sorted ascending by
+    /// deadline. Pushed by `arm_timer`, popped by `fire_due_timers`. Kept in
+    /// lockstep with the per-`Timer` object's `deadline` field via the
+    /// `arm_timer`/`disarm_timer` helpers. See the plan's step 3/6 for the
+    /// design rationale — timer deadlines coexist with
+    /// `Scheduler::timed_waits` but track a different class (signaled object
+    /// fires, not thread wake-ups).
+    pub pending_timer_fires: Vec<(u64, u32)>,
+    /// Per-handle signal/wait/wake audit trail. Default `enabled=false` →
+    /// every record method is a no-op. Flip via `--trace-handles`/
+    /// `XENIA_TRACE_HANDLES` to diagnose missing-signal deadlocks (handles
+    /// 0x10FC / 0x1014 / 0x1104 / 0x10DC / 0x10F0 specifically). See
+    /// [`crate::audit`] for layout.
+    pub audit: HandleAudit,
+    /// M2.2 — banked reservation table for `lwarx`/`stwcx.` under M3's
+    /// per-HW-thread parallelism. Always allocated. Consulted by the
+    /// interpreter when `reservations.is_enabled()` is true; otherwise
+    /// the legacy per-`PpcContext` fields drive observable behavior.
+    /// Settable via `--reservations-table` / `XENIA_RESERVATIONS_TABLE=1`
+    /// for golden verification, or implicitly under `--parallel`.
+    /// See [`xenia_cpu::ReservationTable`] for the concurrency model.
+    pub reservations: std::sync::Arc<xenia_cpu::ReservationTable>,
+    /// Map from `(module, ordinal)` to the guest-side import-thunk address
+    /// resolved at load time. Reverse of `xenia-app/src/main.rs`'s
+    /// `thunk_map`. Populated from xenia-app's Phase 1 (record_type==1
+    /// only). Used by `xex_get_procedure_address` to resolve ordinals back
+    /// to callable thunks.
+    thunks_by_ordinal: HashMap<(ModuleId, u16), u32>,
+    /// First-Pixels diagnostic latch. Set the first time
+    /// `RtlRaiseException` fires with code `0xE06D7363` (MSVC C++ throw)
+    /// so the deep stack-walk + `runtime_error` decode in
+    /// `rtl_raise_exception` only emits once per run, regardless of how
+    /// many subsequent throws fire. Reset on each fresh process start.
+    pub cxx_throw_logged: bool,
 }

 impl KernelState {
-    pub fn new() -> Self {
+    /// Construct a kernel with the supplied GPU backend.
+    ///
+    /// The caller (typically `cmd_exec_inner`) decides whether to install
+    /// an inline backend (default) or a threaded one (`--gpu-thread`).
+    /// Call sites that don't care build via [`Self::new`], which passes an
+    /// inline backend to this constructor; a threaded backend must be
+    /// handed to this constructor explicitly.
+    pub fn with_gpu(gpu: xenia_gpu::GpuBackend) -> Self {
+        // Scheduler starts empty; the app installs the initial thread on
+        // slot 0 via `install_initial_thread` right after construction.
+        let mut scheduler = Scheduler::new();
+        use std::sync::atomic::AtomicU32;
+        let reservations = std::sync::Arc::new(xenia_cpu::ReservationTable::new());
+        // M3.7 — wire the reservation table to the scheduler so
+        // `spawn`/`install_initial_thread` populate every PpcContext's
+        // `reservation_table` clone. The table is `disabled` by
+        // default; `--reservations-table` / `XENIA_RESERVATIONS_TABLE`
+        // / M3 spawn flip it on.
+        scheduler.set_reservation_table(Some(reservations.clone()));
        let mut state = Self {
            exports: HashMap::new(),
-            next_handle: 0x1000,
-            tls_slots: HashMap::new(),
-            next_tls_index: 0,
+            next_handle: AtomicU32::new(0x1000),
+            scheduler,
+            next_tls_index: AtomicU32::new(0),
+            cs_waiters: HashMap::new(),
            objects: HashMap::new(),
-            heap_cursor: 0x4000_0000, // Start of user heap region
-            stack_cursor: 0x7100_0000, // Above main stack
+            heap_cursor: AtomicU32::new(0x4000_0000), // Start of user heap region
+            stack_cursor: AtomicU32::new(0x7100_0000), // Above main stack
            gpu_command_buffer: 0,
+            gpu,
+            input_packet_number: 0,
+            last_input_bytes: 0,
            image_base: 0,
-            next_thread_id: 1,
+            next_thread_id: AtomicU32::new(1),
+            vfs: None,
+            ui: None,
+            interrupts: crate::interrupts::InterruptState::default(),
+            handle_refcount: HashMap::new(),
+            pending_timer_fires: Vec::new(),
+            audit: HandleAudit::default(),
+            reservations,
+            thunks_by_ordinal: HashMap::new(),
+            cxx_throw_logged: false,
        };
        crate::exports::register_exports(&mut state);
        crate::xam::register_exports(&mut state);
        state
    }

+    /// Default constructor — installs an inline `GpuSystem`. Kept for
+    /// callers that don't (yet) thread a `GpuBackend` choice through.
+    pub fn new() -> Self {
+        Self::with_gpu(xenia_gpu::GpuBackend::Inline(xenia_gpu::GpuSystem::new()))
+    }
+
    pub fn register_export(
        &mut self,
        module: ModuleId,
@@ -64,31 +217,159 @@ impl KernelState {
        self.exports.insert((module, ordinal), (name, func));
    }

+    /// Record an import-thunk address resolved at load time. Called once
+    /// per `record_type==1` import in xenia-app's Phase 1. Idempotent: a
+    /// duplicate ordinal overwrites (later wins; in practice the loader
+    /// emits each ordinal once per module).
+    pub fn register_thunk(&mut self, module: ModuleId, ordinal: u16, address: u32) {
+        self.thunks_by_ordinal.insert((module, ordinal), address);
+    }
+
+    /// Resolve a `(module, ordinal)` to its registered thunk address.
+    pub fn resolve_thunk(&self, module: ModuleId, ordinal: u16) -> Option<u32> {
+        self.thunks_by_ordinal.get(&(module, ordinal)).copied()
+    }
+
+    /// Map a pseudo-`HMODULE` (as returned by `XexGetModuleHandle`) back
+    /// to its `ModuleId`. Returns `None` for unknown handles, including
+    /// the loaded XEX's `image_base` (which is *not* a kernel module).
+    pub fn module_id_from_hmodule(&self, handle: u32) -> Option<ModuleId> {
+        match handle {
+            HMODULE_XBOXKRNL => Some(ModuleId::Xboxkrnl),
+            HMODULE_XAM => Some(ModuleId::Xam),
+            _ => None,
+        }
+    }
+
+    /// Dispatch a kernel export on the current HW thread. Uses `mem::replace`
+    /// to temporarily move the active `PpcContext` out of its scheduler slot,
+    /// so the export function can receive `&mut ctx` while also getting
+    /// `&mut self` (which contains the scheduler). Without this, the export
+    /// signature would have to avoid aliasing via a bundle struct — see the
+    /// approved plan's ExportCtx section for the alternative we rejected.
+    ///
+    /// While the export runs, `scheduler.hw_threads[current_hw_id].ctx` holds
+    /// a freshly-constructed placeholder. Exports that reach through
+    /// `state.scheduler` must not touch the current slot's `ctx` field. Panics if `scheduler.current` is `None`.
+    ///
+    /// **Perf note (First-Pixels M1):** this function fires ~250K/s on
+    /// Sylpheed (1 import per 40 guest instructions). A former
+    /// `#[tracing::instrument]` attribute + two `tracing::info!` call
+    /// sites made up ~28% of `run_execution` wall time on a post-Tier-3
+    /// profile — most of it in `tracing::span::Span::new` +
+    /// `Layered::new_span` + `ErrorLayer::on_new_span`. The span was at
+    /// `level = "debug"` but the span **construction** happened
+    /// unconditionally; only the emit was level-gated. Removing the
+    /// attribute + the two `info!` lines recovers the overhead without
+    /// losing any observability — the `metrics::counter!("kernel.calls",
+    /// "name" => name)` below still tracks per-export counts, and
+    /// unimplemented lookups still emit a `warn!`.
    pub fn call_export(
        &mut self,
        module: ModuleId,
        ordinal: u32,
-        ctx: &mut PpcContext,
-        mem: &mut GuestMemory,
+        mem: &GuestMemory,
    ) -> bool {
-        if let Some(&(name, func)) = self.exports.get(&(module, ordinal)) {
-            tracing::info!(
-                "Kernel call: {:?}:{:#x} ({}) args=[{:#x}, {:#x}, {:#x}, {:#x}]",
-                module, ordinal, name,
-                ctx.gpr[3], ctx.gpr[4], ctx.gpr[5], ctx.gpr[6]
-            );
-            func(ctx, mem, self);
-            tracing::info!("  -> returned {:#x}", ctx.gpr[3]);
+        // The thread whose ctx we're swapping out must be addressed by
+        // `ThreadRef`, not `hw_id` — under per-slot runqueues a bare
+        // `hw_id` alone can't distinguish multiple threads on the same
+        // slot, and Axis 4 migration can change the slot underneath us.
+        let r = self
+            .scheduler
+            .current
+            .expect("call_export: no current thread");
+        let mut ctx = std::mem::replace(
+            self.scheduler.ctx_mut_ref(r),
+            PpcContext::new(),
+        );
+
+        let result = if let Some(&(name, func)) = self.exports.get(&(module, ordinal)) {
+            metrics::counter!("kernel.calls", "name" => name).increment(1);
+            tracing::trace!(target: "probe_calls", "hw={} call={} r3={:#x} r4={:#x} r5={:#x} lr={:#x}",
+                r.hw_id, name, ctx.gpr[3], ctx.gpr[4], ctx.gpr[5], ctx.lr);
+            func(&mut ctx, mem, self);
            true
        } else {
+            metrics::counter!("kernel.unimplemented").increment(1);
            tracing::warn!(
-                "Unimplemented kernel export: {:?}:{:#x}",
-                module, ordinal
+                module = ?module,
+                ordinal = format_args!("{:#x}", ordinal),
+                "unimplemented kernel export"
            );
-            // Return 0 (STATUS_SUCCESS) by default for unimplemented calls
            ctx.gpr[3] = 0;
            false
+        };
+
+        // Restore the (possibly mutated) ctx by ThreadRef. Axis 4
+        // self-migration (KeSetAffinityThread(NtCurrentThread, ...))
+        // updates `scheduler.current` in place; re-read here so we
+        // restore onto the thread's new slot, not its old one.
+        let final_ref = self.scheduler.current.unwrap_or(r);
+        *self.scheduler.ctx_mut_ref(final_ref) = ctx;
+        result
+    }
+
+    /// Axis 4: `KeSetAffinityThread` orchestration. Drives the scheduler's
+    /// migration and fixes up `ThreadRef`s held outside the scheduler
+    /// (Event/Semaphore/Thread/Mutex waiter lists, critical-section waiters,
+    /// `interrupts.injected_ref`). Returns the previous mask, or 0 if the handle is unknown.
+    pub fn set_affinity(&mut self, handle: u32, new_mask: u8, mem: &GuestMemory) -> u8 {
+        let Some(r) = self.scheduler.find_by_handle(handle) else {
+            return 0;
+        };
+        let (old_mask, _new_ref, fixup) = self.scheduler.set_affinity_ref(
+            r,
+            new_mask,
+            &mut GuestMemoryPcr(mem),
+        );
+        if let Some(fx) = fixup {
+            use crate::objects::KernelObject;
+            for obj in self.objects.values_mut() {
+                match obj {
+                    KernelObject::Event { waiters, .. }
+                    | KernelObject::Semaphore { waiters, .. }
+                    | KernelObject::Thread { waiters, .. }
+                    | KernelObject::Mutex { waiters, .. } => {
+                        for w in waiters.iter_mut() {
+                            fx.apply(w);
+                        }
+                    }
+                    _ => {}
+                }
+            }
+            for list in self.cs_waiters.values_mut() {
+                for w in list.iter_mut() {
+                    fx.apply(w);
+                }
+            }
+            if let Some(ref mut ir) = self.interrupts.injected_ref {
+                fx.apply(ir);
+            }
        }
+        old_mask
+    }
+
+    /// Install the initial (main) guest thread on HW slot 0. Called once at
+    /// startup after the app allocates the main stack/PCR/TLS blocks.
+    pub fn install_initial_thread(
+        &mut self,
+        ctx: PpcContext,
+        stack_base: u32,
+        stack_size: u32,
+        pcr_base: u32,
+        tls_base: u32,
+        thread_handle: u32,
+        mem: &GuestMemory,
+    ) {
+        self.scheduler.install_initial_thread(
+            ctx,
+            stack_base,
+            stack_size,
+            pcr_base,
+            tls_base,
+            thread_handle,
+            &mut GuestMemoryPcr(mem),
+        );
    }

    pub fn export_name(&self, module: ModuleId, ordinal: u32) -> Option<&'static str> {
@@ -96,60 +377,261 @@ impl KernelState {
    }

    pub fn alloc_handle(&mut self) -> u32 {
-        let h = self.next_handle;
-        self.next_handle += 4;
-        h
+        // M2.4: lock-free fetch_add. Relaxed is sufficient — IDs are
+        // opaque tokens; no payload is sequenced against the counter.
+        self.next_handle
+            .fetch_add(4, std::sync::atomic::Ordering::Relaxed)
    }

    pub fn alloc_handle_for(&mut self, obj: KernelObject) -> u32 {
        let h = self.alloc_handle();
        self.objects.insert(h, obj);
+        // Each fresh handle starts with one logical reference (the creator).
+        // `NtDuplicateObject` bumps this; `NtClose` decrements; the object is
+        // only dropped when the count reaches zero. See `nt_close` for the
+        // aliased-handle rationale.
+        self.handle_refcount.insert(h, 1);
        h
    }

+    // ===== Handle audit hooks =====
+    //
+    // These are no-ops when `audit.enabled == false`, so call sites can
+    // invoke them unconditionally: the leading `if !self.audit.enabled
+    // { return; }` guard short-circuits before any entry is built.
+
+    /// Build a [`HandleAuditEntry`] for the *current* call-site: `cycle` is the
+    /// current HW thread's timebase and `tid` its thread id (falling back to
+    /// slot 0 / tid 0 when none is known); `lr` is supplied by the caller.
+    fn audit_entry(&self, lr: u32, source: &'static str, aux: u64) -> HandleAuditEntry {
+        let hw_id = self.scheduler.current_hw_id().unwrap_or(0);
+        let cycle = self.scheduler.ctx(hw_id).timebase;
+        let tid = self.scheduler.tid(hw_id).unwrap_or(0);
+        HandleAuditEntry { cycle, tid, lr, source, aux }
+    }
+
+    /// Record the creation of a fresh handle. `kind` is one of the stable
+    /// labels documented on [`crate::audit::HandleAuditTrail::kind`].
+    pub fn audit_create(&mut self, handle: u32, kind: &'static str, lr: u32, source: &'static str) {
+        if !self.audit.enabled {
+            return;
+        }
+        let entry = self.audit_entry(lr, source, 0);
+        self.audit.record_create(handle, kind, entry);
+    }
+
+    /// Record a Set/Pulse/Release/etc. call against a handle. `aux` is the
+    /// previous signal state (or per-export-specific data).
+    pub fn audit_signal(&mut self, handle: u32, lr: u32, source: &'static str, aux: u64) {
+        if !self.audit.enabled {
+            return;
+        }
+        let entry = self.audit_entry(lr, source, aux);
+        self.audit.record_signal(handle, entry);
+    }
+
+    /// Record a `Wait*` call against a handle. `aux` packs `(alertable as u64)
+    /// | (timeout_kind << 8)` etc. — schema is informal; the dump just prints
+    /// it.
+    pub fn audit_wait(&mut self, handle: u32, lr: u32, source: &'static str, aux: u64) {
+        if !self.audit.enabled {
+            return;
+        }
+        let entry = self.audit_entry(lr, source, aux);
+        self.audit.record_wait(handle, entry);
+    }
+
+    /// Record a wake event (called from `wake_eligible_waiters`). `aux`
+    /// is the status code stamped into the woken thread's `gpr[3]`.
+    pub fn audit_wake(&mut self, handle: u32, lr: u32, source: &'static str, aux: u64) {
+        if !self.audit.enabled {
+            return;
+        }
+        let entry = self.audit_entry(lr, source, aux);
+        self.audit.record_wake(handle, entry);
+    }
+
+    /// Read a TLS slot for the currently running HW thread.
    pub fn tls_get(&self, index: u32) -> u64 {
-        self.tls_slots.get(&index).copied().unwrap_or(0)
+        self.scheduler.tls_get(index)
    }

+    /// Write a TLS slot for the currently running HW thread.
    pub fn tls_set(&mut self, index: u32, value: u64) {
-        self.tls_slots.insert(index, value);
+        self.scheduler.tls_set(index, value);
    }

+    /// Allocate a new global TLS slot index. Grows every HW thread's
+    /// `tls_values` array to match.
    pub fn tls_alloc(&mut self) -> u32 {
-        let idx = self.next_tls_index;
-        self.next_tls_index += 1;
+        use std::sync::atomic::Ordering;
+        // M2.4: atomic bump. The Scheduler::tls_grow_to call still needs
+        // a coherent post-bump value, so we read the new size from the
+        // fetch_add return.
+        let idx = self.next_tls_index.fetch_add(1, Ordering::Relaxed);
+        let new_size = idx + 1;
+        self.scheduler.tls_grow_to(new_size as usize);
        idx
    }

    /// Allocate guest memory from the heap bump allocator.
    /// Returns the base address of the allocated region.
-    pub fn heap_alloc(&mut self, size: u32, mem: &mut GuestMemory) -> Option<u32> {
+    pub fn heap_alloc(&mut self, size: u32, mem: &GuestMemory) -> Option<u32> {
+        use std::sync::atomic::Ordering;
        let aligned_size = (size + 0xFFF) & !0xFFF; // Page-align
-        let base = self.heap_cursor;
-        if base.checked_add(aligned_size).is_none() || base + aligned_size > 0x6FFF_FFFF {
+        // M2.4: atomic bump, then verify post-bump invariants. If the
+        // bump pushed us past the heap-region ceiling, the cursor stays
+        // advanced, so subsequent allocations also fail — unlike the
+        // pre-M2 code, which left the cursor untouched on failure. We don't
+        // try to "undo" the bump because that opens a CAS-loop race for
+        // marginal benefit (a failing alloc near the limit is already game-over).
+        let base = self.heap_cursor.fetch_add(aligned_size, Ordering::Relaxed);
+        let new_top = base.checked_add(aligned_size)?;
+        if new_top > 0x6FFF_FFFF {
            return None;
        }
        let protect = xenia_memory::page_table::MemoryProtect::READ
            | xenia_memory::page_table::MemoryProtect::WRITE;
-        if mem.alloc(base, aligned_size, protect).is_err() {
-            return None;
-        }
-        self.heap_cursor += aligned_size;
+        mem.alloc(base, aligned_size, protect).ok()?;
        Some(base)
    }

    /// Allocate a kernel stack.
-    pub fn stack_alloc(&mut self, size: u32, mem: &mut GuestMemory) -> Option<u32> {
+    pub fn stack_alloc(&mut self, size: u32, mem: &GuestMemory) -> Option<u32> {
+        use std::sync::atomic::Ordering;
        let aligned_size = (size + 0xFFF) & !0xFFF;
-        let base = self.stack_cursor;
+        let base = self.stack_cursor.fetch_add(aligned_size, Ordering::Relaxed);
        let protect = xenia_memory::page_table::MemoryProtect::READ
            | xenia_memory::page_table::MemoryProtect::WRITE;
-        if mem.alloc(base, aligned_size, protect).is_err() {
-            return None;
-        }
-        self.stack_cursor += aligned_size;
+        mem.alloc(base, aligned_size, protect).ok()?;
        Some(base + aligned_size) // Return top of stack
    }
+
+    // ===== Timer subsystem =====
+
+    /// Idempotent arm — removes any prior entry for `handle`, then inserts
+    /// the new `(deadline, handle)` pair and re-sorts by deadline ascending.
+    /// The per-`Timer` object's `deadline` field must be set separately by
+    /// the caller (see `NtSetTimerEx` in exports.rs) — this helper only
+    /// manages the central pending-fires list so `fire_due_timers` has a
+    /// sorted head to peek.
+    pub fn arm_timer(&mut self, handle: u32, deadline: u64) {
+        self.pending_timer_fires.retain(|&(_, h)| h != handle);
+        self.pending_timer_fires.push((deadline, handle));
+        self.pending_timer_fires.sort_by_key(|&(d, _)| d);
+    }
+
+    /// Idempotent disarm — strip any entry for `handle`. Safe to call
+    /// regardless of prior state; `NtClose`, `NtCancelTimer`, and the
+    /// periodic-rearm guard all invoke this.
+    pub fn disarm_timer(&mut self, handle: u32) {
+        self.pending_timer_fires.retain(|&(_, h)| h != handle);
+    }
+
+    /// Peek the earliest pending timer deadline. Paired with
+    /// `Scheduler::earliest_wait_deadline` by the main loop's "advance to
+    /// next event" coordination — the earlier of the two drives
+    /// `advance_all_timebases_to`.
+    pub fn earliest_timer_deadline(&self) -> Option<u64> {
+        self.pending_timer_fires.first().map(|&(d, _)| d)
+    }
+
+    /// Fire every timer whose deadline is `<= now` (derived from slot 0's
+    /// timebase, matching `parse_timeout`'s "current thread" fallback).
+    /// For each fire: mark the timer `signaled=true`, clear its
+    /// `deadline`, rearm if periodic, then wake eligible waiters via
+    /// `exports::wake_eligible_waiters`. Returns `true` iff any timer
+    /// fired — the caller uses this to decide whether the scheduler round
+    /// needs a follow-up `advance_to_next_wake_if_due` step.
+    pub fn fire_due_timers(&mut self) -> bool {
+        let now = self.scheduler.ctx(0).timebase;
+        let mut fired = false;
+        loop {
+            let Some(&(deadline, handle)) = self.pending_timer_fires.first() else {
+                break;
+            };
+            if deadline > now {
+                break;
+            }
+            self.pending_timer_fires.remove(0);
+            // Mark signaled + capture period before any rearm so we don't
+            // double-borrow the object while calling wake_eligible_waiters.
+            let periodic_next =
+                if let Some(KernelObject::Timer {
+                    signaled,
+                    deadline: obj_deadline,
+                    period_ticks,
+                    ..
+                }) = self.objects.get_mut(&handle)
+                {
+                    *signaled = true;
+                    *obj_deadline = None;
+                    if *period_ticks > 0 {
+                        Some(now + *period_ticks)
+                    } else {
+                        None
+                    }
+                } else {
+                    // Closed (or non-timer) handle — a stale entry left by a
+                    // missed disarm or a race. Nothing is rearmed, but note the
+                    // wake call and `fired = true` below still run for it.
+                    None
+                };
+            if let Some(next) = periodic_next {
+                if let Some(KernelObject::Timer { deadline, .. }) =
+                    self.objects.get_mut(&handle)
+                {
+                    *deadline = Some(next);
+                }
+                self.arm_timer(handle, next);
+            }
+            crate::exports::wake_eligible_waiters(self, handle);
+            fired = true;
+        }
+        fired
+    }
+
+    /// Handle deadline-expiry cleanup for a thread whose wait timed out.
+    /// Called by the main loop right after `Scheduler::advance_to_next_wake`
+    /// returns a `Some((ref, reason))`. Stamps `STATUS_TIMEOUT` into the
+    /// woken thread's `gpr[3]` and scrubs its `ThreadRef` out of any
+    /// handle's waiter list so a later signal can't consume the
+    /// auto-reset slot into a stale waiter.
+    ///
+    /// `BlockReason::DelayUntil` is a pure sleep and expects
+    /// `STATUS_SUCCESS` — the default pre-populated value in
+    /// `ke_delay_execution_thread` — so we leave `gpr[3]` alone for it.
+    pub fn handle_timeout_wake(
+        &mut self,
+        r: ThreadRef,
+        reason: xenia_cpu::scheduler::BlockReason,
+    ) {
+        use xenia_cpu::scheduler::BlockReason;
+        const STATUS_TIMEOUT: u64 = 0x0000_0102;
+        match reason {
+            BlockReason::WaitAny { handles, .. } | BlockReason::WaitAll { handles, .. } => {
+                self.scheduler.ctx_mut_ref(r).gpr[3] = STATUS_TIMEOUT;
+                for h in handles {
+                    if let Some(obj) = self.objects.get_mut(&h) {
+                        if let Some(waiters) = obj.waiters_mut() {
+                            waiters.retain(|&w| w != r);
+                        }
+                    }
+                }
+            }
+            BlockReason::DelayUntil(_) => {
+                // Pure sleep → default STATUS_SUCCESS is correct; no handles
+                // to scrub.
+            }
+            BlockReason::CriticalSection(cs_ptr) => {
+                self.scheduler.ctx_mut_ref(r).gpr[3] = STATUS_TIMEOUT;
+                if let Some(list) = self.cs_waiters.get_mut(&cs_ptr) {
+                    list.retain(|&w| w != r);
+                }
+            }
+            BlockReason::Suspended => {}
+        }
+    }
 }

 impl Default for KernelState {
@@ -157,3 +639,89 @@ impl Default for KernelState {
        Self::new()
    }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use xenia_memory::GuestMemory;
+
+    /// Regression guard for the bump allocator: ten back-to-back
+    /// `heap_alloc(0x14)` requests must each land on a fresh, page-aligned
+    /// address. Kernel exports once passed 0 as `size`, so every request
+    /// returned the same address — 10 "allocations" aliasing 0x40105000 that
+    /// silently corrupted the guest's static-constructor state.
+    #[test]
+    fn heap_alloc_advances_for_nonzero_size() {
+        let mut mem = GuestMemory::new().expect("memory init");
+        let mut state = KernelState::new();
+        let mut previous = Vec::with_capacity(10);
+        for _ in 0..10 {
+            let allocation = state.heap_alloc(0x14, &mut mem);
+            let addr = allocation.expect("heap must have room for 0x14 bytes");
+            assert_eq!(addr & 0xFFF, 0, "heap returns page-aligned addresses");
+            let is_duplicate = previous.contains(&addr);
+            assert!(!is_duplicate, "heap returned duplicate address {addr:#x}");
+            previous.push(addr);
+        }
+    }
+
+    /// A zero-byte request has nothing to reserve, so `heap_alloc(0)` must
+    /// leave the heap cursor where it was. Exports used to reach this path by
+    /// reading the wrong argument register; the export boundary now guards it.
+    #[test]
+    fn heap_alloc_zero_is_noop_in_cursor() {
+        use std::sync::atomic::Ordering::Relaxed;
+        let mem = GuestMemory::new().expect("memory init");
+        let mut state = KernelState::new();
+        let cursor_before = state.heap_cursor.load(Relaxed);
+        let _ = state.heap_alloc(0, &mem);
+        let cursor_after = state.heap_cursor.load(Relaxed);
+        assert_eq!(cursor_before, cursor_after, "zero-size alloc must not advance heap cursor");
+    }
+
+    /// M2.4: concurrent handle allocations must produce distinct values.
+    /// Ten threads each draw 100 values from a stand-in counter that uses
+    /// the allocator's `fetch_add(4, Relaxed)` recipe (`alloc_handle` itself
+    /// is not called). The union must hold exactly 1000 distinct values,
+    /// each a multiple of 4 above `0x1000`, running from `0x1000` up to
+    /// `0x1000 + 4 * (1000 - 1)`.
+    #[test]
+    fn concurrent_alloc_handle_distinct() {
+        use std::collections::HashSet;
+        use std::sync::atomic::{AtomicU32, Ordering};
+
+        const THREADS: u32 = 10;
+        const PER_THREAD: u32 = 100;
+        const FIRST: u32 = 0x1000;
+        const STEP: u32 = 4;
+        const LAST: u32 = FIRST + STEP * (THREADS * PER_THREAD - 1);
+
+        // Use a free-standing AtomicU32 mirroring `next_handle`'s semantics;
+        // we can't easily share `&mut KernelState` across threads. The
+        // production code uses the same `fetch_add(4, Relaxed)` recipe.
+        let counter = AtomicU32::new(FIRST);
+
+        // Scoped threads borrow `counter` directly; collecting the join
+        // handles first starts every worker before any of them is joined.
+        let set: HashSet<u32> = std::thread::scope(|scope| {
+            let workers: Vec<_> = (0..THREADS)
+                .map(|_| {
+                    scope.spawn(|| {
+                        (0..PER_THREAD)
+                            .map(|_| counter.fetch_add(STEP, Ordering::Relaxed))
+                            .collect::<Vec<u32>>()
+                    })
+                })
+                .collect();
+            workers.into_iter().flat_map(|w| w.join().unwrap()).collect()
+        });
+
+        let total = (THREADS * PER_THREAD) as usize;
+        assert_eq!(set.len(), total, "expected 1000 distinct handles, got {}", set.len());
+        assert!(set.iter().all(|h| (h - FIRST) % STEP == 0));
+        // With 1000 distinct, 4-aligned values, pinning both ends proves the
+        // range is dense: nothing was skipped and nothing was handed out twice.
+        assert_eq!(set.iter().min(), Some(&FIRST));
+        assert_eq!(set.iter().max(), Some(&LAST));
+    }
+}