//! GPU thread skeleton — types only, no thread spawned yet.
//!
//! M1 step 1 of the concurrency rollout (see
//! `/home/fabi/.claude/plans/good-plese-implement-in-zesty-hickey.md`).
//! This module introduces the **shapes** that the eventual GPU host thread
//! will use, without changing any runtime behavior:
//!
//! - [`GpuCommand`] — the control-plane RPC enum sent CPU→GPU.
//! - [`GpuHandle`] — the CPU-side proxy: command sender + cloned MMIO
//!   atomics + an interrupt receiver. Eventually it'll also carry the
//!   worker thread's `JoinHandle`.
//! - [`GpuWorker`] — the GPU-side owned state (the `GpuSystem` itself plus
//!   the receive end of the command channel and the sender for interrupts).
//!   It does not yet have a `run()` method; that lands in step 4.
//!
//! The construction is done via [`GpuSystem::into_handle`], which splits a
//! freshly-built `GpuSystem` into `(GpuWorker, GpuHandle)`. The worker keeps
//! the actual GPU state plus `cmd_rx`/`int_tx`; the handle carries `cmd_tx`,
//! `int_rx`, and clones of the `Arc` MMIO mailboxes so the CPU
//! producer side can write WPTR / read RPTR without going through the
//! channel.
//!
//! Until step 4 wires the worker into a real thread, no caller invokes
//! `into_handle` on the live `KernelState.gpu` — the constructor exists for
//! the unit test below and for the synthetic-test path.

use std::sync::Arc;
use std::sync::atomic::{AtomicBool, Ordering};
use std::thread::{self, JoinHandle};
use std::time::{Duration, Instant};

use crossbeam_channel::{Receiver, Sender, bounded, unbounded};
use xenia_memory::GuestMemory;

use crate::gpu_system::{ExecOutcome, GpuMmio, GpuStats, GpuSystem, PendingInterrupt};

/// Reply channel for a [`GpuCommand::DrainFence`]. Single-shot
/// `bounded(1)` — the GPU sends `()` once it's drained the ring up to the
/// requested wptr, the CPU thread blocks on `recv` until then. M1 step 5
/// is the first user of this; step 1 only validates the type fits.
pub type DrainReply = crossbeam_channel::Sender<()>;

/// Control-plane messages posted by the CPU thread to the GPU thread.
///
/// Only events that need ordered delivery (and occasionally a reply) travel
/// over this channel. The data-plane signals (WPTR/RPTR/INT_STATUS) use the
/// atomic mailboxes in [`GpuMmio`] instead.
#[derive(Debug)]
pub enum GpuCommand {
    /// `VdInitializeRingBuffer(base, size_log2)`: hands the GPU the
    /// guest-physical base address and the log2 size of the primary ring.
    /// The worker applies it via [`GpuSystem::initialize_ring_buffer`].
    InitializeRing { base: u32, size_log2: u32 },

    /// `VdEnableRingBufferRPtrWriteBack(addr, block_size)`: supplies the
    /// guest-memory address into which the GPU mirrors its read offset.
    /// The worker applies it via `GpuSystem::enable_rptr_writeback`.
    EnableRptrWriteback { addr: u32, block_size_log2: u32 },

    /// Ask the worker to drain the ring and acknowledge on `reply_tx` once
    /// the drain completes or the worker's internal deadline expires.
    /// `target_wptr` is the write pointer (in dwords) the sender observed
    /// when posting the fence. `vd_swap` uses this to keep its synchronous
    /// "GPU has caught up at swap time" semantics.
    DrainFence { target_wptr: u32, reply_tx: DrainReply },

    /// Fire-and-forget notification that the guest committed a frame. The
    /// worker forwards the payload to [`GpuSystem::notify_xe_swap`]; no
    /// reply is sent.
    NotifyXeSwap {
        frontbuffer_phys: u32,
        width: u32,
        height: u32,
    },

    /// Tear-down signal: the worker's run loop sets the shared shutdown
    /// flag and exits, after which the host thread can be joined.
    Shutdown,
}

/// CPU-side proxy that the kernel and the interpreter loop hold. It owns:
///
/// - The send end of the GPU command channel (`cmd_tx`).
/// - The receive end of the GPU→CPU interrupt channel (`int_rx`).
Step 6 /// migrates `kernel.gpu.pending_interrupts` onto this channel. /// - Cloned `Arc` MMIO mailboxes so MMIO write callbacks (which /// already capture these `Arc`s) can keep working unchanged after the /// `GpuSystem` itself moves to the worker thread. /// /// `GpuHandle` is `Send + Sync` by virtue of every field being so already /// (`Sender`/`Receiver` from crossbeam are `Send + Sync` for `Send` payloads; /// `Arc` is `Send + Sync`). No explicit impl is needed. /// /// Use [`Self::send_cmd`] to post commands so the parker wake invariant is /// preserved (see M1.7): every channel send must be paired with a wake of /// the worker thread, otherwise the worker can sleep through new commands /// even if `cmd_rx.is_empty()` was momentarily false. #[derive(Debug, Clone)] pub struct GpuHandle { /// Control-plane sender. `clone()`-able so multiple call sites can post /// commands; crossbeam's `Sender` is `Send + Sync + Clone`. pub cmd_tx: Sender, /// Interrupt drain channel. Step 6 moves `pending_interrupts` onto this. /// Until then the `int_tx` half on the worker side is unused. pub int_rx: Receiver, /// Direct access to the MMIO mailbox arcs. The CP_RB_WPTR write that /// the guest does inside the MMIO region callback already lands in /// `mmio.cp_rb_wptr` (an `Arc`); cloning these here lets /// CPU-side code do the same atomic read/write that the inline path /// did, without channel hops. pub mmio: GpuMmio, /// Read-side snapshot of GPU stats / cache sizes. Refreshed by the /// worker each outer loop iteration. Read by [`GpuBackend::digest_snapshot`] /// and the HUD; cheap copy-out under a brief lock acquisition. pub digest: Arc>, /// Shared shutdown flag — set by the CPU side during teardown, read /// by the worker each loop iteration. Cloned here so callers without /// access to the worker side (e.g. drop guards) can still signal exit. pub shutdown: Arc, } impl GpuHandle { /// Post a command to the worker and wake it. 
Wraps the raw /// `cmd_tx.send` with the M1.7 parker discipline: set /// `wake_pending=true` (Release) and `unpark()` the worker thread. /// Without the wake, channel sends would not surface to a parked /// worker — `crossbeam_channel::Sender::send` doesn't unpark by /// itself. pub fn send_cmd( &self, cmd: GpuCommand, ) -> Result<(), crossbeam_channel::SendError> { let r = self.cmd_tx.send(cmd); if r.is_ok() { self.mmio.wake_pending.store(true, Ordering::Release); if let Ok(g) = self.mmio.worker_thread.lock() { if let Some(t) = g.as_ref() { t.unpark(); } } } r } } /// Periodically-refreshed read-side snapshot of GPU state. Updated by the /// worker thread on each outer loop iteration; consumed by the CPU side /// for the run-digest at end-of-run, the HUD, etc. Held in an /// `Arc>` shared between worker and handle. /// /// Snapshot is intentionally restricted to `u64` counters — they're /// cheap to copy and survive past the worker's lifetime (so the digest /// can be computed even after the worker has shut down). For the /// inline backend [`GpuBackend::digest_snapshot`] computes the same view /// directly from the live `GpuSystem` without any locking. #[derive(Debug, Clone, Default)] pub struct GpuDigestSnapshot { pub stats: GpuStats, pub shader_blobs_live: u64, pub texture_cache_entries: u64, pub texture_decodes: u64, } /// Reverse end of the command and interrupt channels. The GPU thread /// (`GpuWorker::run`) reads from `cmd_rx`, pushes onto `int_tx` (M1.6+), /// owns the actual `GpuSystem`, and refreshes the shared /// `GpuDigestSnapshot`. Built by [`GpuSystem::into_handle`] alongside its /// matching [`GpuHandle`]. /// /// (No `#[derive(Debug)]` because `GpuSystem` itself isn't `Debug`; we /// don't need it on the worker for any production purpose.) pub struct GpuWorker { /// The GPU subsystem itself. The worker thread is its exclusive /// owner once spawned. pub system: GpuSystem, /// Receive end of the control channel. 
pub cmd_rx: Receiver, /// Send end of the interrupt channel. M1.6 wires this in. pub int_tx: Sender, /// Shared digest snapshot, refreshed each outer loop iteration. pub digest: Arc>, /// Shutdown flag. Set by `shutdown_and_join_with_timeout`; the worker /// loop checks `Acquire` each iteration. pub shutdown: Arc, } impl GpuSystem { /// Split a freshly-built `GpuSystem` into a `(GpuWorker, GpuHandle)` /// pair. The handle keeps cloned `Arc` MMIO mailboxes plus /// the channel sender; the worker keeps the system itself plus the /// channel receiver and the interrupt sender. /// /// Channels are unbounded (`crossbeam_channel::unbounded`) because the /// CPU side never blocks on a control-plane send — guest-driven export /// rates are bounded by the interpreter throughput, and interrupts are /// already coalesced upstream by the kernel. /// /// Caller supplies a shared `shutdown: Arc` so the worker /// and the CPU side can coordinate teardown. For unit tests that don't /// care about lifecycle, [`Self::into_handle_test`] supplies a fresh /// flag. pub fn into_handle_with_shutdown( self, shutdown: Arc, ) -> (GpuWorker, GpuHandle) { let mmio = self.mmio.clone(); let (cmd_tx, cmd_rx) = unbounded::(); let (int_tx, int_rx) = unbounded::(); let digest = Arc::new(std::sync::Mutex::new(GpuDigestSnapshot::default())); let worker = GpuWorker { system: self, cmd_rx, int_tx, digest: digest.clone(), shutdown: shutdown.clone(), }; let handle = GpuHandle { cmd_tx, int_rx, mmio, digest, shutdown, }; (worker, handle) } /// Convenience for tests: allocate a fresh shutdown flag and split. pub fn into_handle(self) -> (GpuWorker, GpuHandle) { self.into_handle_with_shutdown(Arc::new(AtomicBool::new(false))) } } /// Polling interval for the no-op worker's shutdown check. A short sleep /// avoids burning a host core while still keeping shutdown latency under /// 10 ms, well below the 1 s defensive timeout in /// [`shutdown_and_join_with_timeout`]. 
const NOOP_WORKER_POLL: Duration = Duration::from_millis(2); /// Maximum time the worker waits in `park_timeout` before re-checking /// shutdown / commands / ring state. With `unpark()` on every guest WPTR /// write the typical wake latency is microseconds; this is the upper /// bound for the shutdown / quiescent-state polling cadence. 16 ms aligns /// with vsync cadence on a 60 Hz host and bounds shutdown latency at the /// same value. const WORKER_PARK_TIMEOUT: Duration = Duration::from_millis(16); /// Cap on packets executed per outer-loop iteration before the worker /// re-checks shutdown / commands / digest publish. Mirrors the inline-mode /// `gpu_runs = max(1, min(64, executed_this_round / 6))` pacer ceiling. const WORKER_PACKETS_PER_ITER: u32 = 64; /// Backend for the kernel's `gpu` field. The two variants share a thin /// dispatch layer (forwarding methods on this enum) so call sites in /// `xenia-kernel` exports stay terse. /// /// - [`GpuBackend::Inline`] keeps the legacy synchronous path: the CPU /// thread calls `kernel.gpu.execute_one(mem)` directly each scheduler /// round. Selected by `--gpu-inline` (rollback flag) or implied by /// `--ui` until the UI worker is migrated. /// - [`GpuBackend::Threaded`] (**default at M1.9**) hands `GpuSystem` /// ownership to a dedicated host thread; the CPU thread holds a /// [`GpuHandle`] proxy and talks to the worker via channels + the /// shared MMIO atomics. /// /// `GpuBackend` itself is `Send` (the inline variant carries `GpuSystem`, /// which is Send-able as long as nothing inside it is `!Send` — it isn't); /// the threaded variant carries a `Send + Sync` handle. pub enum GpuBackend { Inline(GpuSystem), Threaded(GpuHandle), } impl GpuBackend { /// Read the MMIO mailbox struct (cheap — `GpuMmio` is `Clone` cloning /// only `Arc`s; we hand back a borrow). 
The result is the /// same `Arc` set on either backend, so MMIO region /// callbacks installed via [`crate::build_mmio_region`] route guest /// writes to the same atomics the worker reads. pub fn mmio(&self) -> &GpuMmio { match self { GpuBackend::Inline(s) => &s.mmio, GpuBackend::Threaded(h) => &h.mmio, } } /// Convenience: borrow the inline `GpuSystem` for code paths that /// haven't been generalized to the `Threaded` variant yet (vd_swap's /// drain, the various `state.gpu.X` reads in M1.5+ work). Returns /// `None` in threaded mode; the caller's responsibility is to handle /// that gracefully — typically by treating the operation as a no-op /// or routing it through a command (see `vd_swap` notes). pub fn as_inline(&self) -> Option<&GpuSystem> { match self { GpuBackend::Inline(s) => Some(s), GpuBackend::Threaded(_) => None, } } /// Mutable counterpart of [`Self::as_inline`]. pub fn as_inline_mut(&mut self) -> Option<&mut GpuSystem> { match self { GpuBackend::Inline(s) => Some(s), GpuBackend::Threaded(_) => None, } } /// Forward `VdInitializeRingBuffer`. Inline mode applies it directly; /// threaded mode posts an `InitializeRing` command onto the worker /// channel. pub fn initialize_ring_buffer(&mut self, base: u32, size_log2: u32) { match self { GpuBackend::Inline(s) => s.initialize_ring_buffer(base, size_log2), GpuBackend::Threaded(h) => { let _ = h.send_cmd(GpuCommand::InitializeRing { base, size_log2 }); } } } /// Forward `VdEnableRingBufferRPtrWriteBack`. pub fn enable_rptr_writeback(&mut self, addr: u32, block_log2: u32) { match self { GpuBackend::Inline(s) => s.enable_rptr_writeback(addr, block_log2), GpuBackend::Threaded(h) => { let _ = h.send_cmd(GpuCommand::EnableRptrWriteback { addr, block_size_log2: block_log2, }); } } } /// Bump `CP_RB_WPTR` by `dwords`. Both backends route the bump through /// the shared MMIO atomic mailbox (`Acquire`-load → wrap-add → /// `Release`-store). 
Inline mode then picks up the new value on its /// next `sync_with_mmio`; threaded mode's worker observes the same /// atomic and folds it into its ring view. /// /// Note: the value stored is unmodulo'd. The reading side's /// `sync_with_mmio` does the `% ring.size_dwords` step before /// updating the local ring view, which is the only place a `size_dwords` /// reference exists. Practical wptr drift before u32 wraps is /// `2^32 / 64 ≈ 67M` VdSwap-style bumps — safely above any plausible /// single-run total. pub fn extend_write_ptr_by(&mut self, dwords: u32) { let mmio = self.mmio(); // Relaxed is sufficient for the load — we re-store with Release // and the readers (worker `sync_with_mmio` / inline next round) // do their own Acquire. The load here is just a value source. let cur = mmio.cp_rb_wptr.load(Ordering::Relaxed); mmio.cp_rb_wptr .store(cur.wrapping_add(dwords), Ordering::Release); } /// Read the current ring write pointer from the shared MMIO atomic. /// Acquire-load so any prior ring-memory writes the caller did before /// asking are not reordered past this read by the compiler. Used by /// `vd_swap` to compute the ring offset for direct PM4 injection. pub fn mmio_cp_rb_wptr_load(&self) -> u32 { self.mmio().cp_rb_wptr.load(Ordering::Acquire) } /// Drain any PM4 packets currently exposed by the ring (i.e., up to /// the current `CP_RB_WPTR`). Inline mode runs the synchronous /// drain. Threaded mode posts a [`GpuCommand::DrainFence`] and blocks /// on the reply channel up to a 1 s defensive timeout — the worker /// has its own ~900 ms internal deadline so the reply is bounded. /// /// The CPU thread blocking here is sound: the only thread that /// satisfies the reply is the GPU worker, which never tries to /// acquire any CPU-side primitive (it talks back exclusively through /// channels and atomics). The lock-ordering argument from the M1.4 /// plan holds: T_cpu → cmd_tx → T_gpu → reply_tx → T_cpu, no cycle. 
pub fn drain_to_current_wptr(&mut self, mem: &dyn xenia_memory::MemoryAccess) -> u32 { match self { GpuBackend::Inline(s) => { s.sync_with_mmio(); // GPUBUG-DRAIN-001: drain until target WPTR is reached (or // 900 ms deadline), mirroring canary's `WorkerThreadMain` and // the threaded `DrainFence` semantics. The previous fixed // 4096-packet budget hit the cap under `--parallel`, where // CPU runs many more PPC blocks per kernel-callback boundary // and the ring backs up past 4096 packets before vd_swap // fires; the safety-net fallback warning fired twice for // each Sylpheed run. let target = s.mmio.cp_rb_wptr.load(Ordering::Acquire); s.drain_until_wptr(mem, target, Duration::from_millis(900)) } GpuBackend::Threaded(h) => { let target_wptr = h.mmio.cp_rb_wptr.load(Ordering::Acquire); let (reply_tx, reply_rx) = bounded::<()>(1); if h .send_cmd(GpuCommand::DrainFence { target_wptr, reply_tx, }) .is_err() { // Worker disconnected; treat as drained. return 0; } match reply_rx.recv_timeout(Duration::from_secs(1)) { Ok(()) => { // We don't currently track the exact packet count // drained on the threaded path — the worker drains // by `is_ready` predicate. Return 1 as a "drain // happened" sentinel; the inline mode's exact // count is a debug-trace nicety. 1 } Err(_) => { tracing::warn!( target: "gpu", target_wptr, "vd_swap drain fence timed out at 1s; continuing teardown", ); 0 } } } } } /// Bump `swaps_seen` + record `last_swap` + push a swap interrupt. /// Inline calls directly. Threaded sends `NotifyXeSwap` over the /// command channel — fire-and-forget; the worker handles it on its /// next loop iteration. 
pub fn notify_xe_swap(&mut self, frontbuffer_phys: u32, width: u32, height: u32) {
        match self {
            GpuBackend::Inline(s) => s.notify_xe_swap(frontbuffer_phys, width, height),
            GpuBackend::Threaded(h) => {
                // Fire-and-forget: a failed send means the worker is gone.
                let _ = h.send_cmd(GpuCommand::NotifyXeSwap {
                    frontbuffer_phys,
                    width,
                    height,
                });
            }
        }
    }

    /// Forward [`GpuSystem::has_pending_interrupts`] under inline mode;
    /// under threaded mode peek the `int_rx` channel.
    pub fn has_pending_interrupts(&self) -> bool {
        match self {
            GpuBackend::Inline(s) => s.has_pending_interrupts(),
            GpuBackend::Threaded(h) => !h.int_rx.is_empty(),
        }
    }

    /// Drain pending interrupts. Inline path forwards to
    /// [`GpuSystem::take_pending_interrupts`]; threaded path drains the
    /// interrupt channel without blocking and returns whatever was queued
    /// (possibly nothing).
    pub fn take_pending_interrupts(&mut self) -> Vec<PendingInterrupt> {
        match self {
            GpuBackend::Inline(s) => s.take_pending_interrupts(),
            GpuBackend::Threaded(h) => h.int_rx.try_iter().collect(),
        }
    }

    /// End-of-run snapshot used by the run-digest. Inline mode reads
    /// directly; threaded mode pulls the latest published mirror under a
    /// brief lock. Returns owned data — safe to use after the worker has
    /// shut down.
    pub fn digest_snapshot(&self) -> GpuDigestSnapshot {
        match self {
            GpuBackend::Inline(s) => GpuDigestSnapshot {
                stats: s.stats.clone(),
                shader_blobs_live: s.shader_blobs.len() as u64,
                texture_cache_entries: s.texture_cache.len() as u64,
                texture_decodes: s.texture_cache.decodes_total,
            },
            // The mirror is plain counters, so a poisoned lock still holds
            // usable data; recover the guard instead of panicking while the
            // run digest is being produced.
            GpuBackend::Threaded(h) => h
                .digest
                .lock()
                .unwrap_or_else(|poisoned| poisoned.into_inner())
                .clone(),
        }
    }
}

impl GpuWorker {
    /// Copy the live counters out of `self.system` into the shared digest
    /// mirror. A poisoned lock is skipped (the mirror keeps its last value).
    fn publish_digest(&self) {
        let snap = GpuDigestSnapshot {
            stats: self.system.stats.clone(),
            shader_blobs_live: self.system.shader_blobs.len() as u64,
            texture_cache_entries: self.system.texture_cache.len() as u64,
            texture_decodes: self.system.texture_cache.decodes_total,
        };
        if let Ok(mut g) = self.digest.lock() {
            *g = snap;
        }
    }

    /// Run loop body for the GPU host thread.
    ///
    /// Each iteration:
    ///   1. Check the `Acquire`-loaded shutdown flag — exit if set.
    ///   2. Drain any pending control-plane commands non-blockingly.
    ///   3. Sample MMIO (refreshes WPTR / RPTR mailboxes into the live ring).
    ///   4. Execute up to [`WORKER_PACKETS_PER_ITER`] PM4 packets while the
    ///      ring is non-empty / not blocked.
    ///   5. Forward pending interrupts onto `int_tx`, then refresh the
    ///      shared digest snapshot under a brief lock if any work was done.
    ///   6. If no work was done this iteration, park for up to
    ///      [`WORKER_PARK_TIMEOUT`] unless a wake was already signalled.
    ///
    /// On exit — via the shutdown flag or a [`GpuCommand::Shutdown`] — the
    /// worker clears its registered wake target in `mmio.worker_thread`.
    ///
    /// `memory` is shared with the CPU thread. The worker only ever reads
    /// through `&*memory`, which deref-coerces to `&GuestMemory` and then to
    /// `&dyn MemoryAccess`. All mutations on `MemoryAccess` are `&self`, so
    /// concurrent CPU and GPU access relies on the trait's contract (callers
    /// must not concurrently read/write the same byte range from different
    /// threads).
    pub fn run(mut self, memory: Arc<GuestMemory>) {
        // M1.7 parker registration: publish our `Thread` handle so the
        // MMIO `CP_RB_WPTR` write callback can `unpark()` us. Only one
        // worker thread per `GpuMmio`; we replace whatever was there.
        if let Ok(mut g) = self.system.mmio.worker_thread.lock() {
            *g = Some(thread::current());
        }

        'outer: loop {
            // (1) shutdown
            if self.shutdown.load(Ordering::Acquire) {
                break;
            }

            // (2) drain commands
            let mut did_work = false;
            while let Ok(cmd) = self.cmd_rx.try_recv() {
                did_work = true;
                match cmd {
                    GpuCommand::InitializeRing { base, size_log2 } => {
                        self.system.initialize_ring_buffer(base, size_log2);
                    }
                    GpuCommand::EnableRptrWriteback {
                        addr,
                        block_size_log2,
                    } => {
                        self.system.enable_rptr_writeback(addr, block_size_log2);
                    }
                    GpuCommand::DrainFence {
                        target_wptr: _,
                        reply_tx,
                    } => {
                        // Drain the ring up to whatever WPTR the MMIO
                        // atomic currently exposes (the CPU side bumped
                        // it before sending the fence); the fence's own
                        // `target_wptr` is not consulted. Bounded by an
                        // internal 900 ms deadline — 100 ms tighter than
                        // the CPU's `recv_timeout(1s)`.
                        //
                        // `sync_with_mmio` between packets folds late
                        // guest WPTR writes into the local ring view. The
                        // loop exits when `is_ready` turns false, the
                        // deadline passes, or a packet returns
                        // `Idle`/`Blocked`.
                        self.system.sync_with_mmio();
                        let deadline = Instant::now() + Duration::from_millis(900);
                        while self.system.is_ready(&*memory) {
                            if Instant::now() >= deadline {
                                break;
                            }
                            match self.system.execute_one(&*memory) {
                                ExecOutcome::Stepped { .. } => {
                                    self.system.sync_with_mmio();
                                }
                                ExecOutcome::Idle | ExecOutcome::Blocked => break,
                            }
                        }
                        // GPUBUG-DRAIN-001: publish the digest mirror BEFORE
                        // replying so the CPU's post-drain `digest_snapshot`
                        // observes the `swaps_seen` bump from any
                        // PM4_XE_SWAP we just consumed; otherwise the
                        // outer-loop publish races the CPU's check.
                        self.publish_digest();
                        let _ = reply_tx.send(());
                    }
                    GpuCommand::NotifyXeSwap {
                        frontbuffer_phys,
                        width,
                        height,
                    } => {
                        self.system
                            .notify_xe_swap(frontbuffer_phys, width, height);
                    }
                    GpuCommand::Shutdown => {
                        self.shutdown.store(true, Ordering::Release);
                        // Leave through the common exit path so the wake
                        // target below is cleared on this route as well.
                        break 'outer;
                    }
                }
            }

            // (3,4) drive the GPU
            self.system.sync_with_mmio();
            let mut budget = WORKER_PACKETS_PER_ITER;
            while budget > 0 && self.system.is_ready(&*memory) {
                match self.system.execute_one(&*memory) {
                    ExecOutcome::Stepped { .. } => {
                        did_work = true;
                        self.system.sync_with_mmio();
                    }
                    ExecOutcome::Idle | ExecOutcome::Blocked => break,
                }
                budget -= 1;
            }

            // (5a) M1.6: forward interrupts collected in
            //      `system.pending_interrupts` onto `int_tx`. The send is
            //      non-blocking (unbounded channel) and fails only if the
            //      receiver was dropped, i.e. the CPU side is gone.
            for pi in self.system.take_pending_interrupts() {
                if self.int_tx.send(pi).is_err() {
                    break;
                }
                did_work = true;
            }

            if did_work {
                // (5b) publish digest snapshot
                self.publish_digest();
            } else {
                // (6) M1.7 parker, guarding against the lost-wakeup race:
                //
                //   1. Swap `wake_pending` to false. If it was true, a
                //      producer signalled us since the last work — skip
                //      the park and loop again.
                //   2. Re-check the side conditions (command channel,
                //      shutdown), which may have changed after step 1.
                //   3. `park_timeout`. An `unpark()` that lands between
                //      the re-check and the park is remembered by std's
                //      token, so the park returns immediately. Otherwise
                //      we wake after `WORKER_PARK_TIMEOUT` and re-evaluate.
                let was_pending = self.system.mmio.wake_pending.swap(false, Ordering::AcqRel);
                if !was_pending
                    && self.cmd_rx.is_empty()
                    && !self.shutdown.load(Ordering::Acquire)
                {
                    thread::park_timeout(WORKER_PARK_TIMEOUT);
                }
            }
        }

        // Clear the wake target on exit so post-shutdown MMIO writes
        // don't try to unpark a dead thread (sound — `Thread::unpark`
        // on an exited thread is a no-op — but it keeps the invariant
        // tidy).
        if let Ok(mut g) = self.system.mmio.worker_thread.lock() {
            *g = None;
        }
    }
}

/// Spawn the real GPU worker thread (named `xenia-gpu`) running
/// [`GpuWorker::run`]. Returns its `JoinHandle`; the matching `GpuHandle`
/// (the caller's existing one from
/// [`GpuSystem::into_handle_with_shutdown`]) is what the CPU thread keeps.
///
/// # Panics
///
/// Panics if the OS refuses to spawn the thread.
pub fn spawn_gpu_worker(
    worker: GpuWorker,
    memory: Arc<GuestMemory>,
) -> JoinHandle<()> {
    thread::Builder::new()
        .name("xenia-gpu".to_string())
        .spawn(move || worker.run(memory))
        .expect("spawn xenia-gpu worker thread")
}

/// M1 step 3 — spawn a placeholder GPU worker thread that does nothing
/// except poll `shutdown` on a short cadence and exit cleanly when it sees
/// `true`. Verifies thread lifecycle, signal propagation, and clean
/// teardown. **Not used in production paths**: in step 4 the worker grows a
/// real `GpuWorker::run` body that owns a `GpuSystem`. Until then this
/// function is the only spawn site, gated behind `--gpu-thread` in the CLI.
///
/// The function returns the `JoinHandle` so the caller can block on
/// teardown via [`shutdown_and_join_with_timeout`]. Ownership of the
/// shutdown flag is shared: the caller keeps a clone for signaling,
/// the thread takes another clone for polling.
///
/// Rationale for `Acquire`/`Release` ordering on the bool: the spawning
/// thread may set up shared state *before* the worker reads it once we
/// expand the worker in step 4 — the `Release` store on shutdown then
/// pairs with the `Acquire` load here so any prior writes the spawner did
/// (e.g. populating channels with farewell messages) are visible to the
/// worker.
For the no-op stage there's no shared state, but using the /// stricter ordering now means step 4 inherits a correctly-fenced /// shutdown protocol with no further changes. pub fn spawn_noop_worker(shutdown: Arc) -> JoinHandle<()> { thread::Builder::new() .name("xenia-gpu-noop".to_string()) .spawn(move || { while !shutdown.load(Ordering::Acquire) { thread::sleep(NOOP_WORKER_POLL); } }) .expect("spawn xenia-gpu-noop worker thread") } /// Signal `shutdown` to the worker and join its thread, with a defensive /// timeout so a misbehaving worker can't wedge the entire process. Logs at /// `error!` if the timeout fires (which would indicate either the worker /// loop ignoring `shutdown` or being parked on a primitive that wasn't /// woken — both are bugs the user should hear about). /// /// Returns `Ok(())` on a clean join inside the timeout, `Err(())` on /// timeout. The caller decides whether to continue process teardown anyway /// (typically yes — the worker's only state is its own stack). pub fn shutdown_and_join_with_timeout( shutdown: &Arc, handle: JoinHandle<()>, timeout: Duration, ) -> Result<(), ()> { shutdown.store(true, Ordering::Release); // No `JoinHandle::join_timeout` in std; emulate via a side-channel // signal from the polling. We use a sentinel-clone of the JoinHandle // approach: spawn a watchdog that, after the timeout, sets a // "give up" flag — but `join` is a blocking call we can't preempt. // Instead use a parking helper: spawn a hop-thread to call join, and // wait on it via a `crossbeam_channel::after` select. let (tx, rx) = unbounded::<()>(); let join_thread = thread::Builder::new() .name("xenia-gpu-joiner".to_string()) .spawn(move || { let _ = handle.join(); // `_ = tx.send(())` — receiver may already be dropped if // we timed out, in which case Err is fine. let _ = tx.send(()); }) .expect("spawn xenia-gpu-joiner thread"); crossbeam_channel::select! { recv(rx) -> _ => { // Joiner finished within the budget. 
Reap it (no work — the // thread already returned). `join` here is fast. let _ = join_thread.join(); Ok(()) } recv(crossbeam_channel::after(timeout)) -> _ => { tracing::error!( target: "gpu", ?timeout, "GPU worker did not exit in time; leaking thread to avoid wedging shutdown", ); // Detach the joiner; it leaks but at least we proceed. Will // get cleaned up when the process exits. std::mem::drop(join_thread); Err(()) } } } #[cfg(test)] mod tests { use super::*; /// Roundtrip an `InitializeRing` command: CPU side sends, worker side /// receives, payload bytes match. This validates the channel plumbing /// without touching any GPU semantics. #[test] fn initialize_ring_roundtrips_through_handle() { let (worker, handle) = GpuSystem::new().into_handle(); let GpuWorker { cmd_rx, .. } = worker; handle .cmd_tx .send(GpuCommand::InitializeRing { base: 0x1000_0000, size_log2: 18, }) .expect("cmd_tx send"); match cmd_rx.recv().expect("cmd_rx recv") { GpuCommand::InitializeRing { base, size_log2 } => { assert_eq!(base, 0x1000_0000); assert_eq!(size_log2, 18); } other => panic!("unexpected cmd: {other:?}"), } } /// MMIO atomics on the handle and the worker's GpuSystem must be the /// same Arc (clone). A guest write through the MMIO region callback /// stores into `handle.mmio.cp_rb_wptr`; the worker observes the same /// value via `worker.system.mmio.cp_rb_wptr`. If we accidentally /// allocated a fresh atomic for either side, the worker would never see /// guest writes. #[test] fn mmio_arcs_are_shared_between_handle_and_worker() { use std::sync::atomic::Ordering; let (worker, handle) = GpuSystem::new().into_handle(); handle.mmio.cp_rb_wptr.store(0xC0FFEE, Ordering::Release); assert_eq!( worker.system.mmio.cp_rb_wptr.load(Ordering::Acquire), 0xC0FFEE, "worker side did not observe handle-side atomic store", ); } /// `GpuHandle` must be `Send + Sync`. Compile-time assertion via type /// constraint — won't link if the bound is violated. 
#[test]
    fn handle_is_send_sync() {
        // The generic bound is the assertion; the call only has to
        // type-check.
        fn assert_send_sync<T: Send + Sync>() {}
        assert_send_sync::<GpuHandle>();
    }

    /// `GpuWorker` must be `Send` so we can move it onto a thread.
    /// (`Sync` is not required — only one thread ever owns the worker.)
    #[test]
    fn worker_is_send() {
        fn assert_send<T: Send>() {}
        assert_send::<GpuWorker>();
    }

    /// Dropping the handle (CPU side) must not block recv on the worker
    /// side; instead it must surface as `Disconnected`. This is the
    /// standard crossbeam guarantee but we pin it down with an explicit
    /// test so a future refactor (e.g. switching channel backends) can't
    /// silently change semantics.
    #[test]
    fn dropping_handle_disconnects_command_channel() {
        let (worker, handle) = GpuSystem::new().into_handle();
        drop(handle);
        let result = worker.cmd_rx.recv();
        assert!(
            matches!(result, Err(crossbeam_channel::RecvError)),
            "expected Disconnected after handle drop, got {result:?}",
        );
    }

    /// Spawn the no-op worker, signal shutdown immediately, and join. Must
    /// complete within a generous timeout (the polling cadence is 2 ms,
    /// so 250 ms gives plenty of headroom even on a loaded test runner).
    #[test]
    fn noop_worker_shuts_down_cleanly() {
        let shutdown = Arc::new(AtomicBool::new(false));
        let handle = spawn_noop_worker(shutdown.clone());
        // Brief gap so the worker enters its loop at least once before we
        // signal — exercises the in-loop exit path rather than the never-
        // entered-loop case.
        thread::sleep(Duration::from_millis(5));
        let outcome =
            shutdown_and_join_with_timeout(&shutdown, handle, Duration::from_millis(250));
        assert_eq!(outcome, Ok(()), "no-op worker did not join in budget");
    }

    /// A worker that never exits must surface as a timeout, not a hang.
    /// Validates the `crossbeam_channel::after` budget. We construct a
    /// pseudo-worker that ignores `shutdown` for the entire timeout
    /// window, then exits — the test budget is short enough to force the
    /// timeout path.
#[test] fn shutdown_join_timeouts_on_misbehaving_worker() { let shutdown = Arc::new(AtomicBool::new(false)); let handle = thread::Builder::new() .name("test-misbehaving-worker".to_string()) .spawn(|| { // Sleep longer than the test's join budget. The test // proves the join helper returns `Err(())` rather than // blocking the test process indefinitely. thread::sleep(Duration::from_millis(500)); }) .expect("spawn misbehaving worker"); let outcome = shutdown_and_join_with_timeout(&shutdown, handle, Duration::from_millis(50)); assert_eq!(outcome, Err(()), "expected timeout signal"); } /// M1.8 — `write_u32_fence` / `read_u32_fence` ordering test. A /// producer thread writes a "data" value, then a "fence" value via /// `write_u32_fence`. A consumer thread spin-reads the fence via /// `read_u32_fence` and, on observing the producer's update, reads /// the data via plain `read_u32`. The data must always equal the /// producer's pre-fence write — never an older value or a torn read. /// /// On x86_64 (TSO) this would pass even without the fences; on /// weaker architectures it would fail without them. We pin down the /// invariant here so future ports / refactors can't silently weaken /// it. Uses a small synthetic memory implementing `MemoryAccess`. #[test] fn write_u32_fence_publishes_prior_writes() { use std::sync::atomic::{AtomicBool, AtomicU32}; use std::time::Instant; use xenia_memory::MemoryAccess; // The test's MemoryAccess impl uses `AtomicU32` slots so the // multi-byte u32 reads/writes are torn-read-free. The fence // helper layers Release/Acquire on top — without atomic // storage, byte-by-byte reads on a writer-racing buffer would // see torn values regardless of the fence. 
const SLOT_COUNT: usize = 16; struct ScopedMem([AtomicU32; SLOT_COUNT]); impl ScopedMem { fn slot(&self, addr: u32) -> &AtomicU32 { &self.0[(addr / 4) as usize] } } impl MemoryAccess for ScopedMem { fn read_u8(&self, addr: u32) -> u8 { let v = self.slot(addr & !3).load(Ordering::Relaxed); let shift = (addr & 3) * 8; (v >> shift) as u8 } fn read_u16(&self, addr: u32) -> u16 { u16::from_le_bytes([self.read_u8(addr), self.read_u8(addr + 1)]) } fn read_u32(&self, addr: u32) -> u32 { self.slot(addr).load(Ordering::Relaxed) } fn read_u64(&self, addr: u32) -> u64 { let lo = self.read_u32(addr) as u64; let hi = self.read_u32(addr + 4) as u64; lo | (hi << 32) } fn write_u8(&self, _addr: u32, _val: u8) { unimplemented!("test fixture only writes u32") } fn write_u16(&self, _addr: u32, _val: u16) { unimplemented!("test fixture only writes u32") } fn write_u32(&self, addr: u32, val: u32) { self.slot(addr).store(val, Ordering::Relaxed); } fn write_u64(&self, addr: u32, val: u64) { self.write_u32(addr, val as u32); self.write_u32(addr + 4, (val >> 32) as u32); } fn translate(&self, _addr: u32) -> Option<*const u8> { None } fn translate_mut(&self, _addr: u32) -> Option<*mut u8> { None } } let mem: Arc = Arc::new(ScopedMem(std::array::from_fn(|_| { AtomicU32::new(0) }))); // Initialize fence and data slots to zero. 
mem.write_u32(0, 0); // data mem.write_u32(16, 0); // fence let stop = Arc::new(AtomicBool::new(false)); let mem_p = mem.clone(); let stop_p = stop.clone(); let producer = thread::Builder::new() .name("fence-producer".into()) .spawn(move || { for i in 1u32..=10_000 { if stop_p.load(Ordering::Relaxed) { break; } mem_p.write_u32(0, i); // data mem_p.write_u32_fence(16, i); // fence (Release) thread::yield_now(); } }) .expect("spawn producer"); let mem_c = mem.clone(); let consumer = thread::Builder::new() .name("fence-consumer".into()) .spawn(move || { let deadline = Instant::now() + Duration::from_millis(500); let mut last_seen = 0u32; let mut iters = 0u32; while Instant::now() < deadline { let f = mem_c.read_u32_fence(16); // Acquire if f != last_seen { let d = mem_c.read_u32(0); // The data we read after the fence must be at // least as new as the fence value (producer // wrote `data = i; fence(i)` in that order). assert!( d >= f, "fence ordering violated: data={d} fence={f}" ); last_seen = f; iters += 1; } } iters }) .expect("spawn consumer"); let observed = consumer.join().expect("consumer join"); stop.store(true, Ordering::Relaxed); let _ = producer.join(); assert!( observed > 0, "consumer never observed a fence transition (race scheduler too unfair?)", ); } /// Spawning two no-op workers in parallel and joining both must /// succeed without interference — proves the joiner side-thread /// pattern doesn't accidentally serialize teardown. #[test] fn two_concurrent_noop_workers_both_shut_down() { let shutdown_a = Arc::new(AtomicBool::new(false)); let handle_a = spawn_noop_worker(shutdown_a.clone()); let shutdown_b = Arc::new(AtomicBool::new(false)); let handle_b = spawn_noop_worker(shutdown_b.clone()); let r_a = shutdown_and_join_with_timeout( &shutdown_a, handle_a, Duration::from_millis(250), ); let r_b = shutdown_and_join_with_timeout( &shutdown_b, handle_b, Duration::from_millis(250), ); assert_eq!(r_a, Ok(())); assert_eq!(r_b, Ok(())); } }