//! Xenos GPU system: register file + primary ring buffer + PM4 executor. //! //! Design notes mirror the approved plan's P2 slice: //! //! - Runs on the same host thread as the CPU interpreter. Sequential access //! to `GuestMemory` — no locks, no sharing. //! - One packet per [`GpuSystem::execute_one`] call. The scheduler calls this //! once per round when `is_ready` returns true. When the packet is a //! `WAIT_REG_MEM` whose condition isn't yet satisfied, we transition to //! [`GpuState::Blocked`] and the scheduler will re-poll us. //! - Non-draw opcodes execute for real (register/memory writes, event //! writebacks). Draws (`DRAW_INDX*`, `XE_SWAP`) are classified but not //! rendered yet; they surface state (via spans + the swap hook) for later //! phases to consume. //! //! Opcode reference: `xenia-canary/src/xenia/gpu/pm4_command_processor_implement.h`. use std::collections::HashMap; use std::sync::Arc; use std::sync::atomic::{AtomicBool, AtomicU32, Ordering}; use std::time::{Duration, Instant}; use xenia_memory::MemoryAccess; use crate::draw_state::{self, DrawState}; use crate::pm4::{self, PacketKind}; use crate::primitive::{self, ProcessedPrimitive}; use crate::register_file::RegisterFile; use crate::ring_view::RingBufferView; /// Cached Xenos microcode blob, produced by `PM4_IM_LOAD*` packets. #[derive(Debug, Clone)] pub struct ShaderBlob { pub shader_type: u8, // 0 = vertex, 1 = pixel pub dwords: Vec, } /// P8 — upper bound on the shader-blob cache (`GpuSystem.shader_blobs`). /// Canary uses a similar FIFO ceiling; our number is deliberately generous /// because blobs are small (a few KiB each at most) and misses force a /// re-upload from the guest the next time `IM_LOAD*` fires. 256 is enough /// for every shipping game's peak working set, per canary's traces. pub const SHADER_BLOB_CAP: usize = 256; /// PM4 `WAIT_REG_MEM` condition. Bits 0..=2 of the wait_info dword encode the /// comparison (per `xenia-canary/src/xenia/gpu/pm4_command_processor_implement.h:685-696`). #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum WaitCmp { /// value < ref Less, /// value <= ref LessEq, /// value == ref Equal, /// value != ref NotEqual, /// value >= ref GreaterEq, /// value > ref Greater, /// Always — caller wants to sleep regardless. Always, } impl WaitCmp { /// Interpret the lower 3 bits of `wait_info` per canary's `MatchValueAndRef`. pub fn from_wait_info(wait_info: u32) -> Self { match wait_info & 0x7 { 0 => WaitCmp::Less, 1 => WaitCmp::LessEq, 2 => WaitCmp::Equal, 3 => WaitCmp::NotEqual, 4 => WaitCmp::GreaterEq, 5 => WaitCmp::Greater, _ => WaitCmp::Always, } } pub fn evaluate(self, value: u32, reference: u32) -> bool { match self { WaitCmp::Less => value < reference, WaitCmp::LessEq => value <= reference, WaitCmp::Equal => value == reference, WaitCmp::NotEqual => value != reference, WaitCmp::GreaterEq => value >= reference, WaitCmp::Greater => value > reference, WaitCmp::Always => true, } } } /// Reason the GPU is currently parked. Mirrors the CPU-side scheduler /// `BlockReason` shape. Currently only `WaitRegMem` — more to come in later /// phases (interrupts, timestamp waits). #[derive(Debug, Clone)] pub enum GpuBlock { WaitRegMem { poll_addr: u32, is_memory: bool, reference: u32, mask: u32, cmp: WaitCmp, }, } impl GpuBlock { /// Probe the wait condition. Returns `true` if the condition holds and /// the GPU should be unparked. pub fn is_satisfied(&self, mem: &dyn MemoryAccess, reg_file: &RegisterFile) -> bool { match self { GpuBlock::WaitRegMem { poll_addr, is_memory, reference, mask, cmp, } => { let value = if *is_memory { mem.read_u32(*poll_addr) } else { reg_file.read(*poll_addr) }; cmp.evaluate(value & *mask, *reference) } } } } /// Public notification the CP emits when the guest presents a frame. The /// kernel `vd_swap` path gates on this to push a `SwapInfo` to the host UI. #[derive(Debug, Clone, Copy, Default)] pub struct SwapNotification { pub frame_index: u64, pub frontbuffer_phys: u32, pub width: u32, pub height: u32, } /// GPU interrupt (fired by `PM4_INTERRUPT`). The kernel dispatches this to /// the guest callback registered by `VdSetGraphicsInterruptCallback`. #[derive(Debug, Clone, Copy)] pub struct PendingInterrupt { pub source: InterruptSource, pub cpu_mask: u32, } #[derive(Debug, Clone, Copy)] pub enum InterruptSource { CommandProcessor, Swap, } /// Per-run counters for observability. #[derive(Debug, Clone, Default)] pub struct GpuStats { pub packets_executed: u64, pub draws_seen: u64, pub swaps_seen: u64, pub interrupts_emitted: u64, pub wait_reg_mem_blocks: u64, pub indirect_buffer_jumps: u64, /// P4: count of EDRAM→memory resolves triggered by `TILE_FLUSH` events /// (event code 15). Non-zero means the game is committing rendered /// pixels to the frontbuffer / a texture. pub resolves_total: u64, /// Resolves whose byte copy path ran and wrote at least one sample to /// guest memory. Delta against `resolves_total` indicates how many /// resolves were skipped for an unsupported format / MSAA mode / 3D /// destination. pub resolves_copied_total: u64, /// Resolves that were skipped by [`crate::resolve::copy_to_memory`] due /// to an unsupported format path. Logged at `warn` so the reason is /// visible. pub resolves_skipped_total: u64, /// Total number of 32bpp samples written into guest memory across all /// successful resolves. Useful for sanity-checking that a big splash /// frame actually made it out (e.g. 1280×720 = 921_600 samples). pub resolve_samples_written: u64, /// P4: unique render-target keys seen (as managed by the internal /// `RenderTargetCache`). Useful HUD metric for multi-target workloads. pub unique_render_targets: u64, } /// Result of one packet step. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum ExecOutcome { /// Consumed one packet; GPU remains Ready. Stepped { dwords_consumed: u32 }, /// Nothing to do right now. Idle, /// Parked on a sync primitive; `GpuSystem::pending_block` has details. Blocked, } /// Shader fetch constants: the game uploads them via `PM4_SET_CONSTANT` type=1 /// into a 256-dword region. Games then reference them by index when binding /// textures / vertex buffers. pub const CONST_BASE_ALU: u32 = 0x4000; pub const CONST_BASE_FETCH: u32 = 0x4800; pub const CONST_BASE_BOOL: u32 = 0x4900; pub const CONST_BASE_LOOP: u32 = 0x4908; pub const CONST_BASE_REGISTERS: u32 = 0x2000; /// Atomic mailbox for the handful of GPU registers that CROSS the MMIO /// boundary. Guests write into the `0x7FC80000` register aperture; those /// writes run through [`crate::mmio_region`] and land in these atomics. /// Inside `execute_one` / the scheduler's per-round GPU hook we sample them /// to sync `ring.write_offset_dwords`, reflect progress back to the guest, /// etc. /// /// Only these three registers need atomic cross-thread access. Everything /// else lives in [`GpuSystem::register_file`] which is CPU-thread-local. #[derive(Debug, Clone)] pub struct GpuMmio { /// `CP_RB_WPTR` — guest writes dword offset of the write pointer. pub cp_rb_wptr: Arc, /// `CP_RB_RPTR` — we mirror our internal `ring.read_offset_dwords` here /// so guests polling the register see progress. pub cp_rb_rptr: Arc, /// `CP_INT_STATUS` — bit set when an interrupt is pending. pub cp_int_status: Arc, /// `CP_INT_ACK` — guest clears the bit after handling. pub cp_int_ack: Arc, /// `D1MODE_VBLANK_VLINE_STATUS` (register `0x1951`, MMIO offset `0x6544`). /// Bit 0 = `VBLANK_INT_OCCURRED` — set by the GPU when vblank fires, /// cleared by the guest via write-1-to-clear. Sylpheed's vsync callback /// gates *all* its work on reading bit 0 as set: `lwz; rlwinm. r,r,0,31,31; /// bc 12,2,skip`. Without this bit toggling across vsyncs the callback /// always skips, so the PKEVENT that feeds the render dispatcher /// (user_data + 0x3B28) never gets signaled and the worker loops /// forever. pub d1mode_vblank_vline_status: Arc, /// M1.7 parker — set by producers (guest WPTR writes, shutdown) so /// the GPU worker thread does not park when work is pending. The /// worker swaps to `false` on entering its park decision and /// re-checks predicates; if a producer raced between the swap and /// the actual `park_timeout`, the producer's `unpark()` returns the /// park immediately via std's token semantics. Inline mode never /// reads this; the cost is one extra atomic store per WPTR write. pub wake_pending: Arc, /// Handle to the GPU worker thread, populated by `GpuWorker::run` on /// startup. The MMIO write callback for `CP_RB_WPTR` `unpark()`s it /// after every guest WPTR write so the worker proceeds without /// waiting for its `park_timeout`. `None` in inline mode (no worker /// to wake), in which case the unpark site is a one-mutex-lock /// no-op. pub worker_thread: Arc>>, } impl GpuMmio { pub fn new() -> Self { Self { cp_rb_wptr: Arc::new(AtomicU32::new(0)), cp_rb_rptr: Arc::new(AtomicU32::new(0)), cp_int_status: Arc::new(AtomicU32::new(0)), cp_int_ack: Arc::new(AtomicU32::new(0)), d1mode_vblank_vline_status: Arc::new(AtomicU32::new(0)), wake_pending: Arc::new(AtomicBool::new(false)), worker_thread: Arc::new(std::sync::Mutex::new(None)), } } } impl Default for GpuMmio { fn default() -> Self { Self::new() } } /// Live GPU system. One instance per `KernelState`. pub struct GpuSystem { pub register_file: RegisterFile, pub ring: RingBufferView, /// Stack of saved rings for `PM4_INDIRECT_BUFFER` nesting. The active /// ring is always `ring`; when an IB packet arrives, we push `ring` onto /// this stack and replace `ring` with the IB view. On IB completion /// (read pointer catches up to size), we pop. ib_stack: Vec, /// Cached shader blobs keyed by the raw CP register address that loaded them. pub shader_blobs: HashMap, /// P8 — FIFO of blob keys for bounded eviction. On `IM_LOAD*` the /// new key is pushed to the back; if the blob count exceeds /// [`SHADER_BLOB_CAP`], the front is popped and removed from /// `shader_blobs`. Prevents long-running guests from growing the /// cache without bound. The two *active* keys (`active_vs_key` + /// `active_ps_key`) are never evicted — safeguard in `evict_oldest`. pub shader_blob_order: std::collections::VecDeque, /// Monotonic frame counter (bumped on `PM4_XE_SWAP`). pub swap_counter: u64, /// Most recent swap notification; the kernel polls this after `execute_one` /// to decide whether to push a UI swap event. pub last_swap: Option, /// Queue of interrupts not yet delivered to the guest. Private so that /// callers go through [`Self::take_pending_interrupts`] — M1 step 6 /// then redirects this drain into a `crossbeam_channel::Sender` without /// re-touching every call site. pending_interrupts: Vec, /// Current stall reason, if any. pub pending_block: Option, pub stats: GpuStats, /// For the 64-bit bin mask/select we split hi/lo writes. pub bin_mask: u64, pub bin_select: u64, /// Shared-atomic mailbox for the `0x7FC80000` MMIO aperture. Cloned into /// the [`crate::mmio_region`] callbacks; clone-ownership keeps the bus /// side and the executor side in sync without locks. pub mmio: GpuMmio, /// Most recent draw state extracted at a `PM4_DRAW_INDX*` packet. The /// uber-shader pipeline in P3+ reads this to build its wgpu draw call. pub last_draw: Option, /// Most recent processed primitive — index rewrite + host topology /// decision. Separate from `last_draw` because its `rewritten_indices` /// may be large and callers may want to drop it after consumption. pub last_primitive: Option, /// Key in `shader_blobs` of the currently-active vertex shader. Set by /// `PM4_IM_LOAD[_IMMEDIATE]` for `ShaderType::kVertex` and read at /// `PM4_DRAW_INDX*` time so the host side can upload the matching /// microcode bytes before dispatching. pub active_vs_key: Option, /// Key in `shader_blobs` of the currently-active pixel shader. Set by /// `PM4_IM_LOAD[_IMMEDIATE]` for `ShaderType::kPixel`. pub active_ps_key: Option, /// P4: EDRAM tile-ownership bookkeeping + per-key RT descriptors. Updated /// at each `PM4_DRAW_INDX*` (`bind` + `claim_tiles`) and queried by /// `TILE_FLUSH` event handling to decide resolve sources. pub rt_cache: crate::render_target_cache::RenderTargetCache, /// P4: most recent `ResolveInfo` observed when `TILE_FLUSH` fired. The UI /// bridge surfaces this in the HUD so users can tell when a game is /// resolving to the frontbuffer versus an off-screen target. pub last_resolve: Option, /// P5: CPU-side decoded-texture cache (shared across draws within a /// frame; trimmed implicitly by insertion). `ensure_cached` hits this /// on every texture-fetch resolution; the UI thread sees the decoded /// bytes via `UiBridge::publish_texture`. pub texture_cache: crate::texture_cache::TextureCache, /// 10 MiB shadow of the Xenos EDRAM. Written by clear-resolves and /// (future) host-render-target readback; read by the resolve byte-copy /// path that writes tiled pixels into guest memory. Allocated once at /// `GpuSystem::new` and lives for the whole GPU lifetime — no /// per-frame churn. pub edram: crate::edram::ShadowEdram, } impl GpuSystem { pub fn new() -> Self { Self { register_file: RegisterFile::new(), ring: RingBufferView::new(), ib_stack: Vec::new(), shader_blobs: HashMap::new(), shader_blob_order: std::collections::VecDeque::with_capacity(SHADER_BLOB_CAP + 1), swap_counter: 0, last_swap: None, pending_interrupts: Vec::new(), pending_block: None, stats: GpuStats::default(), bin_mask: 0, bin_select: 0, mmio: GpuMmio::new(), last_draw: None, last_primitive: None, active_vs_key: None, active_ps_key: None, rt_cache: crate::render_target_cache::RenderTargetCache::new(), last_resolve: None, texture_cache: crate::texture_cache::TextureCache::new(), edram: crate::edram::ShadowEdram::new(), } } /// P8 — insert a shader blob + bump the FIFO so long-running games /// don't grow `shader_blobs` without bound. Caps at [`SHADER_BLOB_CAP`]. /// Never evicts the currently-active VS/PS blobs (if they ended up at /// the front of the queue, we skip past them). fn insert_shader_blob(&mut self, key: u32, blob: ShaderBlob) { let already_present = self.shader_blobs.contains_key(&key); self.shader_blobs.insert(key, blob); if !already_present { self.shader_blob_order.push_back(key); metrics::counter!("gpu.shader.blob_seen").increment(1); } while self.shader_blobs.len() > SHADER_BLOB_CAP { // Pop the oldest key that isn't one of the active ones. let mut evicted = None; for _ in 0..self.shader_blob_order.len() { if let Some(candidate) = self.shader_blob_order.pop_front() { if Some(candidate) == self.active_vs_key || Some(candidate) == self.active_ps_key { self.shader_blob_order.push_back(candidate); continue; } self.shader_blobs.remove(&candidate); evicted = Some(candidate); break; } } if evicted.is_some() { metrics::counter!("gpu.shader.blob_evicted").increment(1); } else { // All remaining blobs are active — can't evict, stop. break; } } } /// P4: event dispatch helper called by every `PM4_EVENT_WRITE*` variant. /// `event_code` is the low 6 bits of the initiator word (see canary's /// `xenos::Event` enum — code 15 is `TILE_FLUSH`, the resolve trigger). /// /// Handle a `PM4_EVENT_WRITE*` initiator. For `TILE_FLUSH` (event 15) /// we decode the live `RB_*` register state into a [`ResolveInfo`], /// paint any clear values into the shadow EDRAM, and then copy bytes /// from the source render target into guest memory at /// `RB_COPY_DEST_BASE`. That last step is what `VdSwap` needs to see /// non-empty front-buffer pixels — see `memory/project_xenia_rs_edram /// _resolve_gap.md` for the history of this path. fn handle_event_initiator(&mut self, event_code: u32, mem: &dyn MemoryAccess) { const EVENT_TILE_FLUSH: u32 = 15; if event_code != EVENT_TILE_FLUSH { return; } let info = draw_state::ResolveInfo::from_register_file_and_memory( &self.register_file, mem, ); self.stats.resolves_total += 1; metrics::counter!( "gpu.resolve", "src" => format!("{}", info.copy_src_select), "fmt" => format!("{}", info.dest_format), "cmd" => format!("{}", info.copy_command), ) .increment(1); tracing::info!( src = info.copy_src_select, dst_base = format_args!("{:#010x}", info.dest_base), w = info.coords.width, h = info.coords.height, pitch = info.dest_pitch_pixels, fmt = info.dest_format, endian = info.dest_endian, clear_color = info.color_clear_enable, clear_depth = info.depth_clear_enable, "gpu: TILE_FLUSH resolve" ); // Paint clear values into the shadow EDRAM at the source tile // range *before* the copy. Games often issue a clear-then-resolve // as a single TILE_FLUSH: the EDRAM is filled with `RB_COLOR_CLEAR` // by the clear part, and that's what the copy part reads. // // Sample coordinates are pixel coordinates scaled up by // `sample_count_log2_x/y` — for 1x MSAA (Sylpheed) this is the // identity. if info.color_clear_enable && let draw_state::ResolveSource::Color(_) = info.source && info.surface_pitch_tiles > 0 { let sx = info.coords.x0 << info.coords.sample_count_log2_x; let sy = info.coords.y0 << info.coords.sample_count_log2_y; let sw = info.coords.width << info.coords.sample_count_log2_x; let sh = info.coords.height << info.coords.sample_count_log2_y; // 64bpp clears use both `RB_COLOR_CLEAR_LO` (low 32 bits) and // `RB_COLOR_CLEAR` (high 32 bits) per Canary `draw_util.cc:1302-1303`. // 32bpp clears ignore the lo word entirely. if info.source_is_64bpp { self.edram.fill_rect_64bpp( info.source_base_tiles, info.surface_pitch_tiles, sx, sy, sw, sh, info.color_clear_value_lo, info.color_clear_value, ); } else { self.edram.fill_rect_32bpp( info.source_base_tiles, info.surface_pitch_tiles, sx, sy, sw, sh, info.color_clear_value, ); } } if info.depth_clear_enable && info.surface_pitch_tiles > 0 { let sx = info.coords.x0 << info.coords.sample_count_log2_x; let sy = info.coords.y0 << info.coords.sample_count_log2_y; let sw = info.coords.width << info.coords.sample_count_log2_x; let sh = info.coords.height << info.coords.sample_count_log2_y; // Depth tiles live at RB_DEPTH_INFO.depth_base regardless of // which source this resolve selects. let rb_depth_info = self.register_file.read(draw_state::reg::RB_DEPTH_INFO); let depth_base = (rb_depth_info & 0xFFF) as u16; self.edram.fill_rect_32bpp( depth_base, info.surface_pitch_tiles, sx, sy, sw, sh, info.depth_clear_value, ); } // Byte copy into guest memory. let stats = crate::resolve::copy_to_memory(&info, &self.edram, mem); if stats.supported && stats.samples_written > 0 { self.stats.resolves_copied_total += 1; self.stats.resolve_samples_written += stats.samples_written as u64; } else if !stats.supported { self.stats.resolves_skipped_total += 1; } self.last_resolve = Some(info); } /// Sync state with the MMIO atomic mailbox. Call once at the top of the /// per-round GPU hook: the guest may have advanced `CP_RB_WPTR` since /// we last ran, and we in turn reflect our read-pointer back to the /// mirror register so the guest sees progress. /// /// GPUBUG-006: under `--parallel`, the producer (the guest CP_RB_WPTR /// MMIO write) uses `Release` to publish prior ring-memory writes; /// the consumer here must `Acquire`-load to pair correctly. With /// Relaxed-on-load, ring-memory writes that the guest performed /// before bumping WPTR could be reordered past our subsequent reads /// — leading to garbage PM4 packet contents. The producer side at /// `mmio_region.rs:78` already uses Release; the consumer's Relaxed /// was the missing half. Symmetrically, the RPTR mirror store /// publishes our read progress to the guest and benefits from a /// Release. pub fn sync_with_mmio(&mut self) { let wptr_dwords = self.mmio.cp_rb_wptr.load(Ordering::Acquire); if wptr_dwords != self.ring.write_offset_dwords && self.ring.size_dwords != 0 { self.ring.write_offset_dwords = wptr_dwords % self.ring.size_dwords; } // Mirror our read pointer (Release pairs with any guest-side // Acquire-load of CP_RB_RPTR for ring writeback bookkeeping). self.mmio .cp_rb_rptr .store(self.ring.read_offset_dwords, Ordering::Release); } /// True iff `execute_one` is expected to make progress without blocking. pub fn is_ready(&self, mem: &dyn MemoryAccess) -> bool { if let Some(block) = &self.pending_block { return block.is_satisfied(mem, &self.register_file); } self.ring.has_pending() } /// Execute exactly one PM4 packet. Returns [`ExecOutcome::Idle`] when /// there's nothing to do, [`ExecOutcome::Blocked`] if a sync primitive /// stalls the GPU, otherwise [`ExecOutcome::Stepped`] with the number of /// dwords consumed (counting the header). pub fn execute_one(&mut self, mem: &dyn MemoryAccess) -> ExecOutcome { // 0) If currently parked, probe the condition and either wake up or stay blocked. if let Some(block) = self.pending_block.clone() { if block.is_satisfied(mem, &self.register_file) { tracing::debug!(?block, "gpu: wait satisfied — resuming"); self.pending_block = None; } else { return ExecOutcome::Blocked; } } if !self.ring.has_pending() { // End of current ring. If we were inside an indirect buffer, pop // and resume the caller. if let Some(caller) = self.ib_stack.pop() { self.ring = caller; if self.ring.has_pending() { return self.execute_one(mem); } } return ExecOutcome::Idle; } let header_addr = self.ring.addr_at_offset(0).unwrap(); let header_word = mem.read_u32(header_addr); let packet = pm4::decode(header_word); tracing::trace!( header = format_args!("{header_word:#010x}"), addr = format_args!("{header_addr:#010x}"), ?packet.kind, "gpu: packet" ); let consumed = match packet.kind { PacketKind::Type0 { base_index, count, write_one } => { self.handle_type0(mem, base_index, count, write_one, packet.total_dwords) } PacketKind::Type1 { reg_index_1, reg_index_2 } => { self.handle_type1(mem, reg_index_1, reg_index_2) } PacketKind::Type2 => 1, PacketKind::Type3 { opcode, count, predicated, } => match self.handle_type3(mem, opcode, count, predicated, packet.total_dwords) { Type3Result::Consumed(n) => n, Type3Result::Blocked { rewind_to_header } => { // Re-park on this packet so the resume path re-reads it. if rewind_to_header { // We haven't moved read ptr yet, so this is a no-op — // documented to keep intent explicit. } return ExecOutcome::Blocked; } }, }; self.ring.advance_read(consumed); self.writeback_read_ptr(mem); self.stats.packets_executed += 1; ExecOutcome::Stepped { dwords_consumed: consumed, } } /// First-Pixels M2b — kernel-side commit point for `VdSwap`. Prior to /// M2b the kernel's [`vd_swap`] wrote a synthetic `PM4_XE_SWAP` packet /// at the guest-provided `buffer_ptr` + advanced our ring `wptr` by 64 /// dwords, expecting the drain to pick it up. That mechanism misaligned: /// the drain reads from `ring.base + rptr * 4` forward, not from the /// game's out-of-band `buffer_ptr`. 512 ring packets executed through /// 1 B guest instructions but `swaps_seen` stayed at 0. /// /// `VdSwap` is the kernel's commit point by definition — we don't need /// to launder the event through the ring. Call this directly from the /// kernel-side handler; the `PM4_XE_SWAP` opcode path still works for /// the (rare) case of a game that emits the packet through its own ring /// writes. pub fn notify_xe_swap(&mut self, frontbuffer_phys: u32, width: u32, height: u32) { self.stats.swaps_seen += 1; self.swap_counter = self.swap_counter.wrapping_add(1); self.last_swap = Some(SwapNotification { frame_index: self.swap_counter, frontbuffer_phys, width, height, }); self.pending_interrupts.push(PendingInterrupt { source: InterruptSource::Swap, cpu_mask: 0x1, }); tracing::info!( frame = self.swap_counter, fb = format_args!("{frontbuffer_phys:#010x}"), width, height, "gpu: XE_SWAP (kernel-direct)" ); } /// Called by `VdInitializeRingBuffer` to give us the primary ring. pub fn initialize_ring_buffer(&mut self, base: u32, size_log2: u32) { let size_bytes = 1u32 << size_log2.min(31); self.ring.base = base; self.ring.size_dwords = size_bytes / 4; self.ring.read_offset_dwords = 0; // `write_offset` is driven by the guest — start at 0 so the ring // appears empty until MMIO writes advance it. self.ring.write_offset_dwords = 0; tracing::info!( base = format_args!("{base:#010x}"), size_bytes, size_dwords = self.ring.size_dwords, "gpu: ring initialized" ); } /// Called by `VdEnableRingBufferRPtrWriteBack` to record where the guest /// expects us to mirror `read_offset_dwords`. pub fn enable_rptr_writeback(&mut self, addr: u32, block_log2: u32) { self.ring.rptr_writeback_addr = addr; self.ring.rptr_writeback_block_dwords = 1u32 << block_log2.min(31); tracing::info!( addr = format_args!("{addr:#010x}"), block_dwords = self.ring.rptr_writeback_block_dwords, "gpu: rptr writeback enabled" ); } /// Drain the pending-interrupt queue. The kernel calls this once per /// scheduler round and queues each entry into `interrupts.queue_interrupt`. /// /// M1 step 6 swaps the `Vec`-backed implementation for a /// `crossbeam_channel::Sender`. Routing every external /// reader through this single accessor in step 2 means that swap is a /// localized change — no call site changes. /// /// Returns the previously-queued interrupts and leaves the internal queue /// empty. Cheap (`Vec::take`); no allocation when the queue is already /// empty. pub fn take_pending_interrupts(&mut self) -> Vec { std::mem::take(&mut self.pending_interrupts) } /// True when the pending-interrupt queue has at least one entry. Used /// by callers that want to short-circuit an empty drain (saving the /// `Vec::new()` allocation that `take` would otherwise force on every /// scheduler round). pub fn has_pending_interrupts(&self) -> bool { !self.pending_interrupts.is_empty() } /// Extend the logical write pointer by `dwords` (cumulative). `VdSwap` /// reserves 64 dwords then calls this; MMIO writes to `CP_RB_WPTR` will /// do the same in P2+. pub fn extend_write_ptr(&mut self, new_wptr_dwords: u32) { if self.ring.size_dwords == 0 { return; } self.ring.write_offset_dwords = new_wptr_dwords % self.ring.size_dwords; } /// Write the current read pointer back to the guest-registered /// address. M1.8 uses the fenced variant: when the GPU runs on its /// own host thread, the CPU can poll this RPTR mirror to learn how /// far the GPU has consumed the ring; the Release fence ensures any /// upstream packet effects (memory writes, register file updates /// the guest reads via subsequent MMIO) happen-before the /// CPU-visible RPTR bump. fn writeback_read_ptr(&mut self, mem: &dyn MemoryAccess) { if self.ring.rptr_writeback_addr != 0 && self.ring.is_initialized() { mem.write_u32_fence( self.ring.rptr_writeback_addr, self.ring.read_offset_dwords, ); } } // ── Type-0/1 handlers ───────────────────────────────────────────────── fn handle_type0( &mut self, mem: &dyn MemoryAccess, base_index: u32, count: u32, write_one: bool, total_dwords: u32, ) -> u32 { for i in 0..count { let dword_addr = self.ring.addr_at_offset(1 + i).unwrap(); let value = mem.read_u32(dword_addr); let target = if write_one { base_index } else { base_index + i }; self.register_file.write(target, value); } tracing::trace!( base = format_args!("{base_index:#x}"), count, write_one, "gpu: Type0 reg write run" ); total_dwords } fn handle_type1( &mut self, mem: &dyn MemoryAccess, reg_index_1: u32, reg_index_2: u32, ) -> u32 { let a_addr = self.ring.addr_at_offset(1).unwrap(); let b_addr = self.ring.addr_at_offset(2).unwrap(); let a = mem.read_u32(a_addr); let b = mem.read_u32(b_addr); self.register_file.write(reg_index_1, a); self.register_file.write(reg_index_2, b); tracing::trace!( r1 = format_args!("{reg_index_1:#x}"), r2 = format_args!("{reg_index_2:#x}"), "gpu: Type1 dual reg write" ); 3 } // ── Type-3 dispatch ─────────────────────────────────────────────────── fn handle_type3( &mut self, mem: &dyn MemoryAccess, opcode: u8, count: u32, predicated: bool, total_dwords: u32, ) -> Type3Result { metrics::counter!("gpu.packet", "opcode" => pm4::type3_opcode_name(opcode)).increment(1); tracing::trace!( opcode = format_args!("{opcode:#x}"), name = pm4::type3_opcode_name(opcode), count, predicated, "gpu: Type3" ); // If predicated and the bin mask/select combo evaluates to "skip", // consume the whole packet (including data dwords) and move on. We // don't emulate binning so bin_mask & bin_select is always 0 → we // keep predicated packets in simplest form: execute them anyway. Most // games don't use binning on Xenos. Observed in canary: // `pm4_command_processor_implement.h:440-460`. let _ = predicated; match opcode { pm4::PM4_NOP | pm4::PM4_WAIT_FOR_IDLE | pm4::PM4_CONTEXT_UPDATE | pm4::PM4_INVALIDATE_STATE | pm4::PM4_ME_INIT | pm4::PM4_VIZ_QUERY | pm4::PM4_SET_SHADER_BASES => { // Classify-and-skip. State side effects (if any) are deferred. } pm4::PM4_INDIRECT_BUFFER | pm4::PM4_INDIRECT_BUFFER_PFD => { self.stats.indirect_buffer_jumps += 1; let ib_ptr = self.read_payload(mem, 1); let ib_size = self.read_payload(mem, 2); // Advance past the IB header + payload before recursing so // the return location is correct. self.ring.advance_read(total_dwords); self.writeback_read_ptr(mem); // Push current ring, switch to IB view. let caller = self.ring; self.ib_stack.push(caller); self.ring = RingBufferView { base: ib_ptr & !3, size_dwords: ib_size, read_offset_dwords: 0, write_offset_dwords: ib_size, // IB is fully-written at jump time rptr_writeback_addr: 0, rptr_writeback_block_dwords: 0, }; tracing::debug!( ib_ptr = format_args!("{ib_ptr:#010x}"), ib_size, "gpu: jump to indirect buffer" ); return Type3Result::Consumed(0); // we already advanced } pm4::PM4_WAIT_REG_MEM => { // Canary layout (pm4_command_processor_implement.h:699-755): // payload[0] = wait_info (bit 4 = is_memory, bits 2:0 = cmp) // payload[1] = poll address (register idx OR memory addr; bottom 2 bits = endian for memory) // payload[2] = ref value // payload[3] = mask // payload[4] = wait (sleep hint, ignored) let wait_info = self.read_payload(mem, 1); let poll_addr_raw = self.read_payload(mem, 2); let reference = self.read_payload(mem, 3); let mask = self.read_payload(mem, 4); let is_memory = (wait_info & 0x10) != 0; let cmp = WaitCmp::from_wait_info(wait_info); let poll_addr = if is_memory { poll_addr_raw & !3 } else { poll_addr_raw }; let block = GpuBlock::WaitRegMem { poll_addr, is_memory, reference, mask, cmp, }; if block.is_satisfied(mem, &self.register_file) { // Condition already true; proceed past this packet. tracing::trace!(?block, "gpu: WAIT_REG_MEM immediately satisfied"); } else { self.stats.wait_reg_mem_blocks += 1; tracing::debug!(?block, "gpu: WAIT_REG_MEM parking"); self.pending_block = Some(block); return Type3Result::Blocked { rewind_to_header: true }; } } pm4::PM4_REG_RMW => { // payload[0] = rmw_info (bit 31 = and_from_reg, bit 30 = or_from_reg) // payload[1] = and mask (or register index) // payload[2] = or mask (or register index) let rmw_info = self.read_payload(mem, 1); let and_or_reg = (rmw_info & 0x8000_0000) != 0; let or_from_reg = (rmw_info & 0x4000_0000) != 0; let reg_index = rmw_info & 0x1FFF; let p2 = self.read_payload(mem, 2); let p3 = self.read_payload(mem, 3); let and_mask = if and_or_reg { self.register_file.read(p2 & 0x1FFF) } else { p2 }; let or_mask = if or_from_reg { self.register_file.read(p3 & 0x1FFF) } else { p3 }; let cur = self.register_file.read(reg_index); let new_value = (cur & and_mask) | or_mask; self.register_file.write(reg_index, new_value); tracing::trace!( reg = format_args!("{reg_index:#x}"), cur = format_args!("{cur:#x}"), new = format_args!("{new_value:#x}"), "gpu: REG_RMW" ); } pm4::PM4_REG_TO_MEM => { // payload[0] = reg_index, payload[1] = mem addr let reg_index = self.read_payload(mem, 1) & 0x1FFF; let dst = self.read_payload(mem, 2) & !3; let value = self.register_file.read(reg_index); mem.write_u32(dst, value); tracing::trace!( reg = format_args!("{reg_index:#x}"), dst = format_args!("{dst:#010x}"), value = format_args!("{value:#x}"), "gpu: REG_TO_MEM" ); } pm4::PM4_MEM_WRITE => { // payload[0] = dst, payload[1..=count-1] = values let mut dst = self.read_payload(mem, 1) & !3; for i in 2..=count { let val = self.read_payload(mem, i); mem.write_u32(dst, val); dst = dst.wrapping_add(4); } } pm4::PM4_COND_WRITE => { // payload[0] = wait_info, [1] = poll addr, [2] = ref, [3] = mask, // [4] = write addr/reg, [5] = write data let wait_info = self.read_payload(mem, 1); let poll_raw = self.read_payload(mem, 2); let reference = self.read_payload(mem, 3); let mask = self.read_payload(mem, 4); let is_memory = (wait_info & 0x10) != 0; let cmp = WaitCmp::from_wait_info(wait_info); let poll_addr = if is_memory { poll_raw & !3 } else { poll_raw }; let cur_raw = if is_memory { mem.read_u32(poll_addr) } else { self.register_file.read(poll_addr) }; if cmp.evaluate(cur_raw & mask, reference) { let write_addr = self.read_payload(mem, 5); let write_data = self.read_payload(mem, 6); if (wait_info & 0x100) != 0 { mem.write_u32(write_addr & !3, write_data); } else { self.register_file .write(write_addr & 0x1FFF, write_data); } } } pm4::PM4_EVENT_WRITE => { // payload[0] = initiator (written to VGT_EVENT_INITIATOR). let initiator = self.read_payload(mem, 1); self.register_file .write(reg::VGT_EVENT_INITIATOR, initiator & 0x3F); self.handle_event_initiator(initiator & 0x3F, mem); tracing::trace!(initiator = format_args!("{:#x}", initiator & 0x3F), "gpu: EVENT_WRITE"); } pm4::PM4_EVENT_WRITE_SHD => { // payload[0] = initiator (bit 31: write counter, else write `value`) // payload[1] = address, payload[2] = value let initiator = self.read_payload(mem, 1); let address = self.read_payload(mem, 2); let value = self.read_payload(mem, 3); self.register_file .write(reg::VGT_EVENT_INITIATOR, initiator & 0x3F); self.handle_event_initiator(initiator & 0x3F, mem); let data = if (initiator & 0x8000_0000) != 0 { self.swap_counter as u32 } else { value }; // M1.8: fenced write. The CPU thread busy-polls this // address as a GPU completion fence. The Release fence // emitted here pairs with `read_u32_fence`'s Acquire on // the polling side: any earlier writes the worker // performed (RPTR writeback, resolve target writes, // etc.) are visible to the CPU once it sees the new // fence value. mem.write_u32_fence(address & !3, data); tracing::trace!( addr = format_args!("{:#010x}", address & !3), value = format_args!("{data:#x}"), "gpu: EVENT_WRITE_SHD" ); } pm4::PM4_EVENT_WRITE_EXT => { // payload[0] = initiator, [1] = address. Writes 6 u16 extents // (min/max x/y/z) — we're not tracking scissors yet, so write zeros. let initiator = self.read_payload(mem, 1); let address = self.read_payload(mem, 2) & !3; self.register_file .write(reg::VGT_EVENT_INITIATOR, initiator & 0x3F); self.handle_event_initiator(initiator & 0x3F, mem); for i in 0..6u32 { mem.write_u16(address + i * 2, 0); } } pm4::PM4_EVENT_WRITE_ZPD => { // Occlusion query writeback — always write zeros (no query). let initiator = self.read_payload(mem, 1); self.register_file .write(reg::VGT_EVENT_INITIATOR, initiator & 0x3F); self.handle_event_initiator(initiator & 0x3F, mem); } pm4::PM4_DRAW_INDX | pm4::PM4_DRAW_INDX_2 => { self.stats.draws_seen += 1; // Canary (`pm4_command_processor_implement.h:1128-1151`): // DRAW_INDX: payload[0] = viz_query, payload[1] = vgt_draw_initiator, // [2] = dma_base (if source=DMA), [3] = dma_size // DRAW_INDX_2: payload[0] = vgt_draw_initiator (indices follow inline). let (vgt, dma_base, dma_size) = if opcode == pm4::PM4_DRAW_INDX { let _viz = self.read_payload(mem, 1); let vgt = self.read_payload(mem, 2); let (db, ds) = if count >= 4 { (Some(self.read_payload(mem, 3)), Some(self.read_payload(mem, 4))) } else { (None, None) }; (vgt, db, ds) } else { (self.read_payload(mem, 1), None, None) }; let mut ds = draw_state::extract(&self.register_file, vgt, dma_base, dma_size); ds.vs_blob_key = self.active_vs_key; ds.ps_blob_key = self.active_ps_key; let processed = primitive::process(ds.primitive, ds.vertex_count, None); metrics::counter!( "gpu.draw", "prim" => format!("{:?}", ds.primitive), ) .increment(1); if processed.rejected { metrics::counter!("gpu.draw.rejected").increment(1); } // P4: update the render-target cache with every bound RT // from this draw. Each bind either inserts a new key or // refreshes an existing descriptor's bind_count. `msaa` is // still hardcoded to 1× because we don't yet decode // `PA_SC_AA_CONFIG`; P4b can add that. let msaa = crate::render_target_cache::MsaaSamples::X1; let mut viewport_height = ds.viewport.scale_y.abs() * 2.0; if viewport_height <= 0.0 { viewport_height = 720.0; } // 16 samples per tile row (64-sample 8×8 macroblocks pack // 16 vertical samples per EDRAM tile). let rows_of_tiles = (viewport_height as u32).div_ceil(16); for (i, ci_opt) in ds.color_info.iter().enumerate() { if let Some(ci) = ci_opt { let pitch32 = ds.scissor.br_x.div_ceil(32); let key = crate::render_target_cache::RenderTargetKey { base_tiles: ci.base_tiles, pitch_tiles_at_32bpp: pitch32, msaa_samples: msaa, is_depth: false, resource_format: ci.format & 0xF, }; let rt_idx = self.rt_cache.bind(key, self.stats.draws_seen as u32); self.rt_cache.claim_tiles(rt_idx, rows_of_tiles as u16); metrics::counter!( "gpu.rt.bind", "slot" => format!("{i}"), "fmt" => format!("{}", ci.format & 0xF), ) .increment(1); } } if let Some(depth) = ds.depth_info { let pitch32 = ds.scissor.br_x.div_ceil(32); let key = crate::render_target_cache::RenderTargetKey { base_tiles: depth.base_tiles, pitch_tiles_at_32bpp: pitch32, msaa_samples: msaa, is_depth: true, resource_format: depth.format & 0xF, }; let rt_idx = self.rt_cache.bind(key, self.stats.draws_seen as u32); self.rt_cache.claim_tiles(rt_idx, rows_of_tiles as u16); } self.stats.unique_render_targets = self.rt_cache.len() as u64; tracing::debug!( opcode = format_args!("{opcode:#x}"), prim = ?ds.primitive, verts = ds.vertex_count, ?processed.topology, rewritten = processed.rewritten_indices.is_some(), "gpu: DRAW_INDX captured" ); self.last_draw = Some(ds); self.last_primitive = Some(processed); } pm4::PM4_SET_CONSTANT | pm4::PM4_SET_SHADER_CONSTANTS => { // payload[0] = offset_type — bits[10:0] index, bits[23:16] type let offset_type = self.read_payload(mem, 1); let index = offset_type & 0x7FF; let const_type = (offset_type >> 16) & 0xFF; let base = match const_type { 0 => CONST_BASE_ALU, 1 => CONST_BASE_FETCH, 2 => CONST_BASE_BOOL, 3 => CONST_BASE_LOOP, 4 => CONST_BASE_REGISTERS, _ => CONST_BASE_ALU, // defensive default }; for i in 0..(count - 1) { let v = self.read_payload(mem, 2 + i); self.register_file.write(base + index + i, v); } } pm4::PM4_SET_CONSTANT2 => { // payload[0] = 16-bit index; subsequent payloads write consecutive regs. let index = self.read_payload(mem, 1) & 0xFFFF; for i in 0..(count - 1) { let v = self.read_payload(mem, 2 + i); self.register_file.write(index + i, v); } } pm4::PM4_LOAD_ALU_CONSTANT => { // payload[0] = source mem addr, [1] = offset_type, [2] = size_dwords let src = self.read_payload(mem, 1) & !3; let offset_type = self.read_payload(mem, 2); let size_dwords = self.read_payload(mem, 3); let index = offset_type & 0x7FF; let const_type = (offset_type >> 16) & 0xFF; let base = match const_type { 0 => CONST_BASE_ALU, 1 => CONST_BASE_FETCH, 2 => CONST_BASE_BOOL, 3 => CONST_BASE_LOOP, 4 => CONST_BASE_REGISTERS, _ => CONST_BASE_ALU, }; for i in 0..size_dwords { let v = mem.read_u32(src + i * 4); self.register_file.write(base + index + i, v); } } pm4::PM4_IM_LOAD | pm4::PM4_IM_LOAD_IMMEDIATE => { // Canary (pm4_command_processor_implement.h:1271-1330): // IM_LOAD payload: [0] addr_type, [1] start_size // IM_LOAD_IMMEDIATE payload: [0] shader_type, [1] start_size, [2..count] = microcode let shader_type = self.read_payload(mem, 1) as u8 & 0x3; let start_size = self.read_payload(mem, 2); let size_dwords = start_size & 0xFFFF; let blob = if opcode == pm4::PM4_IM_LOAD_IMMEDIATE { let mut v = Vec::with_capacity(size_dwords as usize); for i in 0..size_dwords { v.push(self.read_payload(mem, 3 + i)); } v } else { let addr = self.read_payload(mem, 1) & !3; let mut v = Vec::with_capacity(size_dwords as usize); for i in 0..size_dwords { v.push(mem.read_u32(addr + i * 4)); } v }; // For IM_LOAD the payload already carries an address that // uniquely identifies this shader in guest memory, so the // full `addr_type` dword (address | stage bits) makes a // good cache key. For IM_LOAD_IMMEDIATE payload[0] is just // the 2-bit shader_type — without a content-derived key // every immediate upload would collide on 0 or 1 and // thrash a single slot. Fold the microcode through a // stable FNV-1a hash so per-content dedup still works. let key = if opcode == pm4::PM4_IM_LOAD_IMMEDIATE { fnv1a_u32_dwords(shader_type as u32, &blob) } else { self.read_payload(mem, 1) }; self.insert_shader_blob( key, ShaderBlob { shader_type, dwords: blob, }, ); // P3b M1: record which blob is now "active" for the // current stage. The uber-shader dispatch (xenia-ui) reads // `active_vs_key`/`active_ps_key` at draw time to upload // the right microcode. `shader_type`: 0 = vertex, 1 = pixel // (per Xenos `ShaderType`). match shader_type { 0 => self.active_vs_key = Some(key), 1 => self.active_ps_key = Some(key), _ => {} } metrics::counter!( "gpu.shader.blob_seen", "stage" => if shader_type == 0 { "vs" } else { "ps" }, ) .increment(1); tracing::debug!( shader_type, size_dwords, key = format_args!("{key:#x}"), "gpu: IM_LOAD (shader blob cached)" ); } pm4::PM4_SET_BIN_MASK_LO => { self.bin_mask = (self.bin_mask & 0xFFFF_FFFF_0000_0000) | (self.read_payload(mem, 1) as u64); } pm4::PM4_SET_BIN_MASK_HI => { self.bin_mask = (self.bin_mask & 0x0000_0000_FFFF_FFFF) | ((self.read_payload(mem, 1) as u64) << 32); } pm4::PM4_SET_BIN_MASK => { let lo = self.read_payload(mem, 1) as u64; let hi = self.read_payload(mem, 2) as u64; self.bin_mask = (hi << 32) | lo; } pm4::PM4_SET_BIN_SELECT_LO => { self.bin_select = (self.bin_select & 0xFFFF_FFFF_0000_0000) | (self.read_payload(mem, 1) as u64); } pm4::PM4_SET_BIN_SELECT_HI => { self.bin_select = (self.bin_select & 0x0000_0000_FFFF_FFFF) | ((self.read_payload(mem, 1) as u64) << 32); } pm4::PM4_SET_BIN_SELECT => { let lo = self.read_payload(mem, 1) as u64; let hi = self.read_payload(mem, 2) as u64; self.bin_select = (hi << 32) | lo; } pm4::PM4_INTERRUPT => { let cpu_mask = self.read_payload(mem, 1); self.stats.interrupts_emitted += 1; self.pending_interrupts.push(PendingInterrupt { source: InterruptSource::CommandProcessor, cpu_mask, }); tracing::debug!( cpu_mask = format_args!("{cpu_mask:#x}"), "gpu: PM4_INTERRUPT queued" ); } pm4::PM4_XE_SWAP => { // Payload: [0] signature, [1] frontbuffer_phys, [2] width, [3] height let _signature = self.read_payload(mem, 1); let frontbuffer_phys = self.read_payload(mem, 2); let width = self.read_payload(mem, 3); let height = self.read_payload(mem, 4); self.notify_xe_swap(frontbuffer_phys, width, height); } _ => { // Unknown opcode — log once per opcode but don't stall. tracing::warn!( opcode = format_args!("{opcode:#x}"), count, "gpu: unhandled Type3 opcode" ); } } Type3Result::Consumed(total_dwords) } /// Read dword at payload-relative offset `i` (where `i=0` is the header). fn read_payload(&self, mem: &dyn MemoryAccess, i: u32) -> u32 { let addr = self.ring.addr_at_offset(i).unwrap(); mem.read_u32(addr) } /// Drain up to `max_packets` (used by the kernel's VdSwap handler when we /// don't yet have MMIO-triggered draining). Returns the number of /// packets consumed. pub fn drain(&mut self, mem: &dyn MemoryAccess, max_packets: u32) -> u32 { let mut n = 0; for _ in 0..max_packets { match self.execute_one(mem) { ExecOutcome::Stepped { .. } => n += 1, ExecOutcome::Idle | ExecOutcome::Blocked => break, } } n } /// Drain until the ring's read offset reaches `target_wptr` (modulo ring /// size) or `execute_one` returns Idle/Blocked. Mirrors canary's /// `WorkerThreadMain` (xenia-canary `command_processor.cc` ExecutePrimaryBuffer) /// which loops on `read_ptr_index_ != write_ptr_index` with no packet /// budget. `time_budget` bounds wall-clock so a pathological packet /// (e.g. an EVENT_WRITE that perpetually re-blocks) cannot spin the /// inline path; pass 900 ms to match the threaded `DrainFence` deadline. /// Returns the number of packets consumed. pub fn drain_until_wptr( &mut self, mem: &dyn MemoryAccess, target_wptr: u32, time_budget: Duration, ) -> u32 { if self.ring.size_dwords == 0 { return 0; } let target = target_wptr % self.ring.size_dwords; let deadline = Instant::now() + time_budget; let mut n = 0u32; while self.ring.read_offset_dwords != target { if Instant::now() >= deadline { // Deadline exhaustion is the *expected* outcome under // `--parallel` workloads (Sylpheed boot queues millions // of game-batched IBs the inline drain can't chew // through in 900 ms). Logged at debug because warn-level // would fire on every vd_swap. Callers can re-read the // ring read pointer to detect partial drain if they // care. tracing::debug!( target, rptr = self.ring.read_offset_dwords, consumed = n, "gpu: drain_until_wptr time-budget exhausted" ); break; } match self.execute_one(mem) { ExecOutcome::Stepped { .. } => { n += 1; // Mirror the threaded `DrainFence` handler at // handle.rs:553-570: re-sync after every packet so // any concurrent guest WPTR write (under `--parallel`) // folds into the local ring view before the next // `is_ready` check. Without this the local // write_offset is a snapshot of the moment we entered // the drain, which is fine for a target-WPTR drain // but wrong if downstream packets (e.g. an indirect // buffer's nested ring) need an updated view. self.sync_with_mmio(); } ExecOutcome::Idle | ExecOutcome::Blocked => break, } } n } } impl Default for GpuSystem { fn default() -> Self { Self::new() } } /// Subset of Xenos registers we reference by name. Full table at /// `xenia-canary/src/xenia/gpu/registers.h`. pub mod reg { //! All values below are Xenos *register indices* (the number you find in //! canary's `register_table.inc`, i.e. the byte offset within the //! aperture divided by 4). The MMIO aperture at `0x7FC8_0000` maps each //! register at `APERTURE_BASE + index * 4`; the MMIO callbacks recover //! the index with `(addr & 0xFFFF) / 4` before matching against these //! constants. /// `XE_GPU_REG_CP_RB_BASE` — ring buffer base address. pub const CP_RB_BASE: u32 = 0x01C0; /// `XE_GPU_REG_CP_RB_CNTL` — ring buffer control. pub const CP_RB_CNTL: u32 = 0x01C1; /// `XE_GPU_REG_CP_RB_RPTR_ADDR` — where to mirror read pointer. pub const CP_RB_RPTR_ADDR: u32 = 0x01C3; /// `XE_GPU_REG_CP_RB_RPTR` — read pointer (GPU → CPU). pub const CP_RB_RPTR: u32 = 0x01C4; /// `XE_GPU_REG_CP_RB_WPTR` — write pointer (CPU → GPU); MMIO side effect. pub const CP_RB_WPTR: u32 = 0x01C5; /// `XE_GPU_REG_CP_INT_STATUS` — interrupt status bits. pub const CP_INT_STATUS: u32 = 0x01F3; /// `XE_GPU_REG_CP_INT_ACK` — write-to-ack interrupt bits. pub const CP_INT_ACK: u32 = 0x01F4; /// `XE_GPU_REG_D1MODE_VBLANK_VLINE_STATUS` (Canary register_table.inc:1126). /// Bit 0 = VBLANK_INT_OCCURRED. pub const D1MODE_VBLANK_VLINE_STATUS: u32 = 0x1951; /// `XE_GPU_REG_VGT_EVENT_INITIATOR` — set by EVENT_WRITE. pub const VGT_EVENT_INITIATOR: u32 = 0x21F9; /// `XE_GPU_REG_COHER_STATUS_HOST` — coherency bits /// (Canary `register_table.inc:530`). pub const COHER_STATUS_HOST: u32 = 0x0A31; } /// 32-bit FNV-1a over a u32 seed + a slice of u32s. Used to derive a /// stable, collision-resistant cache key for `IM_LOAD_IMMEDIATE` shader /// blobs (where the guest supplies no natural address to key on). fn fnv1a_u32_dwords(seed: u32, dwords: &[u32]) -> u32 { const FNV_OFFSET: u32 = 0x811C_9DC5; const FNV_PRIME: u32 = 0x0100_0193; let mut hash = FNV_OFFSET; for byte in seed.to_le_bytes() { hash ^= byte as u32; hash = hash.wrapping_mul(FNV_PRIME); } for dw in dwords { for byte in dw.to_le_bytes() { hash ^= byte as u32; hash = hash.wrapping_mul(FNV_PRIME); } } hash } /// Internal Type-3 handler result. Distinguishes "consumed a packet (by N /// dwords)" from "blocked; don't advance read ptr". enum Type3Result { Consumed(u32), Blocked { rewind_to_header: bool }, } #[cfg(test)] mod tests { use super::*; use xenia_memory::GuestMemory; use xenia_memory::page_table::MemoryProtect; fn build_mem() -> GuestMemory { let mut mem = GuestMemory::new().unwrap(); let rw = MemoryProtect::READ | MemoryProtect::WRITE; mem.alloc(0x4000_0000, 0x4000, rw).unwrap(); mem } #[test] fn ready_when_ring_has_pending() { let mut gpu = GpuSystem::new(); let mem = build_mem(); assert!(!gpu.is_ready(&mem)); gpu.initialize_ring_buffer(0x4000_0000, 10); // 1024 bytes = 256 dwords assert!(!gpu.is_ready(&mem)); gpu.extend_write_ptr(4); assert!(gpu.is_ready(&mem)); } #[test] fn type2_nop_advances_read_pointer() { let mut gpu = GpuSystem::new(); let mut mem = build_mem(); // 256 dwords ring at 0x40000000 gpu.initialize_ring_buffer(0x4000_0000, 10); // Push 3 Type-2 NOPs for i in 0..3u32 { mem.write_u32(0x4000_0000 + i * 4, 0x8000_0000); } gpu.extend_write_ptr(3); for _ in 0..3 { match gpu.execute_one(&mut mem) { ExecOutcome::Stepped { dwords_consumed } => assert_eq!(dwords_consumed, 1), other => panic!("unexpected {:?}", other), } } assert_eq!(gpu.ring.read_offset_dwords, 3); assert_eq!(gpu.stats.packets_executed, 3); } #[test] fn type0_reg_run_writes_register_file() { let mut gpu = GpuSystem::new(); let mut mem = build_mem(); gpu.initialize_ring_buffer(0x4000_0000, 10); // Type0 header: 2 dwords, base_index=0x100 → write_one=0 → count field = 1 (count-1) let hdr = (1u32 << 16) | 0x100; mem.write_u32(0x4000_0000, hdr); mem.write_u32(0x4000_0004, 0xDEAD_BEEF); mem.write_u32(0x4000_0008, 0xCAFE_BABE); gpu.extend_write_ptr(3); assert!(matches!(gpu.execute_one(&mut mem), ExecOutcome::Stepped { .. })); assert_eq!(gpu.register_file.read(0x100), 0xDEAD_BEEF); assert_eq!(gpu.register_file.read(0x101), 0xCAFE_BABE); } #[test] fn wait_reg_mem_blocks_then_unblocks_when_mem_changes() { let mut gpu = GpuSystem::new(); let mut mem = build_mem(); gpu.initialize_ring_buffer(0x4000_0000, 10); // WAIT_REG_MEM: wait until *0x40001000 == 0x42 // header let hdr = (3u32 << 30) | ((5u32 - 1) << 16) | ((pm4::PM4_WAIT_REG_MEM as u32) << 8); mem.write_u32(0x4000_0000, hdr); // wait_info: is_memory=1 (bit 4), cmp=equal (bits 2:0 = 2) mem.write_u32(0x4000_0004, 0x12); mem.write_u32(0x4000_0008, 0x4000_1000); mem.write_u32(0x4000_000C, 0x42); mem.write_u32(0x4000_0010, 0xFFFF_FFFF); mem.write_u32(0x4000_0014, 0); gpu.extend_write_ptr(6); // First exec: poll addr reads 0 → blocked. assert_eq!(gpu.execute_one(&mut mem), ExecOutcome::Blocked); assert_eq!(gpu.ring.read_offset_dwords, 0, "read ptr must not advance while blocked"); // Make the wait satisfied. mem.write_u32(0x4000_1000, 0x42); match gpu.execute_one(&mut mem) { ExecOutcome::Stepped { dwords_consumed } => { // The WAIT_REG_MEM packet is 1 (header) + 5 (count) = 6 dwords. assert_eq!(dwords_consumed, 6); } other => panic!("expected Stepped after wait satisfied, got {:?}", other), } assert_eq!(gpu.ring.read_offset_dwords, 6); } #[test] fn mem_write_writes_all_payload_dwords() { let mut gpu = GpuSystem::new(); let mut mem = build_mem(); gpu.initialize_ring_buffer(0x4000_0000, 10); // MEM_WRITE: count=3 → 1 header + 1 dst + 2 data let hdr = (3u32 << 30) | ((3u32 - 1) << 16) | ((pm4::PM4_MEM_WRITE as u32) << 8); mem.write_u32(0x4000_0000, hdr); mem.write_u32(0x4000_0004, 0x4000_1000); // dst mem.write_u32(0x4000_0008, 0x1111_1111); mem.write_u32(0x4000_000C, 0x2222_2222); gpu.extend_write_ptr(4); assert!(matches!(gpu.execute_one(&mut mem), ExecOutcome::Stepped { .. })); assert_eq!(mem.read_u32(0x4000_1000), 0x1111_1111); assert_eq!(mem.read_u32(0x4000_1004), 0x2222_2222); } #[test] fn mmio_write_to_cp_rb_wptr_reflects_into_ring() { use std::sync::atomic::Ordering; let mut gpu = GpuSystem::new(); let mem = build_mem(); gpu.initialize_ring_buffer(0x4000_0000, 10); // Guest writes wptr=8 via MMIO. gpu.mmio.cp_rb_wptr.store(8, Ordering::Relaxed); // Before sync, ring has no pending work. assert!(!gpu.is_ready(&mem)); gpu.sync_with_mmio(); assert_eq!(gpu.ring.write_offset_dwords, 8); assert!(gpu.is_ready(&mem)); // After sync, rptr is mirrored back to mmio for the guest to read. assert_eq!(gpu.mmio.cp_rb_rptr.load(Ordering::Relaxed), 0); } /// End-to-end: feed two DRAW_INDX_2 packets through `execute_one` and /// verify the GPU system reports the expected `draws_seen` / `last_draw` /// state that the UI's Xenos pipeline consumes. Acts as the "draw /// dispatch integration" check mentioned in the P3 verification plan. #[test] fn successive_draws_accumulate_in_stats() { let mut gpu = GpuSystem::new(); let mut mem = build_mem(); gpu.initialize_ring_buffer(0x4000_0000, 10); let mk_draw = |addr: u32, vgt: u32, mem: &xenia_memory::GuestMemory| { let hdr = (3u32 << 30) | ((1u32 - 1) << 16) | ((pm4::PM4_DRAW_INDX_2 as u32) << 8); mem.write_u32(addr, hdr); mem.write_u32(addr + 4, vgt); }; // Draw #1: TriangleList, 6 verts. mk_draw(0x4000_0000, (6u32 << 16) | (2 << 6) | 4, &mut mem); // Draw #2: TriangleStrip, 4 verts. mk_draw(0x4000_0008, (4u32 << 16) | (2 << 6) | 6, &mut mem); gpu.extend_write_ptr(4); assert!(matches!(gpu.execute_one(&mut mem), ExecOutcome::Stepped { .. })); assert!(matches!(gpu.execute_one(&mut mem), ExecOutcome::Stepped { .. })); assert_eq!(gpu.stats.draws_seen, 2); let ds = gpu.last_draw.expect("last_draw set"); assert_eq!(ds.primitive, crate::draw_state::PrimitiveType::TriangleStrip); assert_eq!(ds.vertex_count, 4); } #[test] fn draw_indx_2_captures_last_draw() { let mut gpu = GpuSystem::new(); let mut mem = build_mem(); gpu.initialize_ring_buffer(0x4000_0000, 10); // DRAW_INDX_2 packet with 1 payload dword = vgt_draw_initiator: // prim=4 (TriangleList), source=2 (auto), count=3 verts. let vgt = (3u32 << 16) | (2 << 6) | 4; let hdr = (3u32 << 30) | ((1u32 - 0) << 16) | ((pm4::PM4_DRAW_INDX_2 as u32) << 8); // count field in header = (total_dwords_after - 1); we have 1 payload = count=1 → encoded 0. let hdr = (hdr & !0x3FFF_0000) | ((1u32 - 1) << 16); mem.write_u32(0x4000_0000, hdr); mem.write_u32(0x4000_0004, vgt); gpu.extend_write_ptr(2); assert!(matches!( gpu.execute_one(&mut mem), ExecOutcome::Stepped { .. } )); assert_eq!(gpu.stats.draws_seen, 1); let ds = gpu.last_draw.expect("last_draw set"); assert_eq!( ds.primitive, crate::draw_state::PrimitiveType::TriangleList ); assert_eq!(ds.vertex_count, 3); let p = gpu.last_primitive.as_ref().expect("last_primitive set"); assert_eq!(p.topology, crate::primitive::HostTopology::TriangleList); assert!(!p.rejected); } /// P3b M1: IM_LOAD_IMMEDIATE must set `active_vs_key` / `active_ps_key` /// based on `shader_type`, and a subsequent DRAW_INDX must carry those /// P8: shader-blob FIFO evicts the oldest non-active blob when the /// cache crosses `SHADER_BLOB_CAP`. Active keys are protected. #[test] fn shader_blob_cap_evicts_oldest() { let mut gpu = GpuSystem::new(); gpu.active_vs_key = Some(u32::MAX); // Insert unique keys (starting at 1_000 to avoid colliding with // the active-key sentinel) up to `CAP + 10`; every insert fires // the eviction path once len > CAP. gpu.insert_shader_blob( u32::MAX, ShaderBlob { shader_type: 0, dwords: vec![0xAA; 4], }, ); let first_key = 1_000u32; for k in first_key..(first_key + SHADER_BLOB_CAP as u32 + 10) { gpu.insert_shader_blob( k, ShaderBlob { shader_type: 0, dwords: vec![k; 2], }, ); } assert!(gpu.shader_blobs.len() <= SHADER_BLOB_CAP); // Active key (u32::MAX) must still be present. assert!(gpu.shader_blobs.contains_key(&u32::MAX)); // Earliest non-active key must have been evicted (at least one of // the first 10 we inserted is gone). let evicted = (first_key..first_key + 10) .filter(|k| !gpu.shader_blobs.contains_key(k)) .count(); assert!( evicted > 0, "expected at least one of the first 10 keys to be evicted, \ got shader_blobs.len() = {}", gpu.shader_blobs.len() ); } /// `IM_LOAD_IMMEDIATE` uploads a vertex + pixel shader inline; draw /// state must then carry whichever keys the executor minted. With the /// content-hashed key scheme, vs and ps keys differ because their /// microcode bytes differ — the concrete values are derived, so the /// test just asserts both are non-zero and not equal. #[test] fn im_load_records_active_blob_and_draw_carries_it() { let mut gpu = GpuSystem::new(); let mut mem = build_mem(); gpu.initialize_ring_buffer(0x4000_0000, 10); // PM4_IM_LOAD_IMMEDIATE VS: 4 data dwords = shader_type + start_size // + 2 code. Header count field = data_count - 1 = 3. let hdr_vs = (3u32 << 30) | ((4u32 - 1) << 16) | ((pm4::PM4_IM_LOAD_IMMEDIATE as u32) << 8); mem.write_u32(0x4000_0000, hdr_vs); mem.write_u32(0x4000_0004, 0); // shader_type=0 (vertex) mem.write_u32(0x4000_0008, 2); // start_size: size=2 mem.write_u32(0x4000_000C, 0xAAAA_AAAA); mem.write_u32(0x4000_0010, 0xBBBB_BBBB); // Second IM_LOAD_IMMEDIATE PS (shader_type=1); 5 dwords total. let hdr_ps = (3u32 << 30) | ((4u32 - 1) << 16) | ((pm4::PM4_IM_LOAD_IMMEDIATE as u32) << 8); mem.write_u32(0x4000_0014, hdr_ps); mem.write_u32(0x4000_0018, 1); // shader_type=1 (pixel) mem.write_u32(0x4000_001C, 2); mem.write_u32(0x4000_0020, 0xCCCC_CCCC); mem.write_u32(0x4000_0024, 0xDDDD_DDDD); // DRAW_INDX_2: 1 data dword, count field = 0. let vgt = (3u32 << 16) | (2 << 6) | 4; let hdr_draw = (3u32 << 30) | (0u32 << 16) | ((pm4::PM4_DRAW_INDX_2 as u32) << 8); mem.write_u32(0x4000_0028, hdr_draw); mem.write_u32(0x4000_002C, vgt); // Total dwords consumed by the 3 packets: 5 + 5 + 2 = 12. gpu.extend_write_ptr(12); // Drain all three packets. for _ in 0..3 { assert!(matches!( gpu.execute_one(&mut mem), ExecOutcome::Stepped { .. } )); } let vs_key = gpu.active_vs_key.expect("vs key set by IM_LOAD_IMMEDIATE"); let ps_key = gpu.active_ps_key.expect("ps key set by IM_LOAD_IMMEDIATE"); assert_ne!(vs_key, ps_key, "VS and PS keys must be distinct"); let ds = gpu.last_draw.expect("DRAW_INDX_2 captured"); assert_eq!(ds.vs_blob_key, Some(vs_key)); assert_eq!(ds.ps_blob_key, Some(ps_key)); } /// Regression: before the content-hash keying, two distinct vertex /// shaders uploaded via `IM_LOAD_IMMEDIATE` both mapped to key `0` /// (the shader_type dword) and overwrote each other in `shader_blobs`. /// With FNV-1a over the microcode, different blobs get different keys /// and the cache retains both. #[test] fn im_load_immediate_distinct_microcode_does_not_collide() { let mut gpu = GpuSystem::new(); let mut mem = build_mem(); gpu.initialize_ring_buffer(0x4000_0000, 10); let hdr = (3u32 << 30) | ((4u32 - 1) << 16) | ((pm4::PM4_IM_LOAD_IMMEDIATE as u32) << 8); // VS shader A. mem.write_u32(0x4000_0000, hdr); mem.write_u32(0x4000_0004, 0); // shader_type = vertex mem.write_u32(0x4000_0008, 2); // size = 2 mem.write_u32(0x4000_000C, 0x1111_1111); mem.write_u32(0x4000_0010, 0x2222_2222); // VS shader B — same stage, different microcode. mem.write_u32(0x4000_0014, hdr); mem.write_u32(0x4000_0018, 0); mem.write_u32(0x4000_001C, 2); mem.write_u32(0x4000_0020, 0x3333_3333); mem.write_u32(0x4000_0024, 0x4444_4444); gpu.extend_write_ptr(10); for _ in 0..2 { assert!(matches!( gpu.execute_one(&mut mem), ExecOutcome::Stepped { .. } )); } assert_eq!( gpu.shader_blobs.len(), 2, "two distinct VS shaders must not collide on the same cache key" ); } /// P4: `EVENT_WRITE` with initiator `15` (TILE_FLUSH) must route /// through the resolve handler — captured `last_resolve` + incremented /// `stats.resolves_total` proves the dispatch works. #[test] fn tile_flush_event_records_resolve() { let mut gpu = GpuSystem::new(); let mut mem = build_mem(); gpu.initialize_ring_buffer(0x4000_0000, 10); // Pre-seed RB_COPY_DEST_BASE / DEST_PITCH / DEST_INFO so // ResolveInfo captures recognisable values. gpu.register_file .write(draw_state::reg::RB_COPY_DEST_BASE, 0xDEAD_0000); gpu.register_file.write( draw_state::reg::RB_COPY_DEST_PITCH, (720u32 << 16) | 1280u32, ); // copy_dest_format=6 (k_8_8_8_8), copy_dest_endian=0. gpu.register_file .write(draw_state::reg::RB_COPY_DEST_INFO, 6u32 << 7); gpu.register_file.write( draw_state::reg::RB_COPY_CONTROL, (1u32 << 20) /* copy_command=1 */ | (1u32 << 8), /* color_clear_enable */ ); // PM4_EVENT_WRITE: 1 data dword — the initiator. let hdr = (3u32 << 30) | (0u32 << 16) | ((pm4::PM4_EVENT_WRITE as u32) << 8); mem.write_u32(0x4000_0000, hdr); mem.write_u32(0x4000_0004, 15); // TILE_FLUSH gpu.extend_write_ptr(2); assert!(matches!( gpu.execute_one(&mut mem), ExecOutcome::Stepped { .. } )); assert_eq!(gpu.stats.resolves_total, 1); let info = gpu.last_resolve.expect("TILE_FLUSH captured resolve"); // `0xDEAD_0000 & 0x1FFF_FFFF = 0x1EAD_0000` — `dest_base` is now // masked to the Xenon 29-bit physical range at decode time. assert_eq!(info.dest_base, 0x1EAD_0000); assert_eq!(info.dest_pitch_pixels, 1280); assert_eq!(info.dest_height_pixels, 720); assert_eq!(info.dest_format, 6); assert_eq!(info.copy_command, 1); assert!(info.color_clear_enable); } /// P4: DRAW_INDX* with a bound color target should populate /// `rt_cache` so downstream stages (HUD, resolve) can look up the RT. #[test] fn draw_indx_populates_rt_cache() { let mut gpu = GpuSystem::new(); let mut mem = build_mem(); gpu.initialize_ring_buffer(0x4000_0000, 10); // color0 enabled + format=2 (k_8_8_8_8 or similar), base=0x10. gpu.register_file.write(draw_state::reg::RB_MODECONTROL, 0x1); gpu.register_file .write(draw_state::reg::RB_COLOR_INFO_0, (2u32 << 16) | 0x10); // Non-zero scissor so pitch32 calc is meaningful. gpu.register_file.write( draw_state::reg::PA_SC_WINDOW_SCISSOR_BR, (720u32 << 16) | 1280u32, ); let vgt = (3u32 << 16) | (2 << 6) | 4; let hdr = (3u32 << 30) | (0u32 << 16) | ((pm4::PM4_DRAW_INDX_2 as u32) << 8); mem.write_u32(0x4000_0000, hdr); mem.write_u32(0x4000_0004, vgt); gpu.extend_write_ptr(2); assert!(matches!( gpu.execute_one(&mut mem), ExecOutcome::Stepped { .. } )); assert_eq!(gpu.rt_cache.len(), 1); assert_eq!(gpu.stats.unique_render_targets, 1); } #[test] fn xe_swap_records_notification() { let mut gpu = GpuSystem::new(); let mut mem = build_mem(); gpu.initialize_ring_buffer(0x4000_0000, 10); let hdr = (3u32 << 30) | ((4u32 - 1) << 16) | ((pm4::PM4_XE_SWAP as u32) << 8); mem.write_u32(0x4000_0000, hdr); mem.write_u32(0x4000_0004, pm4::SWAP_SIGNATURE); mem.write_u32(0x4000_0008, 0xCAFE_0000); mem.write_u32(0x4000_000C, 1280); mem.write_u32(0x4000_0010, 720); gpu.extend_write_ptr(5); assert!(matches!(gpu.execute_one(&mut mem), ExecOutcome::Stepped { .. })); let swap = gpu.last_swap.unwrap(); assert_eq!(swap.frame_index, 1); assert_eq!(swap.frontbuffer_phys, 0xCAFE_0000); assert_eq!(swap.width, 1280); assert_eq!(swap.height, 720); assert_eq!(gpu.stats.swaps_seen, 1); } }