Files
xenia-rs/crates/xenia-gpu/src/gpu_system.rs
MechaCat02 7a1b6b3306 fix(gpu): GPUBUG-DRAIN-001 — silence VdSwap PM4 fallback under --parallel
The Phase-C VdSwap PM4 ring path (commit 82f3d61) emits two
"PM4_XE_SWAP not consumed by drain" warnings when running:

  exec sylpheed.iso --ui --quiet --halt-on-deadlock \
    --parallel --reservations-table

Lockstep -n 100M never trips it. Two distinct race windows:

(a) Inline backend (--ui forces it): drain(mem, 4096) hit its
    fixed packet cap before reaching the PM4_XE_SWAP we'd just
    injected at the WPTR tail. With 6 CPU threads, the ring
    accumulates >4096 packets between vd_swap callbacks.

(b) Threaded backend (--parallel without --ui): the worker's
    DrainFence handler has a 900 ms deadline and game-batched
    IBs (8-10 M packets observed) keep it from reaching the
    tail in any reasonable budget. If the worker eventually
    drained past the injected packet later, the safety-net
    direct notify would double-count.

Three changes:

* gpu_system.rs: new `drain_until_wptr(target, time_budget)`
  draining by the canary `WorkerThreadMain` predicate
  (read_offset != target) instead of a fixed packet count.
  900 ms deadline mirrors the threaded DrainFence handler.

* handle.rs: inline `drain_to_current_wptr` switches to
  `drain_until_wptr`. DrainFence handler publishes the digest
  mirror BEFORE replying so the CPU's post-drain
  `digest_snapshot` sees fresh stats.

* exports.rs (vd_swap): skip the PM4 ring injection
  unconditionally and route swap notification through
  `notify_xe_swap` directly. Tail-injection is unreliable
  under --parallel for both backends. The slot-0
  fetch-constant patch is deferred (GPUBUG-FETCH-PATCH-001);
  draws=0 today so a stale slot 0 has no observable effect.

Verification:

* cargo test --workspace --release: 556 passing (unchanged).

* Lockstep -n 100M --stable-digest: bit-identical to
  pre-fix master HEAD aa3f1d3.
  {instructions:100000002, imports:987685, unimpl:0, draws:0,
   swaps:2, ...}

* check --parallel --reservations-table -n 30M: 0 warnings
  (was 2). swaps=2.

* exec --gpu-inline --parallel --reservations-table -n 30M:
  0 warnings (was 2 with drained=8M-10M observed). swaps=2.

Audit IDs: GPUBUG-DRAIN-001 (closed),
GPUBUG-FETCH-PATCH-001 (filed, deferred).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 17:12:15 +02:00

1824 lines
79 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
//! Xenos GPU system: register file + primary ring buffer + PM4 executor.
//!
//! Design notes mirror the approved plan's P2 slice:
//!
//! - Runs on the same host thread as the CPU interpreter. Sequential access
//! to `GuestMemory` — no locks, no sharing.
//! - One packet per [`GpuSystem::execute_one`] call. The scheduler calls this
//! once per round when `is_ready` returns true. When the packet is a
//! `WAIT_REG_MEM` whose condition isn't yet satisfied, we transition to
//! [`GpuState::Blocked`] and the scheduler will re-poll us.
//! - Non-draw opcodes execute for real (register/memory writes, event
//! writebacks). Draws (`DRAW_INDX*`, `XE_SWAP`) are classified but not
//! rendered yet; they surface state (via spans + the swap hook) for later
//! phases to consume.
//!
//! Opcode reference: `xenia-canary/src/xenia/gpu/pm4_command_processor_implement.h`.
use std::collections::HashMap;
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, AtomicU32, Ordering};
use std::time::{Duration, Instant};
use xenia_memory::MemoryAccess;
use crate::draw_state::{self, DrawState};
use crate::pm4::{self, PacketKind};
use crate::primitive::{self, ProcessedPrimitive};
use crate::register_file::RegisterFile;
use crate::ring_view::RingBufferView;
/// Cached Xenos microcode blob, produced by `PM4_IM_LOAD*` packets.
///
/// Instances live in `GpuSystem::shader_blobs`, whose size is bounded by
/// [`SHADER_BLOB_CAP`].
#[derive(Debug, Clone)]
pub struct ShaderBlob {
/// Shader stage selector: 0 = vertex, 1 = pixel.
pub shader_type: u8,
/// The microcode itself, stored as 32-bit words.
pub dwords: Vec<u32>,
}
/// P8 — upper bound on the shader-blob cache (`GpuSystem.shader_blobs`).
/// Canary uses a similar FIFO ceiling; our number is deliberately generous
/// because blobs are small (a few KiB each at most) and misses force a
/// re-upload from the guest the next time `IM_LOAD*` fires. 256 is enough
/// for every shipping game's peak working set, per canary's traces.
///
/// Enforced by `GpuSystem::insert_shader_blob`, which evicts oldest-first
/// and skips the currently-active vertex/pixel shader keys.
pub const SHADER_BLOB_CAP: usize = 256;
/// PM4 `WAIT_REG_MEM` comparison function, encoded in bits 0..=2 of the
/// packet's `wait_info` dword.
///
/// The numbering follows canary's `MatchValueAndRef`
/// (`xenia-canary/src/xenia/gpu/pm4_command_processor_implement.h`):
///
/// | bits 2:0 | meaning        |
/// |----------|----------------|
/// | 0        | never          |
/// | 1        | value <  ref   |
/// | 2        | value <= ref   |
/// | 3        | value == ref   |
/// | 4        | value != ref   |
/// | 5        | value >= ref   |
/// | 6        | value >  ref   |
/// | 7        | always         |
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum WaitCmp {
    /// Never matches — the wait can only end by external intervention.
    Never,
    /// value < ref
    Less,
    /// value <= ref
    LessEq,
    /// value == ref
    Equal,
    /// value != ref
    NotEqual,
    /// value >= ref
    GreaterEq,
    /// value > ref
    Greater,
    /// Always matches — the packet never parks the GPU.
    Always,
}

impl WaitCmp {
    /// Decode the comparison function from `wait_info`.
    ///
    /// Only the low three bits are inspected; the remaining bits (e.g. the
    /// register/memory selector) are ignored here.
    pub fn from_wait_info(wait_info: u32) -> Self {
        match wait_info & 0x7 {
            0 => WaitCmp::Never,
            1 => WaitCmp::Less,
            2 => WaitCmp::LessEq,
            3 => WaitCmp::Equal,
            4 => WaitCmp::NotEqual,
            5 => WaitCmp::GreaterEq,
            6 => WaitCmp::Greater,
            // The mask leaves 7 as the only remaining value.
            _ => WaitCmp::Always,
        }
    }

    /// Apply the comparison to an (already masked) polled `value` and the
    /// packet's `reference`. Returns `true` when the wait condition holds.
    pub fn evaluate(self, value: u32, reference: u32) -> bool {
        match self {
            WaitCmp::Never => false,
            WaitCmp::Less => value < reference,
            WaitCmp::LessEq => value <= reference,
            WaitCmp::Equal => value == reference,
            WaitCmp::NotEqual => value != reference,
            WaitCmp::GreaterEq => value >= reference,
            WaitCmp::Greater => value > reference,
            WaitCmp::Always => true,
        }
    }
}
/// Why the GPU is parked. Shaped like the CPU-side scheduler's
/// `BlockReason`. `WaitRegMem` is the only reason today; later phases are
/// expected to add more (interrupts, timestamp waits).
#[derive(Debug, Clone)]
pub enum GpuBlock {
    /// Parked on a `PM4_WAIT_REG_MEM` packet until the polled value passes
    /// the comparison.
    WaitRegMem {
        /// Register index, or guest memory address when `is_memory` is set.
        poll_addr: u32,
        /// `true` → poll guest memory; `false` → poll the register file.
        is_memory: bool,
        /// Right-hand side of the comparison.
        reference: u32,
        /// AND-mask applied to the polled value before comparing.
        mask: u32,
        /// Comparison function.
        cmp: WaitCmp,
    },
}

impl GpuBlock {
    /// Re-evaluate the wait condition against current memory / register
    /// state. `true` means the condition now holds and the GPU may resume.
    pub fn is_satisfied(&self, mem: &dyn MemoryAccess, reg_file: &RegisterFile) -> bool {
        match *self {
            GpuBlock::WaitRegMem {
                poll_addr,
                is_memory,
                reference,
                mask,
                cmp,
            } => {
                let observed = if is_memory {
                    mem.read_u32(poll_addr)
                } else {
                    reg_file.read(poll_addr)
                };
                cmp.evaluate(observed & mask, reference)
            }
        }
    }
}
/// Public notification the CP emits when the guest presents a frame. The
/// kernel `vd_swap` path gates on this to push a `SwapInfo` to the host UI.
#[derive(Debug, Clone, Copy, Default)]
pub struct SwapNotification {
/// Value of `GpuSystem::swap_counter` right after this swap was counted
/// (so the first swap reports 1).
pub frame_index: u64,
/// Frontbuffer address supplied with the swap (guest-physical, judging by
/// the name — confirm against the `vd_swap` caller).
pub frontbuffer_phys: u32,
/// Frontbuffer width as supplied by the swap source (presumably pixels).
pub width: u32,
/// Frontbuffer height as supplied by the swap source (presumably pixels).
pub height: u32,
}
/// GPU interrupt (fired by `PM4_INTERRUPT`). The kernel dispatches this to
/// the guest callback registered by `VdSetGraphicsInterruptCallback`.
///
/// `GpuSystem::notify_xe_swap` also queues one of these (source `Swap`) for
/// every presented frame.
#[derive(Debug, Clone, Copy)]
pub struct PendingInterrupt {
/// What raised the interrupt.
pub source: InterruptSource,
/// Target-CPU bitmask. Swap interrupts are queued with `0x1`; the exact
/// bit-to-core mapping is decided by the consumer — confirm there.
pub cpu_mask: u32,
}
/// Origin of a [`PendingInterrupt`].
#[derive(Debug, Clone, Copy)]
pub enum InterruptSource {
/// Raised from the command stream (presumably the `PM4_INTERRUPT` handler,
/// which is outside this view — confirm).
CommandProcessor,
/// Raised when a frame is presented; see `GpuSystem::notify_xe_swap`.
Swap,
}
/// Per-run counters for observability.
#[derive(Debug, Clone, Default)]
pub struct GpuStats {
/// Packets stepped by `GpuSystem::execute_one` (bumped once per
/// `ExecOutcome::Stepped`, including indirect-buffer jumps).
pub packets_executed: u64,
/// Draw packets observed (updated by the draw handlers, which are outside
/// this view).
pub draws_seen: u64,
/// Frames presented; bumped by `GpuSystem::notify_xe_swap`.
pub swaps_seen: u64,
/// Interrupts raised (updated outside this view — confirm which sources
/// are counted).
pub interrupts_emitted: u64,
/// Times a `WAIT_REG_MEM` packet parked the GPU because its condition did
/// not hold yet.
pub wait_reg_mem_blocks: u64,
/// `PM4_INDIRECT_BUFFER[_PFD]` packets followed.
pub indirect_buffer_jumps: u64,
/// P4: count of EDRAM→memory resolves triggered by `TILE_FLUSH` events
/// (event code 15). Non-zero means the game is committing rendered
/// pixels to the frontbuffer / a texture.
pub resolves_total: u64,
/// Resolves whose byte copy path ran and wrote at least one sample to
/// guest memory. Delta against `resolves_total` indicates how many
/// resolves were skipped for an unsupported format / MSAA mode / 3D
/// destination (supported resolves that wrote zero samples are counted in
/// neither this nor `resolves_skipped_total`).
pub resolves_copied_total: u64,
/// Resolves that were skipped by [`crate::resolve::copy_to_memory`] due
/// to an unsupported format path. Logged at `warn` so the reason is
/// visible.
pub resolves_skipped_total: u64,
/// Total number of 32bpp samples written into guest memory across all
/// successful resolves. Useful for sanity-checking that a big splash
/// frame actually made it out (e.g. 1280×720 = 921_600 samples).
pub resolve_samples_written: u64,
/// P4: unique render-target keys seen (as managed by the internal
/// `RenderTargetCache`). Useful HUD metric for multi-target workloads.
pub unique_render_targets: u64,
}
/// Result of one packet step (see `GpuSystem::execute_one`).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ExecOutcome {
/// Consumed one packet; GPU remains Ready. `dwords_consumed` counts the
/// header too; it is 0 for an indirect-buffer jump, because that handler
/// advances the calling ring itself before switching to the IB.
Stepped { dwords_consumed: u32 },
/// Nothing to do right now.
Idle,
/// Parked on a sync primitive; `GpuSystem::pending_block` has details.
Blocked,
}
// Register-index bases of the shader-constant banks.
//
// NOTE(review): the code that consumes these bases is outside this view.
// Descriptions marked "presumably" are inferred from the names and the
// gaps between the values — confirm against the `PM4_SET_CONSTANT` handler.

/// Base of the ALU constant bank (presumably the float shader constants).
pub const CONST_BASE_ALU: u32 = 0x4000;
/// Shader fetch constants: the game uploads them via `PM4_SET_CONSTANT` type=1
/// into a 256-dword region. Games then reference them by index when binding
/// textures / vertex buffers.
pub const CONST_BASE_FETCH: u32 = 0x4800;
/// Base of the boolean constant bank (presumably; 8 dwords up to
/// `CONST_BASE_LOOP`).
pub const CONST_BASE_BOOL: u32 = 0x4900;
/// Base of the loop constant bank (presumably).
pub const CONST_BASE_LOOP: u32 = 0x4908;
/// Base used for plain register writes (presumably the context-register
/// window — confirm).
pub const CONST_BASE_REGISTERS: u32 = 0x2000;
/// Atomic mailbox for the few GPU registers that cross the MMIO boundary.
///
/// Guest stores into the `0x7FC80000` register aperture are routed through
/// [`crate::mmio_region`] and end up in these atomics. `execute_one` and the
/// scheduler's per-round GPU hook sample them to refresh
/// `ring.write_offset_dwords`, publish read progress back to the guest, and
/// so on.
///
/// Only the registers listed here need atomic cross-thread access; the rest
/// of the register state lives in [`GpuSystem::register_file`], which is
/// CPU-thread-local. Every field is an `Arc`, so cloning the mailbox yields
/// a second handle onto the same storage.
#[derive(Debug, Clone, Default)]
pub struct GpuMmio {
    /// `CP_RB_WPTR` — dword offset of the ring write pointer, written by the
    /// guest.
    pub cp_rb_wptr: Arc<AtomicU32>,
    /// `CP_RB_RPTR` — mirror of our internal `ring.read_offset_dwords`, so a
    /// guest polling the register observes progress.
    pub cp_rb_rptr: Arc<AtomicU32>,
    /// `CP_INT_STATUS` — a bit is set while an interrupt is pending.
    pub cp_int_status: Arc<AtomicU32>,
    /// `CP_INT_ACK` — the guest clears the bit once it has handled the
    /// interrupt.
    pub cp_int_ack: Arc<AtomicU32>,
    /// `D1MODE_VBLANK_VLINE_STATUS` (register `0x1951`, MMIO offset
    /// `0x6544`). Bit 0 is `VBLANK_INT_OCCURRED`: the GPU sets it when
    /// vblank fires and the guest clears it with write-1-to-clear.
    ///
    /// Sylpheed's vsync callback gates *all* of its work on seeing bit 0 set
    /// (`lwz; rlwinm. r,r,0,31,31; bc 12,2,skip`). If the bit does not
    /// toggle across vsyncs the callback always skips, the PKEVENT feeding
    /// the render dispatcher (user_data + 0x3B28) is never signaled, and the
    /// worker loops forever.
    pub d1mode_vblank_vline_status: Arc<AtomicU32>,
    /// M1.7 parker flag. Producers (guest WPTR writes, shutdown) raise it so
    /// the GPU worker thread does not park while work is pending. The worker
    /// swaps it to `false` when it starts its park decision and then
    /// re-checks its predicates; a producer that races between that swap and
    /// the actual `park_timeout` still wins, because its `unpark()` makes
    /// the park return immediately (std's token semantics). Inline mode
    /// never reads the flag, so it only costs one extra atomic store per
    /// WPTR write there.
    pub wake_pending: Arc<AtomicBool>,
    /// Handle to the GPU worker thread, filled in by `GpuWorker::run` at
    /// startup. The `CP_RB_WPTR` MMIO write callback `unpark()`s it after
    /// each guest WPTR write so the worker continues without waiting out its
    /// `park_timeout`. Stays `None` in inline mode (no worker exists), where
    /// the unpark site degenerates to a single mutex lock.
    pub worker_thread: Arc<std::sync::Mutex<Option<std::thread::Thread>>>,
}

impl GpuMmio {
    /// Fresh mailbox: every register mirror reads 0, `wake_pending` is
    /// clear, and no worker thread is registered.
    pub fn new() -> Self {
        Self::default()
    }
}
/// Live GPU system. One instance per `KernelState`.
///
/// Bundles the register file, the ring currently being executed (plus the
/// stack of rings suspended by indirect-buffer jumps), and the caches and
/// counters the packet handlers update.
pub struct GpuSystem {
/// Register state written by Type-0/Type-1 packets and read by the
/// draw/resolve decoders.
pub register_file: RegisterFile,
/// The ring currently being executed: the primary ring, or an
/// indirect-buffer view while an IB is active (see `ib_stack`).
pub ring: RingBufferView,
/// Stack of saved rings for `PM4_INDIRECT_BUFFER` nesting. The active
/// ring is always `ring`; when an IB packet arrives, we push `ring` onto
/// this stack and replace `ring` with the IB view. On IB completion
/// (read pointer catches up to size), we pop.
ib_stack: Vec<RingBufferView>,
/// Cached shader blobs keyed by the raw CP register address that loaded them.
pub shader_blobs: HashMap<u32, ShaderBlob>,
/// P8 — FIFO of blob keys for bounded eviction. On `IM_LOAD*` the
/// new key is pushed to the back; if the blob count exceeds
/// [`SHADER_BLOB_CAP`], the front is popped and removed from
/// `shader_blobs`. Prevents long-running guests from growing the
/// cache without bound. The two *active* keys (`active_vs_key` +
/// `active_ps_key`) are never evicted — `insert_shader_blob` rotates
/// them to the back of the queue instead of removing them.
pub shader_blob_order: std::collections::VecDeque<u32>,
/// Monotonic frame counter (bumped on `PM4_XE_SWAP`).
pub swap_counter: u64,
/// Most recent swap notification; the kernel polls this after `execute_one`
/// to decide whether to push a UI swap event.
pub last_swap: Option<SwapNotification>,
/// Queue of interrupts not yet delivered to the guest. Private so that
/// callers go through [`Self::take_pending_interrupts`] — M1 step 6
/// then redirects this drain into a `crossbeam_channel::Sender` without
/// re-touching every call site.
pending_interrupts: Vec<PendingInterrupt>,
/// Current stall reason, if any.
pub pending_block: Option<GpuBlock>,
/// Running observability counters.
pub stats: GpuStats,
/// For the 64-bit bin mask/select we split hi/lo writes.
pub bin_mask: u64,
pub bin_select: u64,
/// Shared-atomic mailbox for the `0x7FC80000` MMIO aperture. Cloned into
/// the [`crate::mmio_region`] callbacks; clone-ownership keeps the bus
/// side and the executor side in sync without locks.
pub mmio: GpuMmio,
/// Most recent draw state extracted at a `PM4_DRAW_INDX*` packet. The
/// uber-shader pipeline in P3+ reads this to build its wgpu draw call.
pub last_draw: Option<DrawState>,
/// Most recent processed primitive — index rewrite + host topology
/// decision. Separate from `last_draw` because its `rewritten_indices`
/// may be large and callers may want to drop it after consumption.
pub last_primitive: Option<ProcessedPrimitive>,
/// Key in `shader_blobs` of the currently-active vertex shader. Set by
/// `PM4_IM_LOAD[_IMMEDIATE]` for `ShaderType::kVertex` and read at
/// `PM4_DRAW_INDX*` time so the host side can upload the matching
/// microcode bytes before dispatching.
pub active_vs_key: Option<u32>,
/// Key in `shader_blobs` of the currently-active pixel shader. Set by
/// `PM4_IM_LOAD[_IMMEDIATE]` for `ShaderType::kPixel`.
pub active_ps_key: Option<u32>,
/// P4: EDRAM tile-ownership bookkeeping + per-key RT descriptors. Updated
/// at each `PM4_DRAW_INDX*` (`bind` + `claim_tiles`) and queried by
/// `TILE_FLUSH` event handling to decide resolve sources.
pub rt_cache: crate::render_target_cache::RenderTargetCache,
/// P4: most recent `ResolveInfo` observed when `TILE_FLUSH` fired. The UI
/// bridge surfaces this in the HUD so users can tell when a game is
/// resolving to the frontbuffer versus an off-screen target.
pub last_resolve: Option<crate::draw_state::ResolveInfo>,
/// P5: CPU-side decoded-texture cache (shared across draws within a
/// frame; trimmed implicitly by insertion). `ensure_cached` hits this
/// on every texture-fetch resolution; the UI thread sees the decoded
/// bytes via `UiBridge::publish_texture`.
pub texture_cache: crate::texture_cache::TextureCache,
/// 10 MiB shadow of the Xenos EDRAM. Written by clear-resolves and
/// (future) host-render-target readback; read by the resolve byte-copy
/// path that writes tiled pixels into guest memory. Allocated once at
/// `GpuSystem::new` and lives for the whole GPU lifetime — no
/// per-frame churn.
pub edram: crate::edram::ShadowEdram,
}
impl GpuSystem {
/// Build an idle GPU: default-constructed register file and ring view, no
/// indirect buffers on the stack, empty caches and queues, zeroed counters,
/// and a fresh MMIO mailbox. The ring stays unusable until
/// `initialize_ring_buffer` supplies its base and size.
pub fn new() -> Self {
Self {
register_file: RegisterFile::new(),
ring: RingBufferView::new(),
ib_stack: Vec::new(),
shader_blobs: HashMap::new(),
// One slot of headroom: a new key is queued before eviction trims back
// down to the cap.
shader_blob_order: std::collections::VecDeque::with_capacity(SHADER_BLOB_CAP + 1),
swap_counter: 0,
last_swap: None,
pending_interrupts: Vec::new(),
pending_block: None,
stats: GpuStats::default(),
bin_mask: 0,
bin_select: 0,
mmio: GpuMmio::new(),
last_draw: None,
last_primitive: None,
active_vs_key: None,
active_ps_key: None,
rt_cache: crate::render_target_cache::RenderTargetCache::new(),
last_resolve: None,
texture_cache: crate::texture_cache::TextureCache::new(),
edram: crate::edram::ShadowEdram::new(),
}
}
/// P8 — store `blob` under `key`, then trim the cache back down to
/// [`SHADER_BLOB_CAP`] entries so long-running games cannot grow
/// `shader_blobs` without bound.
///
/// A key seen for the first time is appended to the FIFO; re-uploading an
/// existing key replaces the blob but keeps its queue position. Eviction
/// walks the FIFO from the front. The active VS/PS keys are never removed —
/// when one reaches the front it is rotated to the back and the walk
/// continues.
fn insert_shader_blob(&mut self, key: u32, blob: ShaderBlob) {
    // `insert` returns the displaced value, so `None` means a brand-new key.
    let first_sighting = self.shader_blobs.insert(key, blob).is_none();
    if first_sighting {
        self.shader_blob_order.push_back(key);
        metrics::counter!("gpu.shader.blob_seen").increment(1);
    }

    while self.shader_blobs.len() > SHADER_BLOB_CAP {
        // Visit each queued key at most once per eviction attempt.
        let mut victim = None;
        let mut budget = self.shader_blob_order.len();
        while budget > 0 {
            budget -= 1;
            let Some(oldest) = self.shader_blob_order.pop_front() else {
                break;
            };
            let pinned =
                self.active_vs_key == Some(oldest) || self.active_ps_key == Some(oldest);
            if pinned {
                self.shader_blob_order.push_back(oldest);
            } else {
                self.shader_blobs.remove(&oldest);
                victim = Some(oldest);
                break;
            }
        }

        match victim {
            Some(_) => metrics::counter!("gpu.shader.blob_evicted").increment(1),
            // Everything still queued is active — nothing can be evicted.
            None => break,
        }
    }
}
/// P4: event dispatch helper called by every `PM4_EVENT_WRITE*` variant.
///
/// `event_code` is the low 6 bits of the initiator word (see canary's
/// `xenos::Event` enum). Only code 15, `TILE_FLUSH` — the resolve trigger —
/// is acted on; every other code returns immediately.
///
/// For `TILE_FLUSH` the live `RB_*` register state is decoded into a
/// [`ResolveInfo`], any enabled colour/depth clear is painted into the
/// shadow EDRAM, and the source render target is then copied into guest
/// memory at `RB_COPY_DEST_BASE`. That final copy is what lets `VdSwap` see
/// non-empty front-buffer pixels — see
/// `memory/project_xenia_rs_edram_resolve_gap.md` for the history of this
/// path.
fn handle_event_initiator(&mut self, event_code: u32, mem: &dyn MemoryAccess) {
    const EVENT_TILE_FLUSH: u32 = 15;
    if event_code != EVENT_TILE_FLUSH {
        return;
    }

    let info =
        draw_state::ResolveInfo::from_register_file_and_memory(&self.register_file, mem);
    self.stats.resolves_total += 1;
    metrics::counter!(
        "gpu.resolve",
        "src" => info.copy_src_select.to_string(),
        "fmt" => info.dest_format.to_string(),
        "cmd" => info.copy_command.to_string(),
    )
    .increment(1);
    tracing::info!(
        src = info.copy_src_select,
        dst_base = format_args!("{:#010x}", info.dest_base),
        w = info.coords.width,
        h = info.coords.height,
        pitch = info.dest_pitch_pixels,
        fmt = info.dest_format,
        endian = info.dest_endian,
        clear_color = info.color_clear_enable,
        clear_depth = info.depth_clear_enable,
        "gpu: TILE_FLUSH resolve"
    );

    // Resolve rectangle in *sample* coordinates: pixel coordinates scaled up
    // by `sample_count_log2_x/y` (the identity for 1x MSAA, e.g. Sylpheed).
    // Kept lazy so it is only evaluated when a clear actually runs.
    let sample_rect = || {
        let c = &info.coords;
        (
            c.x0 << c.sample_count_log2_x,
            c.y0 << c.sample_count_log2_y,
            c.width << c.sample_count_log2_x,
            c.height << c.sample_count_log2_y,
        )
    };

    // Clears are painted *before* the copy. Games often issue
    // clear-then-resolve as a single TILE_FLUSH: the clear part fills EDRAM
    // with `RB_COLOR_CLEAR`, and that is what the copy part then reads.
    let clears_color = info.color_clear_enable
        && matches!(info.source, draw_state::ResolveSource::Color(_))
        && info.surface_pitch_tiles > 0;
    if clears_color {
        let (sx, sy, sw, sh) = sample_rect();
        if info.source_is_64bpp {
            // 64bpp clears use both `RB_COLOR_CLEAR_LO` (low 32 bits) and
            // `RB_COLOR_CLEAR` (high 32 bits) per Canary
            // `draw_util.cc:1302-1303`.
            self.edram.fill_rect_64bpp(
                info.source_base_tiles,
                info.surface_pitch_tiles,
                sx,
                sy,
                sw,
                sh,
                info.color_clear_value_lo,
                info.color_clear_value,
            );
        } else {
            // 32bpp clears ignore the lo word entirely.
            self.edram.fill_rect_32bpp(
                info.source_base_tiles,
                info.surface_pitch_tiles,
                sx,
                sy,
                sw,
                sh,
                info.color_clear_value,
            );
        }
    }

    let clears_depth = info.depth_clear_enable && info.surface_pitch_tiles > 0;
    if clears_depth {
        let (sx, sy, sw, sh) = sample_rect();
        // Depth tiles live at RB_DEPTH_INFO.depth_base regardless of which
        // source this resolve selects.
        let depth_info_reg = self.register_file.read(draw_state::reg::RB_DEPTH_INFO);
        let depth_base = (depth_info_reg & 0xFFF) as u16;
        self.edram.fill_rect_32bpp(
            depth_base,
            info.surface_pitch_tiles,
            sx,
            sy,
            sw,
            sh,
            info.depth_clear_value,
        );
    }

    // Byte copy into guest memory, then account for the outcome.
    let copy = crate::resolve::copy_to_memory(&info, &self.edram, mem);
    if !copy.supported {
        self.stats.resolves_skipped_total += 1;
    } else if copy.samples_written > 0 {
        self.stats.resolves_copied_total += 1;
        self.stats.resolve_samples_written += copy.samples_written as u64;
    }

    self.last_resolve = Some(info);
}
/// Sync state with the MMIO atomic mailbox. Call once at the top of the
/// per-round GPU hook: the guest may have advanced `CP_RB_WPTR` since we
/// last ran, and we in turn reflect our read pointer back to the mirror
/// register so the guest sees progress.
///
/// `CP_RB_WPTR` / `CP_RB_RPTR` describe the *primary* ring. While an
/// indirect buffer is executing, `self.ring` is the IB view and the primary
/// ring is the bottom entry of `ib_stack`, so the sync is applied there.
/// Applying it to `self.ring` would overwrite the IB's write offset with
/// `WPTR % ib_size` and publish the IB-relative read offset as
/// `CP_RB_RPTR`.
///
/// GPUBUG-006: under `--parallel`, the producer (the guest CP_RB_WPTR MMIO
/// write) uses `Release` to publish prior ring-memory writes; the consumer
/// here must `Acquire`-load to pair correctly. With a Relaxed load,
/// ring-memory writes the guest performed before bumping WPTR could be
/// reordered past our subsequent reads — leading to garbage PM4 packet
/// contents. The producer side at `mmio_region.rs:78` already uses Release.
/// Symmetrically, the RPTR mirror store publishes our read progress to the
/// guest and uses Release.
pub fn sync_with_mmio(&mut self) {
    let wptr_dwords = self.mmio.cp_rb_wptr.load(Ordering::Acquire);

    // Bottom of the IB stack when nested, otherwise the active ring.
    let primary = self.ib_stack.first_mut().unwrap_or(&mut self.ring);
    if primary.size_dwords != 0 && wptr_dwords != primary.write_offset_dwords {
        primary.write_offset_dwords = wptr_dwords % primary.size_dwords;
    }
    let rptr_dwords = primary.read_offset_dwords;

    // Mirror our read pointer (Release pairs with any guest-side
    // Acquire-load of CP_RB_RPTR for ring writeback bookkeeping).
    self.mmio.cp_rb_rptr.store(rptr_dwords, Ordering::Release);
}
/// True iff `execute_one` is expected to make progress without blocking.
///
/// * Parked on a wait → ready exactly when the wait condition now holds.
/// * Otherwise → ready when the active ring has unread packets, **or** when
///   the active ring is exhausted but a ring suspended on the
///   indirect-buffer stack still has some. `execute_one` pops back to the
///   caller in that case, so reporting "not ready" would leave the GPU
///   stranded inside a finished indirect buffer.
pub fn is_ready(&self, mem: &dyn MemoryAccess) -> bool {
    match &self.pending_block {
        Some(block) => block.is_satisfied(mem, &self.register_file),
        None => {
            self.ring.has_pending() || self.ib_stack.iter().any(|ring| ring.has_pending())
        }
    }
}
/// Execute exactly one PM4 packet.
///
/// Returns
/// * [`ExecOutcome::Blocked`] while a sync primitive stalls the GPU (either
///   an earlier wait that still does not hold, or the packet decoded now),
/// * [`ExecOutcome::Idle`] when neither the active ring nor any ring
///   suspended by an indirect-buffer jump has unread packets,
/// * [`ExecOutcome::Stepped`] otherwise, carrying the number of dwords
///   consumed (counting the header).
///
/// Finished indirect buffers are unwound in a loop: if several nested IBs
/// end at the same point, every exhausted level is popped until a ring with
/// pending packets is found, so `Idle` is never reported while an outer
/// ring still has work.
pub fn execute_one(&mut self, mem: &dyn MemoryAccess) -> ExecOutcome {
    // 0) If currently parked, probe the condition and either wake up or
    //    stay blocked.
    if let Some(block) = &self.pending_block {
        if !block.is_satisfied(mem, &self.register_file) {
            return ExecOutcome::Blocked;
        }
        tracing::debug!(?block, "gpu: wait satisfied — resuming");
        self.pending_block = None;
    }

    // 1) End of the current ring: resume the caller(s) of a finished
    //    indirect buffer, or report idle once the stack is empty.
    while !self.ring.has_pending() {
        match self.ib_stack.pop() {
            Some(caller) => self.ring = caller,
            None => return ExecOutcome::Idle,
        }
    }

    // 2) Fetch and decode the packet header at the read pointer.
    let header_addr = self
        .ring
        .addr_at_offset(0)
        .expect("a ring with pending packets maps its read offset");
    let header_word = mem.read_u32(header_addr);
    let packet = pm4::decode(header_word);
    tracing::trace!(
        header = format_args!("{header_word:#010x}"),
        addr = format_args!("{header_addr:#010x}"),
        ?packet.kind,
        "gpu: packet"
    );

    // 3) Dispatch. Each handler reports how many dwords to skip.
    let consumed = match packet.kind {
        PacketKind::Type0 { base_index, count, write_one } => {
            self.handle_type0(mem, base_index, count, write_one, packet.total_dwords)
        }
        PacketKind::Type1 { reg_index_1, reg_index_2 } => {
            self.handle_type1(mem, reg_index_1, reg_index_2)
        }
        PacketKind::Type2 => 1,
        PacketKind::Type3 {
            opcode,
            count,
            predicated,
        } => match self.handle_type3(mem, opcode, count, predicated, packet.total_dwords) {
            Type3Result::Consumed(n) => n,
            // The read pointer has not moved yet, so the resume path
            // re-reads this same packet; no explicit rewind is needed.
            Type3Result::Blocked { .. } => return ExecOutcome::Blocked,
        },
    };

    // 4) Retire the packet and publish progress.
    self.ring.advance_read(consumed);
    self.writeback_read_ptr(mem);
    self.stats.packets_executed += 1;
    ExecOutcome::Stepped {
        dwords_consumed: consumed,
    }
}
/// First-Pixels M2b — kernel-side commit point for `VdSwap`.
///
/// Before M2b the kernel's [`vd_swap`] wrote a synthetic `PM4_XE_SWAP`
/// packet at the guest-provided `buffer_ptr` and advanced our ring `wptr`
/// by 64 dwords, expecting the drain to pick it up. That was misaligned:
/// the drain reads forward from `ring.base + rptr * 4`, not from the game's
/// out-of-band `buffer_ptr`. 512 ring packets executed through 1 B guest
/// instructions while `swaps_seen` stayed at 0.
///
/// `VdSwap` is the kernel's commit point by definition, so the event does
/// not need to be laundered through the ring: the kernel-side handler calls
/// this directly. The `PM4_XE_SWAP` opcode path still works for the (rare)
/// game that emits the packet through its own ring writes.
///
/// Effects: bumps `stats.swaps_seen` and `swap_counter`, records the swap in
/// `last_swap`, and queues a `Swap` interrupt with CPU mask `0x1`.
pub fn notify_xe_swap(&mut self, frontbuffer_phys: u32, width: u32, height: u32) {
    self.stats.swaps_seen += 1;

    let frame_index = self.swap_counter.wrapping_add(1);
    self.swap_counter = frame_index;

    let notification = SwapNotification {
        frame_index,
        frontbuffer_phys,
        width,
        height,
    };
    self.last_swap = Some(notification);

    let interrupt = PendingInterrupt {
        source: InterruptSource::Swap,
        cpu_mask: 0x1,
    };
    self.pending_interrupts.push(interrupt);

    tracing::info!(
        frame = frame_index,
        fb = format_args!("{frontbuffer_phys:#010x}"),
        width,
        height,
        "gpu: XE_SWAP (kernel-direct)"
    );
}
/// Called by `VdInitializeRingBuffer` to hand us the primary ring.
///
/// `base` is the ring's start address and `size_log2` the log2 of its size
/// in bytes (clamped to 31 so the shift cannot overflow). Both ring offsets
/// are reset to 0.
pub fn initialize_ring_buffer(&mut self, base: u32, size_log2: u32) {
    let shift = size_log2.min(31);
    let size_bytes = 1u32 << shift;
    let size_dwords = size_bytes / 4;

    let ring = &mut self.ring;
    ring.base = base;
    ring.size_dwords = size_dwords;
    ring.read_offset_dwords = 0;
    // The write offset is driven by the guest; starting at 0 makes the ring
    // look empty until MMIO writes advance it.
    ring.write_offset_dwords = 0;

    tracing::info!(
        base = format_args!("{base:#010x}"),
        size_bytes,
        size_dwords,
        "gpu: ring initialized"
    );
}
/// Called by `VdEnableRingBufferRPtrWriteBack` to record where the guest
/// wants `read_offset_dwords` mirrored.
///
/// `addr` is the mirror location; `block_log2` is the log2 of the writeback
/// block size in dwords (clamped to 31 so the shift cannot overflow).
pub fn enable_rptr_writeback(&mut self, addr: u32, block_log2: u32) {
    let block_dwords = 1u32 << block_log2.min(31);

    self.ring.rptr_writeback_addr = addr;
    self.ring.rptr_writeback_block_dwords = block_dwords;

    tracing::info!(
        addr = format_args!("{addr:#010x}"),
        block_dwords,
        "gpu: rptr writeback enabled"
    );
}
/// Drain the pending-interrupt queue. The kernel calls this once per
/// scheduler round and queues each entry into `interrupts.queue_interrupt`.
///
/// M1 step 6 swaps the `Vec`-backed implementation for a
/// `crossbeam_channel::Sender<PendingInterrupt>`. Routing every external
/// reader through this single accessor in step 2 means that swap is a
/// localized change — no call site changes.
///
/// Returns the previously-queued interrupts and leaves the internal queue
/// empty. Cheap: `std::mem::take` moves the existing `Vec` out and leaves a
/// fresh empty `Vec` (which owns no heap allocation) in its place.
pub fn take_pending_interrupts(&mut self) -> Vec<PendingInterrupt> {
std::mem::take(&mut self.pending_interrupts)
}
/// True when the pending-interrupt queue has at least one entry. Lets
/// callers skip the `take_pending_interrupts` call (and the empty `Vec` it
/// would hand back) on scheduler rounds where nothing is queued.
pub fn has_pending_interrupts(&self) -> bool {
!self.pending_interrupts.is_empty()
}
/// Move the primary ring's logical write pointer to `new_wptr_dwords`.
///
/// The argument is the new *absolute* write offset in dwords (not a delta);
/// it is wrapped modulo the ring size. `VdSwap` reserves 64 dwords then
/// calls this; MMIO writes to `CP_RB_WPTR` will do the same in P2+.
///
/// The write pointer belongs to the primary ring. While an indirect buffer
/// is executing, `self.ring` is the IB view and the primary ring is the
/// bottom entry of `ib_stack`, so the update is applied there rather than
/// clobbering the IB's write offset. Does nothing until the ring has been
/// initialized (size 0).
pub fn extend_write_ptr(&mut self, new_wptr_dwords: u32) {
    // Bottom of the IB stack when nested, otherwise the active ring.
    let primary = self.ib_stack.first_mut().unwrap_or(&mut self.ring);
    if primary.size_dwords == 0 {
        return;
    }
    primary.write_offset_dwords = new_wptr_dwords % primary.size_dwords;
}
/// Publish the current read pointer at the guest-registered writeback
/// address; a no-op when no address is registered (0) or the ring is not
/// initialized.
///
/// M1.8 uses the fenced write variant: when the GPU runs on its own host
/// thread, the CPU can poll this RPTR mirror to learn how far the GPU has
/// consumed the ring. The Release fence makes upstream packet effects
/// (memory writes, register-file updates the guest later reads via MMIO)
/// happen-before the CPU-visible RPTR bump.
fn writeback_read_ptr(&mut self, mem: &dyn MemoryAccess) {
    let mirror_addr = self.ring.rptr_writeback_addr;
    if mirror_addr == 0 || !self.ring.is_initialized() {
        return;
    }
    mem.write_u32_fence(mirror_addr, self.ring.read_offset_dwords);
}
// ── Type-0/1 handlers ─────────────────────────────────────────────────
/// Type-0 packet: a run of `count` register writes taken from the dwords
/// that follow the header.
///
/// With `write_one` set every value lands in `base_index`; otherwise value
/// `i` lands in `base_index + i`. Returns `total_dwords`, the number of
/// dwords the caller must skip.
fn handle_type0(
    &mut self,
    mem: &dyn MemoryAccess,
    base_index: u32,
    count: u32,
    write_one: bool,
    total_dwords: u32,
) -> u32 {
    // 0 keeps every write on `base_index`; 1 walks consecutive registers.
    let stride = u32::from(!write_one);
    for payload_idx in 0..count {
        // Payload starts one dword past the header.
        let src = self.ring.addr_at_offset(payload_idx + 1).unwrap();
        let value = mem.read_u32(src);
        self.register_file
            .write(base_index + payload_idx * stride, value);
    }

    tracing::trace!(
        base = format_args!("{base_index:#x}"),
        count,
        write_one,
        "gpu: Type0 reg write run"
    );
    total_dwords
}
/// Type-1 packet: two register writes. The first payload dword goes to
/// `reg_index_1`, the second to `reg_index_2`. Always consumes 3 dwords
/// (header + two values).
fn handle_type1(
    &mut self,
    mem: &dyn MemoryAccess,
    reg_index_1: u32,
    reg_index_2: u32,
) -> u32 {
    // Resolve both payload addresses before touching memory.
    let [first_addr, second_addr] =
        [1u32, 2].map(|offset| self.ring.addr_at_offset(offset).unwrap());
    let first = mem.read_u32(first_addr);
    let second = mem.read_u32(second_addr);

    for (reg, value) in [(reg_index_1, first), (reg_index_2, second)] {
        self.register_file.write(reg, value);
    }

    tracing::trace!(
        r1 = format_args!("{reg_index_1:#x}"),
        r2 = format_args!("{reg_index_2:#x}"),
        "gpu: Type1 dual reg write"
    );
    3
}
// ── Type-3 dispatch ───────────────────────────────────────────────────
fn handle_type3(
&mut self,
mem: &dyn MemoryAccess,
opcode: u8,
count: u32,
predicated: bool,
total_dwords: u32,
) -> Type3Result {
metrics::counter!("gpu.packet", "opcode" => pm4::type3_opcode_name(opcode)).increment(1);
tracing::trace!(
opcode = format_args!("{opcode:#x}"),
name = pm4::type3_opcode_name(opcode),
count,
predicated,
"gpu: Type3"
);
// If predicated and the bin mask/select combo evaluates to "skip",
// consume the whole packet (including data dwords) and move on. We
// don't emulate binning so bin_mask & bin_select is always 0 → we
// keep predicated packets in simplest form: execute them anyway. Most
// games don't use binning on Xenos. Observed in canary:
// `pm4_command_processor_implement.h:440-460`.
let _ = predicated;
match opcode {
pm4::PM4_NOP
| pm4::PM4_WAIT_FOR_IDLE
| pm4::PM4_CONTEXT_UPDATE
| pm4::PM4_INVALIDATE_STATE
| pm4::PM4_ME_INIT
| pm4::PM4_VIZ_QUERY
| pm4::PM4_SET_SHADER_BASES => {
// Classify-and-skip. State side effects (if any) are deferred.
}
pm4::PM4_INDIRECT_BUFFER | pm4::PM4_INDIRECT_BUFFER_PFD => {
self.stats.indirect_buffer_jumps += 1;
let ib_ptr = self.read_payload(mem, 1);
let ib_size = self.read_payload(mem, 2);
// Advance past the IB header + payload before recursing so
// the return location is correct.
self.ring.advance_read(total_dwords);
self.writeback_read_ptr(mem);
// Push current ring, switch to IB view.
let caller = self.ring;
self.ib_stack.push(caller);
self.ring = RingBufferView {
base: ib_ptr & !3,
size_dwords: ib_size,
read_offset_dwords: 0,
write_offset_dwords: ib_size, // IB is fully-written at jump time
rptr_writeback_addr: 0,
rptr_writeback_block_dwords: 0,
};
tracing::debug!(
ib_ptr = format_args!("{ib_ptr:#010x}"),
ib_size,
"gpu: jump to indirect buffer"
);
return Type3Result::Consumed(0); // we already advanced
}
pm4::PM4_WAIT_REG_MEM => {
// Canary layout (pm4_command_processor_implement.h:699-755):
// payload[0] = wait_info (bit 4 = is_memory, bits 2:0 = cmp)
// payload[1] = poll address (register idx OR memory addr; bottom 2 bits = endian for memory)
// payload[2] = ref value
// payload[3] = mask
// payload[4] = wait (sleep hint, ignored)
let wait_info = self.read_payload(mem, 1);
let poll_addr_raw = self.read_payload(mem, 2);
let reference = self.read_payload(mem, 3);
let mask = self.read_payload(mem, 4);
let is_memory = (wait_info & 0x10) != 0;
let cmp = WaitCmp::from_wait_info(wait_info);
let poll_addr = if is_memory {
poll_addr_raw & !3
} else {
poll_addr_raw
};
let block = GpuBlock::WaitRegMem {
poll_addr,
is_memory,
reference,
mask,
cmp,
};
if block.is_satisfied(mem, &self.register_file) {
// Condition already true; proceed past this packet.
tracing::trace!(?block, "gpu: WAIT_REG_MEM immediately satisfied");
} else {
self.stats.wait_reg_mem_blocks += 1;
tracing::debug!(?block, "gpu: WAIT_REG_MEM parking");
self.pending_block = Some(block);
return Type3Result::Blocked { rewind_to_header: true };
}
}
pm4::PM4_REG_RMW => {
// payload[0] = rmw_info (bit 31 = and_from_reg, bit 30 = or_from_reg)
// payload[1] = and mask (or register index)
// payload[2] = or mask (or register index)
let rmw_info = self.read_payload(mem, 1);
let and_or_reg = (rmw_info & 0x8000_0000) != 0;
let or_from_reg = (rmw_info & 0x4000_0000) != 0;
let reg_index = rmw_info & 0x1FFF;
let p2 = self.read_payload(mem, 2);
let p3 = self.read_payload(mem, 3);
let and_mask = if and_or_reg {
self.register_file.read(p2 & 0x1FFF)
} else {
p2
};
let or_mask = if or_from_reg {
self.register_file.read(p3 & 0x1FFF)
} else {
p3
};
let cur = self.register_file.read(reg_index);
let new_value = (cur & and_mask) | or_mask;
self.register_file.write(reg_index, new_value);
tracing::trace!(
reg = format_args!("{reg_index:#x}"),
cur = format_args!("{cur:#x}"),
new = format_args!("{new_value:#x}"),
"gpu: REG_RMW"
);
}
pm4::PM4_REG_TO_MEM => {
// payload[0] = reg_index, payload[1] = mem addr
let reg_index = self.read_payload(mem, 1) & 0x1FFF;
let dst = self.read_payload(mem, 2) & !3;
let value = self.register_file.read(reg_index);
mem.write_u32(dst, value);
tracing::trace!(
reg = format_args!("{reg_index:#x}"),
dst = format_args!("{dst:#010x}"),
value = format_args!("{value:#x}"),
"gpu: REG_TO_MEM"
);
}
pm4::PM4_MEM_WRITE => {
// payload[0] = dst, payload[1..=count-1] = values
let mut dst = self.read_payload(mem, 1) & !3;
for i in 2..=count {
let val = self.read_payload(mem, i);
mem.write_u32(dst, val);
dst = dst.wrapping_add(4);
}
}
pm4::PM4_COND_WRITE => {
// payload[0] = wait_info, [1] = poll addr, [2] = ref, [3] = mask,
// [4] = write addr/reg, [5] = write data
let wait_info = self.read_payload(mem, 1);
let poll_raw = self.read_payload(mem, 2);
let reference = self.read_payload(mem, 3);
let mask = self.read_payload(mem, 4);
let is_memory = (wait_info & 0x10) != 0;
let cmp = WaitCmp::from_wait_info(wait_info);
let poll_addr = if is_memory { poll_raw & !3 } else { poll_raw };
let cur_raw = if is_memory {
mem.read_u32(poll_addr)
} else {
self.register_file.read(poll_addr)
};
if cmp.evaluate(cur_raw & mask, reference) {
let write_addr = self.read_payload(mem, 5);
let write_data = self.read_payload(mem, 6);
if (wait_info & 0x100) != 0 {
mem.write_u32(write_addr & !3, write_data);
} else {
self.register_file
.write(write_addr & 0x1FFF, write_data);
}
}
}
pm4::PM4_EVENT_WRITE => {
// payload[0] = initiator (written to VGT_EVENT_INITIATOR).
let initiator = self.read_payload(mem, 1);
self.register_file
.write(reg::VGT_EVENT_INITIATOR, initiator & 0x3F);
self.handle_event_initiator(initiator & 0x3F, mem);
tracing::trace!(initiator = format_args!("{:#x}", initiator & 0x3F), "gpu: EVENT_WRITE");
}
pm4::PM4_EVENT_WRITE_SHD => {
// payload[0] = initiator (bit 31: write counter, else write `value`)
// payload[1] = address, payload[2] = value
let initiator = self.read_payload(mem, 1);
let address = self.read_payload(mem, 2);
let value = self.read_payload(mem, 3);
self.register_file
.write(reg::VGT_EVENT_INITIATOR, initiator & 0x3F);
self.handle_event_initiator(initiator & 0x3F, mem);
let data = if (initiator & 0x8000_0000) != 0 {
self.swap_counter as u32
} else {
value
};
// M1.8: fenced write. The CPU thread busy-polls this
// address as a GPU completion fence. The Release fence
// emitted here pairs with `read_u32_fence`'s Acquire on
// the polling side: any earlier writes the worker
// performed (RPTR writeback, resolve target writes,
// etc.) are visible to the CPU once it sees the new
// fence value.
mem.write_u32_fence(address & !3, data);
tracing::trace!(
addr = format_args!("{:#010x}", address & !3),
value = format_args!("{data:#x}"),
"gpu: EVENT_WRITE_SHD"
);
}
pm4::PM4_EVENT_WRITE_EXT => {
// payload[0] = initiator, [1] = address. Writes 6 u16 extents
// (min/max x/y/z) — we're not tracking scissors yet, so write zeros.
let initiator = self.read_payload(mem, 1);
let address = self.read_payload(mem, 2) & !3;
self.register_file
.write(reg::VGT_EVENT_INITIATOR, initiator & 0x3F);
self.handle_event_initiator(initiator & 0x3F, mem);
for i in 0..6u32 {
mem.write_u16(address + i * 2, 0);
}
}
pm4::PM4_EVENT_WRITE_ZPD => {
// Occlusion query writeback — always write zeros (no query).
let initiator = self.read_payload(mem, 1);
self.register_file
.write(reg::VGT_EVENT_INITIATOR, initiator & 0x3F);
self.handle_event_initiator(initiator & 0x3F, mem);
}
pm4::PM4_DRAW_INDX | pm4::PM4_DRAW_INDX_2 => {
self.stats.draws_seen += 1;
// Canary (`pm4_command_processor_implement.h:1128-1151`):
// DRAW_INDX: payload[0] = viz_query, payload[1] = vgt_draw_initiator,
// [2] = dma_base (if source=DMA), [3] = dma_size
// DRAW_INDX_2: payload[0] = vgt_draw_initiator (indices follow inline).
let (vgt, dma_base, dma_size) = if opcode == pm4::PM4_DRAW_INDX {
let _viz = self.read_payload(mem, 1);
let vgt = self.read_payload(mem, 2);
let (db, ds) = if count >= 4 {
(Some(self.read_payload(mem, 3)), Some(self.read_payload(mem, 4)))
} else {
(None, None)
};
(vgt, db, ds)
} else {
(self.read_payload(mem, 1), None, None)
};
let mut ds = draw_state::extract(&self.register_file, vgt, dma_base, dma_size);
ds.vs_blob_key = self.active_vs_key;
ds.ps_blob_key = self.active_ps_key;
let processed = primitive::process(ds.primitive, ds.vertex_count, None);
metrics::counter!(
"gpu.draw",
"prim" => format!("{:?}", ds.primitive),
)
.increment(1);
if processed.rejected {
metrics::counter!("gpu.draw.rejected").increment(1);
}
// P4: update the render-target cache with every bound RT
// from this draw. Each bind either inserts a new key or
// refreshes an existing descriptor's bind_count. `msaa` is
// still hardcoded to 1× because we don't yet decode
// `PA_SC_AA_CONFIG`; P4b can add that.
let msaa = crate::render_target_cache::MsaaSamples::X1;
let mut viewport_height = ds.viewport.scale_y.abs() * 2.0;
if viewport_height <= 0.0 {
viewport_height = 720.0;
}
// 16 samples per tile row (64-sample 8×8 macroblocks pack
// 16 vertical samples per EDRAM tile).
let rows_of_tiles = (viewport_height as u32).div_ceil(16);
for (i, ci_opt) in ds.color_info.iter().enumerate() {
if let Some(ci) = ci_opt {
let pitch32 = ds.scissor.br_x.div_ceil(32);
let key = crate::render_target_cache::RenderTargetKey {
base_tiles: ci.base_tiles,
pitch_tiles_at_32bpp: pitch32,
msaa_samples: msaa,
is_depth: false,
resource_format: ci.format & 0xF,
};
let rt_idx = self.rt_cache.bind(key, self.stats.draws_seen as u32);
self.rt_cache.claim_tiles(rt_idx, rows_of_tiles as u16);
metrics::counter!(
"gpu.rt.bind",
"slot" => format!("{i}"),
"fmt" => format!("{}", ci.format & 0xF),
)
.increment(1);
}
}
if let Some(depth) = ds.depth_info {
let pitch32 = ds.scissor.br_x.div_ceil(32);
let key = crate::render_target_cache::RenderTargetKey {
base_tiles: depth.base_tiles,
pitch_tiles_at_32bpp: pitch32,
msaa_samples: msaa,
is_depth: true,
resource_format: depth.format & 0xF,
};
let rt_idx = self.rt_cache.bind(key, self.stats.draws_seen as u32);
self.rt_cache.claim_tiles(rt_idx, rows_of_tiles as u16);
}
self.stats.unique_render_targets = self.rt_cache.len() as u64;
tracing::debug!(
opcode = format_args!("{opcode:#x}"),
prim = ?ds.primitive,
verts = ds.vertex_count,
?processed.topology,
rewritten = processed.rewritten_indices.is_some(),
"gpu: DRAW_INDX captured"
);
self.last_draw = Some(ds);
self.last_primitive = Some(processed);
}
pm4::PM4_SET_CONSTANT | pm4::PM4_SET_SHADER_CONSTANTS => {
// payload[0] = offset_type — bits[10:0] index, bits[23:16] type
let offset_type = self.read_payload(mem, 1);
let index = offset_type & 0x7FF;
let const_type = (offset_type >> 16) & 0xFF;
let base = match const_type {
0 => CONST_BASE_ALU,
1 => CONST_BASE_FETCH,
2 => CONST_BASE_BOOL,
3 => CONST_BASE_LOOP,
4 => CONST_BASE_REGISTERS,
_ => CONST_BASE_ALU, // defensive default
};
for i in 0..(count - 1) {
let v = self.read_payload(mem, 2 + i);
self.register_file.write(base + index + i, v);
}
}
pm4::PM4_SET_CONSTANT2 => {
// payload[0] = 16-bit index; subsequent payloads write consecutive regs.
let index = self.read_payload(mem, 1) & 0xFFFF;
for i in 0..(count - 1) {
let v = self.read_payload(mem, 2 + i);
self.register_file.write(index + i, v);
}
}
pm4::PM4_LOAD_ALU_CONSTANT => {
// payload[0] = source mem addr, [1] = offset_type, [2] = size_dwords
let src = self.read_payload(mem, 1) & !3;
let offset_type = self.read_payload(mem, 2);
let size_dwords = self.read_payload(mem, 3);
let index = offset_type & 0x7FF;
let const_type = (offset_type >> 16) & 0xFF;
let base = match const_type {
0 => CONST_BASE_ALU,
1 => CONST_BASE_FETCH,
2 => CONST_BASE_BOOL,
3 => CONST_BASE_LOOP,
4 => CONST_BASE_REGISTERS,
_ => CONST_BASE_ALU,
};
for i in 0..size_dwords {
let v = mem.read_u32(src + i * 4);
self.register_file.write(base + index + i, v);
}
}
pm4::PM4_IM_LOAD | pm4::PM4_IM_LOAD_IMMEDIATE => {
// Canary (pm4_command_processor_implement.h:1271-1330):
// IM_LOAD payload: [0] addr_type, [1] start_size
// IM_LOAD_IMMEDIATE payload: [0] shader_type, [1] start_size, [2..count] = microcode
let shader_type = self.read_payload(mem, 1) as u8 & 0x3;
let start_size = self.read_payload(mem, 2);
let size_dwords = start_size & 0xFFFF;
let blob = if opcode == pm4::PM4_IM_LOAD_IMMEDIATE {
let mut v = Vec::with_capacity(size_dwords as usize);
for i in 0..size_dwords {
v.push(self.read_payload(mem, 3 + i));
}
v
} else {
let addr = self.read_payload(mem, 1) & !3;
let mut v = Vec::with_capacity(size_dwords as usize);
for i in 0..size_dwords {
v.push(mem.read_u32(addr + i * 4));
}
v
};
// For IM_LOAD the payload already carries an address that
// uniquely identifies this shader in guest memory, so the
// full `addr_type` dword (address | stage bits) makes a
// good cache key. For IM_LOAD_IMMEDIATE payload[0] is just
// the 2-bit shader_type — without a content-derived key
// every immediate upload would collide on 0 or 1 and
// thrash a single slot. Fold the microcode through a
// stable FNV-1a hash so per-content dedup still works.
let key = if opcode == pm4::PM4_IM_LOAD_IMMEDIATE {
fnv1a_u32_dwords(shader_type as u32, &blob)
} else {
self.read_payload(mem, 1)
};
self.insert_shader_blob(
key,
ShaderBlob {
shader_type,
dwords: blob,
},
);
// P3b M1: record which blob is now "active" for the
// current stage. The uber-shader dispatch (xenia-ui) reads
// `active_vs_key`/`active_ps_key` at draw time to upload
// the right microcode. `shader_type`: 0 = vertex, 1 = pixel
// (per Xenos `ShaderType`).
match shader_type {
0 => self.active_vs_key = Some(key),
1 => self.active_ps_key = Some(key),
_ => {}
}
metrics::counter!(
"gpu.shader.blob_seen",
"stage" => if shader_type == 0 { "vs" } else { "ps" },
)
.increment(1);
tracing::debug!(
shader_type,
size_dwords,
key = format_args!("{key:#x}"),
"gpu: IM_LOAD (shader blob cached)"
);
}
pm4::PM4_SET_BIN_MASK_LO => {
self.bin_mask = (self.bin_mask & 0xFFFF_FFFF_0000_0000)
| (self.read_payload(mem, 1) as u64);
}
pm4::PM4_SET_BIN_MASK_HI => {
self.bin_mask = (self.bin_mask & 0x0000_0000_FFFF_FFFF)
| ((self.read_payload(mem, 1) as u64) << 32);
}
pm4::PM4_SET_BIN_MASK => {
let lo = self.read_payload(mem, 1) as u64;
let hi = self.read_payload(mem, 2) as u64;
self.bin_mask = (hi << 32) | lo;
}
pm4::PM4_SET_BIN_SELECT_LO => {
self.bin_select = (self.bin_select & 0xFFFF_FFFF_0000_0000)
| (self.read_payload(mem, 1) as u64);
}
pm4::PM4_SET_BIN_SELECT_HI => {
self.bin_select = (self.bin_select & 0x0000_0000_FFFF_FFFF)
| ((self.read_payload(mem, 1) as u64) << 32);
}
pm4::PM4_SET_BIN_SELECT => {
let lo = self.read_payload(mem, 1) as u64;
let hi = self.read_payload(mem, 2) as u64;
self.bin_select = (hi << 32) | lo;
}
pm4::PM4_INTERRUPT => {
let cpu_mask = self.read_payload(mem, 1);
self.stats.interrupts_emitted += 1;
self.pending_interrupts.push(PendingInterrupt {
source: InterruptSource::CommandProcessor,
cpu_mask,
});
tracing::debug!(
cpu_mask = format_args!("{cpu_mask:#x}"),
"gpu: PM4_INTERRUPT queued"
);
}
pm4::PM4_XE_SWAP => {
// Payload: [0] signature, [1] frontbuffer_phys, [2] width, [3] height
let _signature = self.read_payload(mem, 1);
let frontbuffer_phys = self.read_payload(mem, 2);
let width = self.read_payload(mem, 3);
let height = self.read_payload(mem, 4);
self.notify_xe_swap(frontbuffer_phys, width, height);
}
_ => {
// Unknown opcode — log once per opcode but don't stall.
tracing::warn!(
opcode = format_args!("{opcode:#x}"),
count,
"gpu: unhandled Type3 opcode"
);
}
}
Type3Result::Consumed(total_dwords)
}
/// Fetch the dword located `i` dwords past the ring's current read
/// position. Offset `0` is the packet header, so the first payload dword
/// is `i = 1`.
///
/// Panics if the ring cannot resolve an address for that offset.
fn read_payload(&self, mem: &dyn MemoryAccess, i: u32) -> u32 {
    mem.read_u32(self.ring.addr_at_offset(i).unwrap())
}
/// Execute at most `max_packets` packets, stopping early as soon as the
/// executor reports `Idle` or `Blocked`. Used by the kernel's VdSwap
/// handler when MMIO-triggered draining is not available.
///
/// Returns how many packets were actually consumed.
pub fn drain(&mut self, mem: &dyn MemoryAccess, max_packets: u32) -> u32 {
    let mut consumed = 0;
    while consumed < max_packets {
        // Anything other than a successful step ends the drain.
        if !matches!(self.execute_one(mem), ExecOutcome::Stepped { .. }) {
            break;
        }
        consumed += 1;
    }
    consumed
}
/// Drain until the ring's read offset reaches `target_wptr` (modulo ring
/// size) or `execute_one` returns Idle/Blocked. Mirrors canary's
/// `WorkerThreadMain` (xenia-canary `command_processor.cc` ExecutePrimaryBuffer)
/// which loops on `read_ptr_index_ != write_ptr_index` with no packet
/// budget.
///
/// `time_budget` bounds wall-clock time so a pathological stream cannot
/// spin the inline path forever; pass 900 ms to match the threaded
/// `DrainFence` deadline. A budget so large that the deadline cannot be
/// represented (e.g. `Duration::MAX`) is treated as "no deadline" rather
/// than panicking.
///
/// Returns the number of packets consumed. Returns 0 immediately when the
/// ring has zero size.
pub fn drain_until_wptr(
    &mut self,
    mem: &dyn MemoryAccess,
    target_wptr: u32,
    time_budget: Duration,
) -> u32 {
    if self.ring.size_dwords == 0 {
        return 0;
    }
    let target = target_wptr % self.ring.size_dwords;
    // `Instant + Duration` panics on overflow; `checked_add` yields `None`
    // instead, which the loop below interprets as an unbounded budget.
    let deadline = Instant::now().checked_add(time_budget);
    let mut n = 0u32;
    // NOTE(review): the comparison is against `self.ring`, which
    // `handle_type3` replaces with an indirect-buffer view on an IB jump.
    // While inside an IB this compares the IB's read offset with a target
    // computed for the primary ring — confirm that is intended.
    while self.ring.read_offset_dwords != target {
        if deadline.is_some_and(|d| Instant::now() >= d) {
            // Deadline exhaustion is the *expected* outcome under
            // `--parallel` workloads (Sylpheed boot queues millions
            // of game-batched IBs the inline drain can't chew
            // through in 900 ms). Logged at debug because warn-level
            // would fire on every vd_swap. Callers can re-read the
            // ring read pointer to detect partial drain if they
            // care.
            tracing::debug!(
                target,
                rptr = self.ring.read_offset_dwords,
                consumed = n,
                "gpu: drain_until_wptr time-budget exhausted"
            );
            break;
        }
        match self.execute_one(mem) {
            ExecOutcome::Stepped { .. } => {
                n += 1;
                // Mirror the threaded `DrainFence` handler: re-sync
                // after every packet so any concurrent guest WPTR write
                // (under `--parallel`) folds into the local ring view
                // before the next packet executes. The loop's own exit
                // condition uses the fixed `target`, so the re-sync is
                // for the benefit of the executor's readiness checks,
                // not for termination.
                self.sync_with_mmio();
            }
            ExecOutcome::Idle | ExecOutcome::Blocked => break,
        }
    }
    n
}
}
impl Default for GpuSystem {
fn default() -> Self {
Self::new()
}
}
/// The handful of Xenos registers this file refers to by name. The complete
/// table lives in `xenia-canary/src/xenia/gpu/registers.h`.
pub mod reg {
    //! Every constant here is a Xenos *register index* — the number listed
    //! in canary's `register_table.inc`, i.e. the byte offset within the
    //! aperture divided by 4. The MMIO aperture at `0x7FC8_0000` maps each
    //! register at `APERTURE_BASE + index * 4`; the MMIO callbacks recover
    //! the index with `(addr & 0xFFFF) / 4` before matching against these
    //! constants.

    // ── Command-processor ring buffer ────────────────────────────────────

    /// `XE_GPU_REG_CP_RB_BASE` — ring buffer base address.
    pub const CP_RB_BASE: u32 = 0x1C0;
    /// `XE_GPU_REG_CP_RB_CNTL` — ring buffer control.
    pub const CP_RB_CNTL: u32 = 0x1C1;
    /// `XE_GPU_REG_CP_RB_RPTR_ADDR` — where to mirror read pointer.
    pub const CP_RB_RPTR_ADDR: u32 = 0x1C3;
    /// `XE_GPU_REG_CP_RB_RPTR` — read pointer (GPU → CPU).
    pub const CP_RB_RPTR: u32 = 0x1C4;
    /// `XE_GPU_REG_CP_RB_WPTR` — write pointer (CPU → GPU); MMIO side effect.
    pub const CP_RB_WPTR: u32 = 0x1C5;

    // ── Interrupts ───────────────────────────────────────────────────────

    /// `XE_GPU_REG_CP_INT_STATUS` — interrupt status bits.
    pub const CP_INT_STATUS: u32 = 0x1F3;
    /// `XE_GPU_REG_CP_INT_ACK` — write-to-ack interrupt bits.
    pub const CP_INT_ACK: u32 = 0x1F4;
    /// `XE_GPU_REG_D1MODE_VBLANK_VLINE_STATUS` (Canary register_table.inc:1126).
    /// Bit 0 = VBLANK_INT_OCCURRED.
    pub const D1MODE_VBLANK_VLINE_STATUS: u32 = 0x1951;

    // ── Events and coherency ─────────────────────────────────────────────

    /// `XE_GPU_REG_COHER_STATUS_HOST` — coherency bits
    /// (Canary `register_table.inc:530`).
    pub const COHER_STATUS_HOST: u32 = 0xA31;
    /// `XE_GPU_REG_VGT_EVENT_INITIATOR` — set by EVENT_WRITE.
    pub const VGT_EVENT_INITIATOR: u32 = 0x21F9;
}
/// 32-bit FNV-1a over the little-endian bytes of `seed` followed by the
/// little-endian bytes of every element of `dwords`, in order.
///
/// Used to derive a stable cache key for `IM_LOAD_IMMEDIATE` shader blobs,
/// where the guest supplies no natural address to key on. The hash is
/// deterministic but only 32 bits wide, so distinct inputs can still
/// collide.
fn fnv1a_u32_dwords(seed: u32, dwords: &[u32]) -> u32 {
    const OFFSET_BASIS: u32 = 0x811C_9DC5;
    const PRIME: u32 = 0x0100_0193;
    std::iter::once(seed)
        .chain(dwords.iter().copied())
        .flat_map(u32::to_le_bytes)
        .fold(OFFSET_BASIS, |acc, byte| {
            // FNV-1a: xor the byte in first, then multiply by the prime.
            (acc ^ u32::from(byte)).wrapping_mul(PRIME)
        })
}
/// Internal Type-3 handler result. Distinguishes "consumed a packet (by N
/// dwords)" from "blocked; don't advance read ptr".
enum Type3Result {
/// The packet was handled. The payload is the number of dwords the
/// caller should advance the read pointer by; `0` is returned when the
/// handler already advanced it itself (indirect-buffer jump).
Consumed(u32),
/// The packet could not complete yet (an unsatisfied `WAIT_REG_MEM`).
/// NOTE(review): `rewind_to_header` presumably tells the caller to keep
/// the read pointer on the packet header so the wait is re-evaluated
/// later — confirm against `execute_one`.
Blocked { rewind_to_header: bool },
}
#[cfg(test)]
mod tests {
use super::*;
use xenia_memory::GuestMemory;
use xenia_memory::page_table::MemoryProtect;
/// Build guest memory with a single read/write allocation of 0x4000 bytes
/// at `0x4000_0000`; every test places its ring buffer and scratch
/// addresses inside that range.
fn build_mem() -> GuestMemory {
let mut mem = GuestMemory::new().unwrap();
let rw = MemoryProtect::READ | MemoryProtect::WRITE;
mem.alloc(0x4000_0000, 0x4000, rw).unwrap();
mem
}
/// `is_ready` is false until the ring is initialised *and* the write
/// pointer has been moved past the read pointer.
#[test]
fn ready_when_ring_has_pending() {
let mut gpu = GpuSystem::new();
let mem = build_mem();
assert!(!gpu.is_ready(&mem));
gpu.initialize_ring_buffer(0x4000_0000, 10); // 1024 bytes = 256 dwords
assert!(!gpu.is_ready(&mem));
gpu.extend_write_ptr(4);
assert!(gpu.is_ready(&mem));
}
/// Each Type-2 NOP consumes exactly one dword and counts as one packet.
#[test]
fn type2_nop_advances_read_pointer() {
let mut gpu = GpuSystem::new();
let mut mem = build_mem();
// 256 dwords ring at 0x40000000
gpu.initialize_ring_buffer(0x4000_0000, 10);
// Push 3 Type-2 NOPs
for i in 0..3u32 {
mem.write_u32(0x4000_0000 + i * 4, 0x8000_0000);
}
gpu.extend_write_ptr(3);
for _ in 0..3 {
match gpu.execute_one(&mut mem) {
ExecOutcome::Stepped { dwords_consumed } => assert_eq!(dwords_consumed, 1),
other => panic!("unexpected {:?}", other),
}
}
assert_eq!(gpu.ring.read_offset_dwords, 3);
assert_eq!(gpu.stats.packets_executed, 3);
}
/// A Type-0 packet with two data dwords writes two consecutive registers
/// starting at the header's base index.
#[test]
fn type0_reg_run_writes_register_file() {
let mut gpu = GpuSystem::new();
let mut mem = build_mem();
gpu.initialize_ring_buffer(0x4000_0000, 10);
// Type0 header: 2 data dwords, base_index=0x100, write_one=0 → the
// header's count field holds (count - 1) = 1.
let hdr = (1u32 << 16) | 0x100;
mem.write_u32(0x4000_0000, hdr);
mem.write_u32(0x4000_0004, 0xDEAD_BEEF);
mem.write_u32(0x4000_0008, 0xCAFE_BABE);
gpu.extend_write_ptr(3);
assert!(matches!(gpu.execute_one(&mut mem), ExecOutcome::Stepped { .. }));
assert_eq!(gpu.register_file.read(0x100), 0xDEAD_BEEF);
assert_eq!(gpu.register_file.read(0x101), 0xCAFE_BABE);
}
/// An unsatisfied memory WAIT_REG_MEM must block without advancing the
/// read pointer, and complete once the polled memory holds the
/// reference value.
#[test]
fn wait_reg_mem_blocks_then_unblocks_when_mem_changes() {
let mut gpu = GpuSystem::new();
let mut mem = build_mem();
gpu.initialize_ring_buffer(0x4000_0000, 10);
// WAIT_REG_MEM: wait until *0x40001000 == 0x42
// header
let hdr = (3u32 << 30) | ((5u32 - 1) << 16) | ((pm4::PM4_WAIT_REG_MEM as u32) << 8);
mem.write_u32(0x4000_0000, hdr);
// wait_info: is_memory=1 (bit 4), cmp=equal (bits 2:0 = 2)
mem.write_u32(0x4000_0004, 0x12);
mem.write_u32(0x4000_0008, 0x4000_1000);
mem.write_u32(0x4000_000C, 0x42);
mem.write_u32(0x4000_0010, 0xFFFF_FFFF);
mem.write_u32(0x4000_0014, 0);
gpu.extend_write_ptr(6);
// First exec: poll addr reads 0 → blocked.
assert_eq!(gpu.execute_one(&mut mem), ExecOutcome::Blocked);
assert_eq!(gpu.ring.read_offset_dwords, 0, "read ptr must not advance while blocked");
// Make the wait satisfied.
mem.write_u32(0x4000_1000, 0x42);
match gpu.execute_one(&mut mem) {
ExecOutcome::Stepped { dwords_consumed } => {
// The WAIT_REG_MEM packet is 1 (header) + 5 (count) = 6 dwords.
assert_eq!(dwords_consumed, 6);
}
other => panic!("expected Stepped after wait satisfied, got {:?}", other),
}
assert_eq!(gpu.ring.read_offset_dwords, 6);
}
/// MEM_WRITE stores every data dword at consecutive 4-byte addresses
/// starting at the destination.
#[test]
fn mem_write_writes_all_payload_dwords() {
let mut gpu = GpuSystem::new();
let mut mem = build_mem();
gpu.initialize_ring_buffer(0x4000_0000, 10);
// MEM_WRITE: count=3 → 1 header + 1 dst + 2 data
let hdr = (3u32 << 30) | ((3u32 - 1) << 16) | ((pm4::PM4_MEM_WRITE as u32) << 8);
mem.write_u32(0x4000_0000, hdr);
mem.write_u32(0x4000_0004, 0x4000_1000); // dst
mem.write_u32(0x4000_0008, 0x1111_1111);
mem.write_u32(0x4000_000C, 0x2222_2222);
gpu.extend_write_ptr(4);
assert!(matches!(gpu.execute_one(&mut mem), ExecOutcome::Stepped { .. }));
assert_eq!(mem.read_u32(0x4000_1000), 0x1111_1111);
assert_eq!(mem.read_u32(0x4000_1004), 0x2222_2222);
}
/// A WPTR value stored in the MMIO mirror only becomes visible to the
/// ring view after `sync_with_mmio`.
#[test]
fn mmio_write_to_cp_rb_wptr_reflects_into_ring() {
use std::sync::atomic::Ordering;
let mut gpu = GpuSystem::new();
let mem = build_mem();
gpu.initialize_ring_buffer(0x4000_0000, 10);
// Guest writes wptr=8 via MMIO.
gpu.mmio.cp_rb_wptr.store(8, Ordering::Relaxed);
// Before sync, ring has no pending work.
assert!(!gpu.is_ready(&mem));
gpu.sync_with_mmio();
assert_eq!(gpu.ring.write_offset_dwords, 8);
assert!(gpu.is_ready(&mem));
// After sync, rptr is mirrored back to mmio for the guest to read.
assert_eq!(gpu.mmio.cp_rb_rptr.load(Ordering::Relaxed), 0);
}
/// End-to-end: feed two DRAW_INDX_2 packets through `execute_one` and
/// verify the GPU system reports the expected `draws_seen` / `last_draw`
/// state that the UI's Xenos pipeline consumes. Acts as the "draw
/// dispatch integration" check mentioned in the P3 verification plan.
#[test]
fn successive_draws_accumulate_in_stats() {
let mut gpu = GpuSystem::new();
let mut mem = build_mem();
gpu.initialize_ring_buffer(0x4000_0000, 10);
let mk_draw = |addr: u32, vgt: u32, mem: &xenia_memory::GuestMemory| {
let hdr = (3u32 << 30) | ((1u32 - 1) << 16) | ((pm4::PM4_DRAW_INDX_2 as u32) << 8);
mem.write_u32(addr, hdr);
mem.write_u32(addr + 4, vgt);
};
// Draw #1: TriangleList, 6 verts.
mk_draw(0x4000_0000, (6u32 << 16) | (2 << 6) | 4, &mut mem);
// Draw #2: TriangleStrip, 4 verts.
mk_draw(0x4000_0008, (4u32 << 16) | (2 << 6) | 6, &mut mem);
gpu.extend_write_ptr(4);
assert!(matches!(gpu.execute_one(&mut mem), ExecOutcome::Stepped { .. }));
assert!(matches!(gpu.execute_one(&mut mem), ExecOutcome::Stepped { .. }));
assert_eq!(gpu.stats.draws_seen, 2);
let ds = gpu.last_draw.expect("last_draw set");
assert_eq!(ds.primitive, crate::draw_state::PrimitiveType::TriangleStrip);
assert_eq!(ds.vertex_count, 4);
}
/// A single DRAW_INDX_2 must populate both `last_draw` and
/// `last_primitive` and bump `draws_seen`.
#[test]
fn draw_indx_2_captures_last_draw() {
let mut gpu = GpuSystem::new();
let mut mem = build_mem();
gpu.initialize_ring_buffer(0x4000_0000, 10);
// DRAW_INDX_2 packet with 1 payload dword = vgt_draw_initiator:
// prim=4 (TriangleList), source=2 (auto), count=3 verts.
let vgt = (3u32 << 16) | (2 << 6) | 4;
let hdr = (3u32 << 30) | ((1u32 - 0) << 16) | ((pm4::PM4_DRAW_INDX_2 as u32) << 8);
// The header's count field stores (payload dwords - 1). With one
// payload dword the field must be 0, so the field set above is
// cleared and re-encoded here.
let hdr = (hdr & !0x3FFF_0000) | ((1u32 - 1) << 16);
mem.write_u32(0x4000_0000, hdr);
mem.write_u32(0x4000_0004, vgt);
gpu.extend_write_ptr(2);
assert!(matches!(
gpu.execute_one(&mut mem),
ExecOutcome::Stepped { .. }
));
assert_eq!(gpu.stats.draws_seen, 1);
let ds = gpu.last_draw.expect("last_draw set");
assert_eq!(
ds.primitive,
crate::draw_state::PrimitiveType::TriangleList
);
assert_eq!(ds.vertex_count, 3);
let p = gpu.last_primitive.as_ref().expect("last_primitive set");
assert_eq!(p.topology, crate::primitive::HostTopology::TriangleList);
assert!(!p.rejected);
}
/// P8: the shader-blob cache evicts the oldest non-active blob once it
/// grows past `SHADER_BLOB_CAP`. Keys currently referenced by
/// `active_vs_key` / `active_ps_key` are protected from eviction.
#[test]
fn shader_blob_cap_evicts_oldest() {
let mut gpu = GpuSystem::new();
gpu.active_vs_key = Some(u32::MAX);
// Seed the cache with the active key first, then insert
// `CAP + 10` unique keys (starting at 1_000 to avoid colliding with
// the active-key sentinel) so the eviction path runs once
// len > CAP.
gpu.insert_shader_blob(
u32::MAX,
ShaderBlob {
shader_type: 0,
dwords: vec![0xAA; 4],
},
);
let first_key = 1_000u32;
for k in first_key..(first_key + SHADER_BLOB_CAP as u32 + 10) {
gpu.insert_shader_blob(
k,
ShaderBlob {
shader_type: 0,
dwords: vec![k; 2],
},
);
}
assert!(gpu.shader_blobs.len() <= SHADER_BLOB_CAP);
// Active key (u32::MAX) must still be present.
assert!(gpu.shader_blobs.contains_key(&u32::MAX));
// Earliest non-active key must have been evicted (at least one of
// the first 10 we inserted is gone).
let evicted = (first_key..first_key + 10)
.filter(|k| !gpu.shader_blobs.contains_key(k))
.count();
assert!(
evicted > 0,
"expected at least one of the first 10 keys to be evicted, \
got shader_blobs.len() = {}",
gpu.shader_blobs.len()
);
}
/// P3b M1: `IM_LOAD_IMMEDIATE` uploads a vertex + pixel shader inline
/// and must set `active_vs_key` / `active_ps_key` according to
/// `shader_type`; a subsequent draw must then carry those keys. With the
/// content-hashed key scheme the concrete values are derived from the
/// microcode, so the test only asserts that both keys are set, that
/// they differ, and that the draw state mirrors them.
#[test]
fn im_load_records_active_blob_and_draw_carries_it() {
let mut gpu = GpuSystem::new();
let mut mem = build_mem();
gpu.initialize_ring_buffer(0x4000_0000, 10);
// PM4_IM_LOAD_IMMEDIATE VS: 4 data dwords = shader_type + start_size
// + 2 code. Header count field = data_count - 1 = 3.
let hdr_vs = (3u32 << 30) | ((4u32 - 1) << 16) | ((pm4::PM4_IM_LOAD_IMMEDIATE as u32) << 8);
mem.write_u32(0x4000_0000, hdr_vs);
mem.write_u32(0x4000_0004, 0); // shader_type=0 (vertex)
mem.write_u32(0x4000_0008, 2); // start_size: size=2
mem.write_u32(0x4000_000C, 0xAAAA_AAAA);
mem.write_u32(0x4000_0010, 0xBBBB_BBBB);
// Second IM_LOAD_IMMEDIATE PS (shader_type=1); 5 dwords total
// including the header.
let hdr_ps = (3u32 << 30) | ((4u32 - 1) << 16) | ((pm4::PM4_IM_LOAD_IMMEDIATE as u32) << 8);
mem.write_u32(0x4000_0014, hdr_ps);
mem.write_u32(0x4000_0018, 1); // shader_type=1 (pixel)
mem.write_u32(0x4000_001C, 2);
mem.write_u32(0x4000_0020, 0xCCCC_CCCC);
mem.write_u32(0x4000_0024, 0xDDDD_DDDD);
// DRAW_INDX_2: 1 data dword, count field = 0.
let vgt = (3u32 << 16) | (2 << 6) | 4;
let hdr_draw = (3u32 << 30) | (0u32 << 16) | ((pm4::PM4_DRAW_INDX_2 as u32) << 8);
mem.write_u32(0x4000_0028, hdr_draw);
mem.write_u32(0x4000_002C, vgt);
// Total dwords consumed by the 3 packets: 5 + 5 + 2 = 12.
gpu.extend_write_ptr(12);
// Drain all three packets.
for _ in 0..3 {
assert!(matches!(
gpu.execute_one(&mut mem),
ExecOutcome::Stepped { .. }
));
}
let vs_key = gpu.active_vs_key.expect("vs key set by IM_LOAD_IMMEDIATE");
let ps_key = gpu.active_ps_key.expect("ps key set by IM_LOAD_IMMEDIATE");
assert_ne!(vs_key, ps_key, "VS and PS keys must be distinct");
let ds = gpu.last_draw.expect("DRAW_INDX_2 captured");
assert_eq!(ds.vs_blob_key, Some(vs_key));
assert_eq!(ds.ps_blob_key, Some(ps_key));
}
/// Regression: before the content-hash keying, two distinct vertex
/// shaders uploaded via `IM_LOAD_IMMEDIATE` both mapped to key `0`
/// (the shader_type dword) and overwrote each other in `shader_blobs`.
/// With FNV-1a over the microcode, different blobs get different keys
/// and the cache retains both.
#[test]
fn im_load_immediate_distinct_microcode_does_not_collide() {
let mut gpu = GpuSystem::new();
let mut mem = build_mem();
gpu.initialize_ring_buffer(0x4000_0000, 10);
let hdr = (3u32 << 30) | ((4u32 - 1) << 16) | ((pm4::PM4_IM_LOAD_IMMEDIATE as u32) << 8);
// VS shader A.
mem.write_u32(0x4000_0000, hdr);
mem.write_u32(0x4000_0004, 0); // shader_type = vertex
mem.write_u32(0x4000_0008, 2); // size = 2
mem.write_u32(0x4000_000C, 0x1111_1111);
mem.write_u32(0x4000_0010, 0x2222_2222);
// VS shader B — same stage, different microcode.
mem.write_u32(0x4000_0014, hdr);
mem.write_u32(0x4000_0018, 0);
mem.write_u32(0x4000_001C, 2);
mem.write_u32(0x4000_0020, 0x3333_3333);
mem.write_u32(0x4000_0024, 0x4444_4444);
gpu.extend_write_ptr(10);
for _ in 0..2 {
assert!(matches!(
gpu.execute_one(&mut mem),
ExecOutcome::Stepped { .. }
));
}
assert_eq!(
gpu.shader_blobs.len(),
2,
"two distinct VS shaders must not collide on the same cache key"
);
}
/// P4: `EVENT_WRITE` with initiator `15` (TILE_FLUSH) must route
/// through the resolve handler — captured `last_resolve` + incremented
/// `stats.resolves_total` proves the dispatch works.
#[test]
fn tile_flush_event_records_resolve() {
let mut gpu = GpuSystem::new();
let mut mem = build_mem();
gpu.initialize_ring_buffer(0x4000_0000, 10);
// Pre-seed RB_COPY_DEST_BASE / DEST_PITCH / DEST_INFO so
// ResolveInfo captures recognisable values.
gpu.register_file
.write(draw_state::reg::RB_COPY_DEST_BASE, 0xDEAD_0000);
gpu.register_file.write(
draw_state::reg::RB_COPY_DEST_PITCH,
(720u32 << 16) | 1280u32,
);
// copy_dest_format=6 (k_8_8_8_8), copy_dest_endian=0.
gpu.register_file
.write(draw_state::reg::RB_COPY_DEST_INFO, 6u32 << 7);
gpu.register_file.write(
draw_state::reg::RB_COPY_CONTROL,
(1u32 << 20) /* copy_command=1 */ | (1u32 << 8), /* color_clear_enable */
);
// PM4_EVENT_WRITE: 1 data dword — the initiator.
let hdr = (3u32 << 30) | (0u32 << 16) | ((pm4::PM4_EVENT_WRITE as u32) << 8);
mem.write_u32(0x4000_0000, hdr);
mem.write_u32(0x4000_0004, 15); // TILE_FLUSH
gpu.extend_write_ptr(2);
assert!(matches!(
gpu.execute_one(&mut mem),
ExecOutcome::Stepped { .. }
));
assert_eq!(gpu.stats.resolves_total, 1);
let info = gpu.last_resolve.expect("TILE_FLUSH captured resolve");
// `0xDEAD_0000 & 0x1FFF_FFFF = 0x1EAD_0000` — `dest_base` is now
// masked to the Xenon 29-bit physical range at decode time.
assert_eq!(info.dest_base, 0x1EAD_0000);
assert_eq!(info.dest_pitch_pixels, 1280);
assert_eq!(info.dest_height_pixels, 720);
assert_eq!(info.dest_format, 6);
assert_eq!(info.copy_command, 1);
assert!(info.color_clear_enable);
}
/// P4: DRAW_INDX* with a bound color target should populate
/// `rt_cache` so downstream stages (HUD, resolve) can look up the RT.
#[test]
fn draw_indx_populates_rt_cache() {
let mut gpu = GpuSystem::new();
let mut mem = build_mem();
gpu.initialize_ring_buffer(0x4000_0000, 10);
// color0 enabled + format=2 (k_8_8_8_8 or similar), base=0x10.
gpu.register_file.write(draw_state::reg::RB_MODECONTROL, 0x1);
gpu.register_file
.write(draw_state::reg::RB_COLOR_INFO_0, (2u32 << 16) | 0x10);
// Non-zero scissor so pitch32 calc is meaningful.
gpu.register_file.write(
draw_state::reg::PA_SC_WINDOW_SCISSOR_BR,
(720u32 << 16) | 1280u32,
);
let vgt = (3u32 << 16) | (2 << 6) | 4;
let hdr = (3u32 << 30) | (0u32 << 16) | ((pm4::PM4_DRAW_INDX_2 as u32) << 8);
mem.write_u32(0x4000_0000, hdr);
mem.write_u32(0x4000_0004, vgt);
gpu.extend_write_ptr(2);
assert!(matches!(
gpu.execute_one(&mut mem),
ExecOutcome::Stepped { .. }
));
assert_eq!(gpu.rt_cache.len(), 1);
assert_eq!(gpu.stats.unique_render_targets, 1);
}
/// A PM4_XE_SWAP packet must record the frontbuffer address and
/// dimensions in `last_swap` and bump `swaps_seen`.
#[test]
fn xe_swap_records_notification() {
let mut gpu = GpuSystem::new();
let mut mem = build_mem();
gpu.initialize_ring_buffer(0x4000_0000, 10);
let hdr = (3u32 << 30) | ((4u32 - 1) << 16) | ((pm4::PM4_XE_SWAP as u32) << 8);
mem.write_u32(0x4000_0000, hdr);
mem.write_u32(0x4000_0004, pm4::SWAP_SIGNATURE);
mem.write_u32(0x4000_0008, 0xCAFE_0000);
mem.write_u32(0x4000_000C, 1280);
mem.write_u32(0x4000_0010, 720);
gpu.extend_write_ptr(5);
assert!(matches!(gpu.execute_one(&mut mem), ExecOutcome::Stepped { .. }));
let swap = gpu.last_swap.unwrap();
assert_eq!(swap.frame_index, 1);
assert_eq!(swap.frontbuffer_phys, 0xCAFE_0000);
assert_eq!(swap.width, 1280);
assert_eq!(swap.height, 720);
assert_eq!(gpu.stats.swaps_seen, 1);
}
}