The Phase-C VdSwap PM4 ring path (commit 82f3d61) emits two "PM4_XE_SWAP not consumed by drain" warnings when running:

    exec sylpheed.iso --ui --quiet --halt-on-deadlock \
        --parallel --reservations-table

Lockstep -n 100M never trips it. Two distinct race windows:

(a) Inline backend (--ui forces it): drain(mem, 4096) hit its fixed packet cap before reaching the PM4_XE_SWAP we'd just injected at the WPTR tail. With 6 CPU threads, the ring accumulates >4096 packets between vd_swap callbacks.

(b) Threaded backend (--parallel without --ui): the worker's DrainFence handler has a 900 ms deadline and game-batched IBs (8-10 M packets observed) keep it from reaching the tail in any reasonable budget. If the worker eventually drained past the injected packet later, the safety-net direct notify would double-count.

Three changes:

* gpu_system.rs: new `drain_until_wptr(target, time_budget)` draining by the canary `WorkerThreadMain` predicate (read_offset != target) instead of a fixed packet count. 900 ms deadline mirrors the threaded DrainFence handler.
* handle.rs: inline `drain_to_current_wptr` switches to `drain_until_wptr`. DrainFence handler publishes the digest mirror BEFORE replying so the CPU's post-drain `digest_snapshot` sees fresh stats.
* exports.rs (vd_swap): skip the PM4 ring injection unconditionally and route swap notification through `notify_xe_swap` directly. Tail-injection is unreliable under --parallel for both backends. The slot-0 fetch-constant patch is deferred (GPUBUG-FETCH-PATCH-001); draws=0 today so a stale slot 0 has no observable effect.

Verification:

* cargo test --workspace --release: 556 passing (unchanged).
* Lockstep -n 100M --stable-digest: bit-identical to pre-fix master HEAD aa3f1d3. {instructions:100000002, imports:987685, unimpl:0, draws:0, swaps:2, ...}
* check --parallel --reservations-table -n 30M: 0 warnings (was 2). swaps=2.
* exec --gpu-inline --parallel --reservations-table -n 30M: 0 warnings (was 2 with drained=8M-10M observed). swaps=2.

Audit IDs: GPUBUG-DRAIN-001 (closed), GPUBUG-FETCH-PATCH-001 (filed, deferred).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1824 lines
79 KiB
Rust
1824 lines
79 KiB
Rust
//! Xenos GPU system: register file + primary ring buffer + PM4 executor.
//!
//! Design notes mirror the approved plan's P2 slice:
//!
//! - Runs on the same host thread as the CPU interpreter. Sequential access
//!   to `GuestMemory` — no locks, no sharing.
//! - One packet per [`GpuSystem::execute_one`] call. The scheduler calls this
//!   once per round when `is_ready` returns true. When the packet is a
//!   `WAIT_REG_MEM` whose condition isn't yet satisfied, we transition to
//!   [`GpuState::Blocked`] and the scheduler will re-poll us.
//! - Non-draw opcodes execute for real (register/memory writes, event
//!   writebacks). Draws (`DRAW_INDX*`, `XE_SWAP`) are classified but not
//!   rendered yet; they surface state (via spans + the swap hook) for later
//!   phases to consume.
//!
//! Opcode reference: `xenia-canary/src/xenia/gpu/pm4_command_processor_implement.h`.
|
||
|
||
use std::collections::HashMap;
|
||
use std::sync::Arc;
|
||
use std::sync::atomic::{AtomicBool, AtomicU32, Ordering};
|
||
use std::time::{Duration, Instant};
|
||
|
||
use xenia_memory::MemoryAccess;
|
||
|
||
use crate::draw_state::{self, DrawState};
|
||
use crate::pm4::{self, PacketKind};
|
||
use crate::primitive::{self, ProcessedPrimitive};
|
||
use crate::register_file::RegisterFile;
|
||
use crate::ring_view::RingBufferView;
|
||
|
||
/// Cached Xenos microcode blob, produced by `PM4_IM_LOAD*` packets.
#[derive(Debug, Clone)]
pub struct ShaderBlob {
    /// Shader stage selector: 0 = vertex, 1 = pixel.
    pub shader_type: u8,
    /// The microcode itself, one `u32` per dword.
    pub dwords: Vec<u32>,
}
|
||
|
||
/// P8 — upper bound on the shader-blob cache (`GpuSystem.shader_blobs`).
/// Canary uses a similar FIFO ceiling; our number is deliberately generous
/// because blobs are small (a few KiB each at most) and misses force a
/// re-upload from the guest the next time `IM_LOAD*` fires. 256 is enough
/// for every shipping game's peak working set, per canary's traces.
pub const SHADER_BLOB_CAP: usize = 256;
|
||
|
||
/// PM4 `WAIT_REG_MEM` condition. Bits 0..=2 of the wait_info dword encode the
/// comparison (per `xenia-canary/src/xenia/gpu/pm4_command_processor_implement.h:685-696`).
///
/// NOTE(review): canary's `MatchValueAndRef` is believed to reserve encoding
/// 0 for "never" and to place `<` at 1 … `>` at 6, with 7 = "always". The
/// table in [`WaitCmp::from_wait_info`] starts `<` at 0 instead — TODO confirm
/// against the referenced header before relying on either numbering.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum WaitCmp {
    /// value < ref
    Less,
    /// value <= ref
    LessEq,
    /// value == ref
    Equal,
    /// value != ref
    NotEqual,
    /// value >= ref
    GreaterEq,
    /// value > ref
    Greater,
    /// Always — caller wants to sleep regardless.
    Always,
}

impl WaitCmp {
    /// Decode the comparison from the lower 3 bits of `wait_info`; all higher
    /// bits are ignored. Encodings 6 and 7 both map to [`WaitCmp::Always`].
    pub fn from_wait_info(wait_info: u32) -> Self {
        match wait_info & 0x7 {
            0 => Self::Less,
            1 => Self::LessEq,
            2 => Self::Equal,
            3 => Self::NotEqual,
            4 => Self::GreaterEq,
            5 => Self::Greater,
            _ => Self::Always,
        }
    }

    /// Apply the comparison as `value <op> reference` (unsigned).
    ///
    /// No masking happens here; callers pass an already-masked `value`.
    pub fn evaluate(self, value: u32, reference: u32) -> bool {
        match self {
            Self::Less => value < reference,
            Self::LessEq => value <= reference,
            Self::Equal => value == reference,
            Self::NotEqual => value != reference,
            Self::GreaterEq => value >= reference,
            Self::Greater => value > reference,
            Self::Always => true,
        }
    }
}
|
||
|
||
/// Reason the GPU is currently parked. Mirrors the CPU-side scheduler
|
||
/// `BlockReason` shape. Currently only `WaitRegMem` — more to come in later
|
||
/// phases (interrupts, timestamp waits).
|
||
#[derive(Debug, Clone)]
|
||
pub enum GpuBlock {
|
||
WaitRegMem {
|
||
poll_addr: u32,
|
||
is_memory: bool,
|
||
reference: u32,
|
||
mask: u32,
|
||
cmp: WaitCmp,
|
||
},
|
||
}
|
||
|
||
impl GpuBlock {
|
||
/// Probe the wait condition. Returns `true` if the condition holds and
|
||
/// the GPU should be unparked.
|
||
pub fn is_satisfied(&self, mem: &dyn MemoryAccess, reg_file: &RegisterFile) -> bool {
|
||
match self {
|
||
GpuBlock::WaitRegMem {
|
||
poll_addr,
|
||
is_memory,
|
||
reference,
|
||
mask,
|
||
cmp,
|
||
} => {
|
||
let value = if *is_memory {
|
||
mem.read_u32(*poll_addr)
|
||
} else {
|
||
reg_file.read(*poll_addr)
|
||
};
|
||
cmp.evaluate(value & *mask, *reference)
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
/// Public notification the CP emits when the guest presents a frame. The
/// kernel `vd_swap` path gates on this to push a `SwapInfo` to the host UI.
#[derive(Debug, Clone, Copy, Default)]
pub struct SwapNotification {
    /// Value of the swap counter at the time of the notification.
    pub frame_index: u64,
    /// Physical address of the frontbuffer being presented.
    pub frontbuffer_phys: u32,
    /// Frontbuffer width.
    pub width: u32,
    /// Frontbuffer height.
    pub height: u32,
}
|
||
|
||
/// GPU interrupt (fired by `PM4_INTERRUPT`). The kernel dispatches this to
/// the guest callback registered by `VdSetGraphicsInterruptCallback`.
#[derive(Debug, Clone, Copy)]
pub struct PendingInterrupt {
    /// What raised the interrupt.
    pub source: InterruptSource,
    /// CPU mask carried with the interrupt.
    pub cpu_mask: u32,
}

/// Origin of a [`PendingInterrupt`].
#[derive(Debug, Clone, Copy)]
pub enum InterruptSource {
    /// Raised by the command processor.
    CommandProcessor,
    /// Raised by a frame swap.
    Swap,
}
|
||
|
||
/// Per-run counters for observability.
#[derive(Debug, Clone, Default)]
pub struct GpuStats {
    pub packets_executed: u64,
    pub draws_seen: u64,
    pub swaps_seen: u64,
    pub interrupts_emitted: u64,
    pub wait_reg_mem_blocks: u64,
    pub indirect_buffer_jumps: u64,
    /// P4: count of EDRAM→memory resolves triggered by `TILE_FLUSH` events
    /// (event code 15). Non-zero means the game is committing rendered
    /// pixels to the frontbuffer / a texture.
    pub resolves_total: u64,
    /// Resolves whose byte copy path ran and wrote at least one sample to
    /// guest memory. Delta against `resolves_total` indicates how many
    /// resolves were skipped for an unsupported format / MSAA mode / 3D
    /// destination.
    pub resolves_copied_total: u64,
    /// Resolves that were skipped by [`crate::resolve::copy_to_memory`] due
    /// to an unsupported format path. Logged at `warn` so the reason is
    /// visible.
    pub resolves_skipped_total: u64,
    /// Total number of 32bpp samples written into guest memory across all
    /// successful resolves. Useful for sanity-checking that a big splash
    /// frame actually made it out (e.g. 1280×720 = 921_600 samples).
    pub resolve_samples_written: u64,
    /// P4: unique render-target keys seen (as managed by the internal
    /// `RenderTargetCache`). Useful HUD metric for multi-target workloads.
    pub unique_render_targets: u64,
}
|
||
|
||
/// Result of one packet step.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ExecOutcome {
    /// Consumed one packet; GPU remains Ready.
    Stepped { dwords_consumed: u32 },
    /// Nothing to do right now.
    Idle,
    /// Parked on a sync primitive; `GpuSystem::pending_block` has details.
    Blocked,
}
|
||
|
||
/// Base index of the ALU constants.
pub const CONST_BASE_ALU: u32 = 0x4000;
/// Base index of the shader fetch constants: the game uploads them via
/// `PM4_SET_CONSTANT` type=1 into a 256-dword region. Games then reference
/// them by index when binding textures / vertex buffers.
pub const CONST_BASE_FETCH: u32 = 0x4800;
/// Base index of the boolean constants.
pub const CONST_BASE_BOOL: u32 = 0x4900;
/// Base index of the loop constants.
pub const CONST_BASE_LOOP: u32 = 0x4908;
/// Base index of the general register block.
pub const CONST_BASE_REGISTERS: u32 = 0x2000;
|
||
|
||
/// Atomic mailbox for the handful of GPU registers that CROSS the MMIO
/// boundary. Guests write into the `0x7FC80000` register aperture; those
/// writes run through [`crate::mmio_region`] and land in these atomics.
/// Inside `execute_one` / the scheduler's per-round GPU hook we sample them
/// to sync `ring.write_offset_dwords`, reflect progress back to the guest,
/// etc.
///
/// Only the registers listed here need atomic cross-thread access.
/// Everything else lives in [`GpuSystem::register_file`] which is
/// CPU-thread-local.
///
/// Cloning is shallow: every field is an `Arc`, so a clone aliases the same
/// underlying atomics.
#[derive(Debug, Clone)]
pub struct GpuMmio {
    /// `CP_RB_WPTR` — guest writes dword offset of the write pointer.
    pub cp_rb_wptr: Arc<AtomicU32>,
    /// `CP_RB_RPTR` — we mirror our internal `ring.read_offset_dwords` here
    /// so guests polling the register see progress.
    pub cp_rb_rptr: Arc<AtomicU32>,
    /// `CP_INT_STATUS` — bit set when an interrupt is pending.
    pub cp_int_status: Arc<AtomicU32>,
    /// `CP_INT_ACK` — guest clears the bit after handling.
    pub cp_int_ack: Arc<AtomicU32>,
    /// `D1MODE_VBLANK_VLINE_STATUS` (register `0x1951`, MMIO offset `0x6544`).
    /// Bit 0 = `VBLANK_INT_OCCURRED` — set by the GPU when vblank fires,
    /// cleared by the guest via write-1-to-clear. Sylpheed's vsync callback
    /// gates *all* its work on reading bit 0 as set: `lwz; rlwinm. r,r,0,31,31;
    /// bc 12,2,skip`. Without this bit toggling across vsyncs the callback
    /// always skips, so the PKEVENT that feeds the render dispatcher
    /// (user_data + 0x3B28) never gets signaled and the worker loops
    /// forever.
    pub d1mode_vblank_vline_status: Arc<AtomicU32>,
    /// M1.7 parker — set by producers (guest WPTR writes, shutdown) so
    /// the GPU worker thread does not park when work is pending. The
    /// worker swaps to `false` on entering its park decision and
    /// re-checks predicates; if a producer raced between the swap and
    /// the actual `park_timeout`, the producer's `unpark()` returns the
    /// park immediately via std's token semantics. Inline mode never
    /// reads this; the cost is one extra atomic store per WPTR write.
    pub wake_pending: Arc<AtomicBool>,
    /// Handle to the GPU worker thread, populated by `GpuWorker::run` on
    /// startup. The MMIO write callback for `CP_RB_WPTR` `unpark()`s it
    /// after every guest WPTR write so the worker proceeds without
    /// waiting for its `park_timeout`. `None` in inline mode (no worker
    /// to wake), in which case the unpark site is a one-mutex-lock
    /// no-op.
    pub worker_thread: Arc<std::sync::Mutex<Option<std::thread::Thread>>>,
}

impl GpuMmio {
    /// Build a mailbox with every register zeroed, `wake_pending` cleared and
    /// no worker thread registered.
    pub fn new() -> Self {
        let zeroed = || Arc::new(AtomicU32::new(0));
        Self {
            cp_rb_wptr: zeroed(),
            cp_rb_rptr: zeroed(),
            cp_int_status: zeroed(),
            cp_int_ack: zeroed(),
            d1mode_vblank_vline_status: zeroed(),
            wake_pending: Arc::new(AtomicBool::new(false)),
            worker_thread: Arc::new(std::sync::Mutex::new(None)),
        }
    }
}

impl Default for GpuMmio {
    /// Same as [`GpuMmio::new`].
    fn default() -> Self {
        Self::new()
    }
}
|
||
|
||
/// Live GPU system. One instance per `KernelState`.
|
||
pub struct GpuSystem {
|
||
pub register_file: RegisterFile,
|
||
pub ring: RingBufferView,
|
||
/// Stack of saved rings for `PM4_INDIRECT_BUFFER` nesting. The active
|
||
/// ring is always `ring`; when an IB packet arrives, we push `ring` onto
|
||
/// this stack and replace `ring` with the IB view. On IB completion
|
||
/// (read pointer catches up to size), we pop.
|
||
ib_stack: Vec<RingBufferView>,
|
||
/// Cached shader blobs keyed by the raw CP register address that loaded them.
|
||
pub shader_blobs: HashMap<u32, ShaderBlob>,
|
||
/// P8 — FIFO of blob keys for bounded eviction. On `IM_LOAD*` the
|
||
/// new key is pushed to the back; if the blob count exceeds
|
||
/// [`SHADER_BLOB_CAP`], the front is popped and removed from
|
||
/// `shader_blobs`. Prevents long-running guests from growing the
|
||
/// cache without bound. The two *active* keys (`active_vs_key` +
|
||
/// `active_ps_key`) are never evicted — safeguard in `evict_oldest`.
|
||
pub shader_blob_order: std::collections::VecDeque<u32>,
|
||
/// Monotonic frame counter (bumped on `PM4_XE_SWAP`).
|
||
pub swap_counter: u64,
|
||
/// Most recent swap notification; the kernel polls this after `execute_one`
|
||
/// to decide whether to push a UI swap event.
|
||
pub last_swap: Option<SwapNotification>,
|
||
/// Queue of interrupts not yet delivered to the guest. Private so that
|
||
/// callers go through [`Self::take_pending_interrupts`] — M1 step 6
|
||
/// then redirects this drain into a `crossbeam_channel::Sender` without
|
||
/// re-touching every call site.
|
||
pending_interrupts: Vec<PendingInterrupt>,
|
||
/// Current stall reason, if any.
|
||
pub pending_block: Option<GpuBlock>,
|
||
pub stats: GpuStats,
|
||
/// For the 64-bit bin mask/select we split hi/lo writes.
|
||
pub bin_mask: u64,
|
||
pub bin_select: u64,
|
||
/// Shared-atomic mailbox for the `0x7FC80000` MMIO aperture. Cloned into
|
||
/// the [`crate::mmio_region`] callbacks; clone-ownership keeps the bus
|
||
/// side and the executor side in sync without locks.
|
||
pub mmio: GpuMmio,
|
||
/// Most recent draw state extracted at a `PM4_DRAW_INDX*` packet. The
|
||
/// uber-shader pipeline in P3+ reads this to build its wgpu draw call.
|
||
pub last_draw: Option<DrawState>,
|
||
/// Most recent processed primitive — index rewrite + host topology
|
||
/// decision. Separate from `last_draw` because its `rewritten_indices`
|
||
/// may be large and callers may want to drop it after consumption.
|
||
pub last_primitive: Option<ProcessedPrimitive>,
|
||
/// Key in `shader_blobs` of the currently-active vertex shader. Set by
|
||
/// `PM4_IM_LOAD[_IMMEDIATE]` for `ShaderType::kVertex` and read at
|
||
/// `PM4_DRAW_INDX*` time so the host side can upload the matching
|
||
/// microcode bytes before dispatching.
|
||
pub active_vs_key: Option<u32>,
|
||
/// Key in `shader_blobs` of the currently-active pixel shader. Set by
|
||
/// `PM4_IM_LOAD[_IMMEDIATE]` for `ShaderType::kPixel`.
|
||
pub active_ps_key: Option<u32>,
|
||
/// P4: EDRAM tile-ownership bookkeeping + per-key RT descriptors. Updated
|
||
/// at each `PM4_DRAW_INDX*` (`bind` + `claim_tiles`) and queried by
|
||
/// `TILE_FLUSH` event handling to decide resolve sources.
|
||
pub rt_cache: crate::render_target_cache::RenderTargetCache,
|
||
/// P4: most recent `ResolveInfo` observed when `TILE_FLUSH` fired. The UI
|
||
/// bridge surfaces this in the HUD so users can tell when a game is
|
||
/// resolving to the frontbuffer versus an off-screen target.
|
||
pub last_resolve: Option<crate::draw_state::ResolveInfo>,
|
||
/// P5: CPU-side decoded-texture cache (shared across draws within a
|
||
/// frame; trimmed implicitly by insertion). `ensure_cached` hits this
|
||
/// on every texture-fetch resolution; the UI thread sees the decoded
|
||
/// bytes via `UiBridge::publish_texture`.
|
||
pub texture_cache: crate::texture_cache::TextureCache,
|
||
/// 10 MiB shadow of the Xenos EDRAM. Written by clear-resolves and
|
||
/// (future) host-render-target readback; read by the resolve byte-copy
|
||
/// path that writes tiled pixels into guest memory. Allocated once at
|
||
/// `GpuSystem::new` and lives for the whole GPU lifetime — no
|
||
/// per-frame churn.
|
||
pub edram: crate::edram::ShadowEdram,
|
||
}
|
||
|
||
impl GpuSystem {
|
||
pub fn new() -> Self {
|
||
Self {
|
||
register_file: RegisterFile::new(),
|
||
ring: RingBufferView::new(),
|
||
ib_stack: Vec::new(),
|
||
shader_blobs: HashMap::new(),
|
||
shader_blob_order: std::collections::VecDeque::with_capacity(SHADER_BLOB_CAP + 1),
|
||
swap_counter: 0,
|
||
last_swap: None,
|
||
pending_interrupts: Vec::new(),
|
||
pending_block: None,
|
||
stats: GpuStats::default(),
|
||
bin_mask: 0,
|
||
bin_select: 0,
|
||
mmio: GpuMmio::new(),
|
||
last_draw: None,
|
||
last_primitive: None,
|
||
active_vs_key: None,
|
||
active_ps_key: None,
|
||
rt_cache: crate::render_target_cache::RenderTargetCache::new(),
|
||
last_resolve: None,
|
||
texture_cache: crate::texture_cache::TextureCache::new(),
|
||
edram: crate::edram::ShadowEdram::new(),
|
||
}
|
||
}
|
||
|
||
/// P8 — insert a shader blob + bump the FIFO so long-running games
|
||
/// don't grow `shader_blobs` without bound. Caps at [`SHADER_BLOB_CAP`].
|
||
/// Never evicts the currently-active VS/PS blobs (if they ended up at
|
||
/// the front of the queue, we skip past them).
|
||
fn insert_shader_blob(&mut self, key: u32, blob: ShaderBlob) {
|
||
let already_present = self.shader_blobs.contains_key(&key);
|
||
self.shader_blobs.insert(key, blob);
|
||
if !already_present {
|
||
self.shader_blob_order.push_back(key);
|
||
metrics::counter!("gpu.shader.blob_seen").increment(1);
|
||
}
|
||
while self.shader_blobs.len() > SHADER_BLOB_CAP {
|
||
// Pop the oldest key that isn't one of the active ones.
|
||
let mut evicted = None;
|
||
for _ in 0..self.shader_blob_order.len() {
|
||
if let Some(candidate) = self.shader_blob_order.pop_front() {
|
||
if Some(candidate) == self.active_vs_key
|
||
|| Some(candidate) == self.active_ps_key
|
||
{
|
||
self.shader_blob_order.push_back(candidate);
|
||
continue;
|
||
}
|
||
self.shader_blobs.remove(&candidate);
|
||
evicted = Some(candidate);
|
||
break;
|
||
}
|
||
}
|
||
if evicted.is_some() {
|
||
metrics::counter!("gpu.shader.blob_evicted").increment(1);
|
||
} else {
|
||
// All remaining blobs are active — can't evict, stop.
|
||
break;
|
||
}
|
||
}
|
||
}
|
||
|
||
/// P4: event dispatch helper called by every `PM4_EVENT_WRITE*` variant.
|
||
/// `event_code` is the low 6 bits of the initiator word (see canary's
|
||
/// `xenos::Event` enum — code 15 is `TILE_FLUSH`, the resolve trigger).
|
||
///
|
||
/// Handle a `PM4_EVENT_WRITE*` initiator. For `TILE_FLUSH` (event 15)
|
||
/// we decode the live `RB_*` register state into a [`ResolveInfo`],
|
||
/// paint any clear values into the shadow EDRAM, and then copy bytes
|
||
/// from the source render target into guest memory at
|
||
/// `RB_COPY_DEST_BASE`. That last step is what `VdSwap` needs to see
|
||
/// non-empty front-buffer pixels — see `memory/project_xenia_rs_edram
|
||
/// _resolve_gap.md` for the history of this path.
|
||
fn handle_event_initiator(&mut self, event_code: u32, mem: &dyn MemoryAccess) {
|
||
const EVENT_TILE_FLUSH: u32 = 15;
|
||
if event_code != EVENT_TILE_FLUSH {
|
||
return;
|
||
}
|
||
let info = draw_state::ResolveInfo::from_register_file_and_memory(
|
||
&self.register_file,
|
||
mem,
|
||
);
|
||
self.stats.resolves_total += 1;
|
||
metrics::counter!(
|
||
"gpu.resolve",
|
||
"src" => format!("{}", info.copy_src_select),
|
||
"fmt" => format!("{}", info.dest_format),
|
||
"cmd" => format!("{}", info.copy_command),
|
||
)
|
||
.increment(1);
|
||
tracing::info!(
|
||
src = info.copy_src_select,
|
||
dst_base = format_args!("{:#010x}", info.dest_base),
|
||
w = info.coords.width,
|
||
h = info.coords.height,
|
||
pitch = info.dest_pitch_pixels,
|
||
fmt = info.dest_format,
|
||
endian = info.dest_endian,
|
||
clear_color = info.color_clear_enable,
|
||
clear_depth = info.depth_clear_enable,
|
||
"gpu: TILE_FLUSH resolve"
|
||
);
|
||
|
||
// Paint clear values into the shadow EDRAM at the source tile
|
||
// range *before* the copy. Games often issue a clear-then-resolve
|
||
// as a single TILE_FLUSH: the EDRAM is filled with `RB_COLOR_CLEAR`
|
||
// by the clear part, and that's what the copy part reads.
|
||
//
|
||
// Sample coordinates are pixel coordinates scaled up by
|
||
// `sample_count_log2_x/y` — for 1x MSAA (Sylpheed) this is the
|
||
// identity.
|
||
if info.color_clear_enable
|
||
&& let draw_state::ResolveSource::Color(_) = info.source
|
||
&& info.surface_pitch_tiles > 0
|
||
{
|
||
let sx = info.coords.x0 << info.coords.sample_count_log2_x;
|
||
let sy = info.coords.y0 << info.coords.sample_count_log2_y;
|
||
let sw = info.coords.width << info.coords.sample_count_log2_x;
|
||
let sh = info.coords.height << info.coords.sample_count_log2_y;
|
||
// 64bpp clears use both `RB_COLOR_CLEAR_LO` (low 32 bits) and
|
||
// `RB_COLOR_CLEAR` (high 32 bits) per Canary `draw_util.cc:1302-1303`.
|
||
// 32bpp clears ignore the lo word entirely.
|
||
if info.source_is_64bpp {
|
||
self.edram.fill_rect_64bpp(
|
||
info.source_base_tiles,
|
||
info.surface_pitch_tiles,
|
||
sx,
|
||
sy,
|
||
sw,
|
||
sh,
|
||
info.color_clear_value_lo,
|
||
info.color_clear_value,
|
||
);
|
||
} else {
|
||
self.edram.fill_rect_32bpp(
|
||
info.source_base_tiles,
|
||
info.surface_pitch_tiles,
|
||
sx,
|
||
sy,
|
||
sw,
|
||
sh,
|
||
info.color_clear_value,
|
||
);
|
||
}
|
||
}
|
||
if info.depth_clear_enable && info.surface_pitch_tiles > 0 {
|
||
let sx = info.coords.x0 << info.coords.sample_count_log2_x;
|
||
let sy = info.coords.y0 << info.coords.sample_count_log2_y;
|
||
let sw = info.coords.width << info.coords.sample_count_log2_x;
|
||
let sh = info.coords.height << info.coords.sample_count_log2_y;
|
||
// Depth tiles live at RB_DEPTH_INFO.depth_base regardless of
|
||
// which source this resolve selects.
|
||
let rb_depth_info = self.register_file.read(draw_state::reg::RB_DEPTH_INFO);
|
||
let depth_base = (rb_depth_info & 0xFFF) as u16;
|
||
self.edram.fill_rect_32bpp(
|
||
depth_base,
|
||
info.surface_pitch_tiles,
|
||
sx,
|
||
sy,
|
||
sw,
|
||
sh,
|
||
info.depth_clear_value,
|
||
);
|
||
}
|
||
|
||
// Byte copy into guest memory.
|
||
let stats = crate::resolve::copy_to_memory(&info, &self.edram, mem);
|
||
if stats.supported && stats.samples_written > 0 {
|
||
self.stats.resolves_copied_total += 1;
|
||
self.stats.resolve_samples_written += stats.samples_written as u64;
|
||
} else if !stats.supported {
|
||
self.stats.resolves_skipped_total += 1;
|
||
}
|
||
|
||
self.last_resolve = Some(info);
|
||
}
|
||
|
||
/// Sync state with the MMIO atomic mailbox. Call once at the top of the
|
||
/// per-round GPU hook: the guest may have advanced `CP_RB_WPTR` since
|
||
/// we last ran, and we in turn reflect our read-pointer back to the
|
||
/// mirror register so the guest sees progress.
|
||
///
|
||
/// GPUBUG-006: under `--parallel`, the producer (the guest CP_RB_WPTR
|
||
/// MMIO write) uses `Release` to publish prior ring-memory writes;
|
||
/// the consumer here must `Acquire`-load to pair correctly. With
|
||
/// Relaxed-on-load, ring-memory writes that the guest performed
|
||
/// before bumping WPTR could be reordered past our subsequent reads
|
||
/// — leading to garbage PM4 packet contents. The producer side at
|
||
/// `mmio_region.rs:78` already uses Release; the consumer's Relaxed
|
||
/// was the missing half. Symmetrically, the RPTR mirror store
|
||
/// publishes our read progress to the guest and benefits from a
|
||
/// Release.
|
||
pub fn sync_with_mmio(&mut self) {
|
||
let wptr_dwords = self.mmio.cp_rb_wptr.load(Ordering::Acquire);
|
||
if wptr_dwords != self.ring.write_offset_dwords && self.ring.size_dwords != 0 {
|
||
self.ring.write_offset_dwords = wptr_dwords % self.ring.size_dwords;
|
||
}
|
||
// Mirror our read pointer (Release pairs with any guest-side
|
||
// Acquire-load of CP_RB_RPTR for ring writeback bookkeeping).
|
||
self.mmio
|
||
.cp_rb_rptr
|
||
.store(self.ring.read_offset_dwords, Ordering::Release);
|
||
}
|
||
|
||
/// True iff `execute_one` is expected to make progress without blocking.
|
||
pub fn is_ready(&self, mem: &dyn MemoryAccess) -> bool {
|
||
if let Some(block) = &self.pending_block {
|
||
return block.is_satisfied(mem, &self.register_file);
|
||
}
|
||
self.ring.has_pending()
|
||
}
|
||
|
||
/// Execute exactly one PM4 packet. Returns [`ExecOutcome::Idle`] when
|
||
/// there's nothing to do, [`ExecOutcome::Blocked`] if a sync primitive
|
||
/// stalls the GPU, otherwise [`ExecOutcome::Stepped`] with the number of
|
||
/// dwords consumed (counting the header).
|
||
pub fn execute_one(&mut self, mem: &dyn MemoryAccess) -> ExecOutcome {
|
||
// 0) If currently parked, probe the condition and either wake up or stay blocked.
|
||
if let Some(block) = self.pending_block.clone() {
|
||
if block.is_satisfied(mem, &self.register_file) {
|
||
tracing::debug!(?block, "gpu: wait satisfied — resuming");
|
||
self.pending_block = None;
|
||
} else {
|
||
return ExecOutcome::Blocked;
|
||
}
|
||
}
|
||
if !self.ring.has_pending() {
|
||
// End of current ring. If we were inside an indirect buffer, pop
|
||
// and resume the caller.
|
||
if let Some(caller) = self.ib_stack.pop() {
|
||
self.ring = caller;
|
||
if self.ring.has_pending() {
|
||
return self.execute_one(mem);
|
||
}
|
||
}
|
||
return ExecOutcome::Idle;
|
||
}
|
||
let header_addr = self.ring.addr_at_offset(0).unwrap();
|
||
let header_word = mem.read_u32(header_addr);
|
||
let packet = pm4::decode(header_word);
|
||
tracing::trace!(
|
||
header = format_args!("{header_word:#010x}"),
|
||
addr = format_args!("{header_addr:#010x}"),
|
||
?packet.kind,
|
||
"gpu: packet"
|
||
);
|
||
let consumed = match packet.kind {
|
||
PacketKind::Type0 { base_index, count, write_one } => {
|
||
self.handle_type0(mem, base_index, count, write_one, packet.total_dwords)
|
||
}
|
||
PacketKind::Type1 { reg_index_1, reg_index_2 } => {
|
||
self.handle_type1(mem, reg_index_1, reg_index_2)
|
||
}
|
||
PacketKind::Type2 => 1,
|
||
PacketKind::Type3 {
|
||
opcode,
|
||
count,
|
||
predicated,
|
||
} => match self.handle_type3(mem, opcode, count, predicated, packet.total_dwords) {
|
||
Type3Result::Consumed(n) => n,
|
||
Type3Result::Blocked { rewind_to_header } => {
|
||
// Re-park on this packet so the resume path re-reads it.
|
||
if rewind_to_header {
|
||
// We haven't moved read ptr yet, so this is a no-op —
|
||
// documented to keep intent explicit.
|
||
}
|
||
return ExecOutcome::Blocked;
|
||
}
|
||
},
|
||
};
|
||
self.ring.advance_read(consumed);
|
||
self.writeback_read_ptr(mem);
|
||
self.stats.packets_executed += 1;
|
||
ExecOutcome::Stepped {
|
||
dwords_consumed: consumed,
|
||
}
|
||
}
|
||
|
||
/// First-Pixels M2b — kernel-side commit point for `VdSwap`. Prior to
|
||
/// M2b the kernel's [`vd_swap`] wrote a synthetic `PM4_XE_SWAP` packet
|
||
/// at the guest-provided `buffer_ptr` + advanced our ring `wptr` by 64
|
||
/// dwords, expecting the drain to pick it up. That mechanism misaligned:
|
||
/// the drain reads from `ring.base + rptr * 4` forward, not from the
|
||
/// game's out-of-band `buffer_ptr`. 512 ring packets executed through
|
||
/// 1 B guest instructions but `swaps_seen` stayed at 0.
|
||
///
|
||
/// `VdSwap` is the kernel's commit point by definition — we don't need
|
||
/// to launder the event through the ring. Call this directly from the
|
||
/// kernel-side handler; the `PM4_XE_SWAP` opcode path still works for
|
||
/// the (rare) case of a game that emits the packet through its own ring
|
||
/// writes.
|
||
pub fn notify_xe_swap(&mut self, frontbuffer_phys: u32, width: u32, height: u32) {
|
||
self.stats.swaps_seen += 1;
|
||
self.swap_counter = self.swap_counter.wrapping_add(1);
|
||
self.last_swap = Some(SwapNotification {
|
||
frame_index: self.swap_counter,
|
||
frontbuffer_phys,
|
||
width,
|
||
height,
|
||
});
|
||
self.pending_interrupts.push(PendingInterrupt {
|
||
source: InterruptSource::Swap,
|
||
cpu_mask: 0x1,
|
||
});
|
||
tracing::info!(
|
||
frame = self.swap_counter,
|
||
fb = format_args!("{frontbuffer_phys:#010x}"),
|
||
width,
|
||
height,
|
||
"gpu: XE_SWAP (kernel-direct)"
|
||
);
|
||
}
|
||
|
||
/// Called by `VdInitializeRingBuffer` to give us the primary ring.
|
||
pub fn initialize_ring_buffer(&mut self, base: u32, size_log2: u32) {
|
||
let size_bytes = 1u32 << size_log2.min(31);
|
||
self.ring.base = base;
|
||
self.ring.size_dwords = size_bytes / 4;
|
||
self.ring.read_offset_dwords = 0;
|
||
// `write_offset` is driven by the guest — start at 0 so the ring
|
||
// appears empty until MMIO writes advance it.
|
||
self.ring.write_offset_dwords = 0;
|
||
tracing::info!(
|
||
base = format_args!("{base:#010x}"),
|
||
size_bytes,
|
||
size_dwords = self.ring.size_dwords,
|
||
"gpu: ring initialized"
|
||
);
|
||
}
|
||
|
||
/// Called by `VdEnableRingBufferRPtrWriteBack` to record where the guest
|
||
/// expects us to mirror `read_offset_dwords`.
|
||
pub fn enable_rptr_writeback(&mut self, addr: u32, block_log2: u32) {
|
||
self.ring.rptr_writeback_addr = addr;
|
||
self.ring.rptr_writeback_block_dwords = 1u32 << block_log2.min(31);
|
||
tracing::info!(
|
||
addr = format_args!("{addr:#010x}"),
|
||
block_dwords = self.ring.rptr_writeback_block_dwords,
|
||
"gpu: rptr writeback enabled"
|
||
);
|
||
}
|
||
|
||
/// Drain the pending-interrupt queue. The kernel calls this once per
|
||
/// scheduler round and queues each entry into `interrupts.queue_interrupt`.
|
||
///
|
||
/// M1 step 6 swaps the `Vec`-backed implementation for a
|
||
/// `crossbeam_channel::Sender<PendingInterrupt>`. Routing every external
|
||
/// reader through this single accessor in step 2 means that swap is a
|
||
/// localized change — no call site changes.
|
||
///
|
||
/// Returns the previously-queued interrupts and leaves the internal queue
|
||
/// empty. Cheap (`Vec::take`); no allocation when the queue is already
|
||
/// empty.
|
||
pub fn take_pending_interrupts(&mut self) -> Vec<PendingInterrupt> {
|
||
std::mem::take(&mut self.pending_interrupts)
|
||
}
|
||
|
||
/// True when the pending-interrupt queue has at least one entry. Used
|
||
/// by callers that want to short-circuit an empty drain (saving the
|
||
/// `Vec::new()` allocation that `take` would otherwise force on every
|
||
/// scheduler round).
|
||
pub fn has_pending_interrupts(&self) -> bool {
|
||
!self.pending_interrupts.is_empty()
|
||
}
|
||
|
||
/// Extend the logical write pointer by `dwords` (cumulative). `VdSwap`
|
||
/// reserves 64 dwords then calls this; MMIO writes to `CP_RB_WPTR` will
|
||
/// do the same in P2+.
|
||
pub fn extend_write_ptr(&mut self, new_wptr_dwords: u32) {
|
||
if self.ring.size_dwords == 0 {
|
||
return;
|
||
}
|
||
self.ring.write_offset_dwords = new_wptr_dwords % self.ring.size_dwords;
|
||
}
|
||
|
||
/// Write the current read pointer back to the guest-registered
|
||
/// address. M1.8 uses the fenced variant: when the GPU runs on its
|
||
/// own host thread, the CPU can poll this RPTR mirror to learn how
|
||
/// far the GPU has consumed the ring; the Release fence ensures any
|
||
/// upstream packet effects (memory writes, register file updates
|
||
/// the guest reads via subsequent MMIO) happen-before the
|
||
/// CPU-visible RPTR bump.
|
||
fn writeback_read_ptr(&mut self, mem: &dyn MemoryAccess) {
|
||
if self.ring.rptr_writeback_addr != 0 && self.ring.is_initialized() {
|
||
mem.write_u32_fence(
|
||
self.ring.rptr_writeback_addr,
|
||
self.ring.read_offset_dwords,
|
||
);
|
||
}
|
||
}
|
||
|
||
// ── Type-0/1 handlers ─────────────────────────────────────────────────
|
||
|
||
fn handle_type0(
|
||
&mut self,
|
||
mem: &dyn MemoryAccess,
|
||
base_index: u32,
|
||
count: u32,
|
||
write_one: bool,
|
||
total_dwords: u32,
|
||
) -> u32 {
|
||
for i in 0..count {
|
||
let dword_addr = self.ring.addr_at_offset(1 + i).unwrap();
|
||
let value = mem.read_u32(dword_addr);
|
||
let target = if write_one { base_index } else { base_index + i };
|
||
self.register_file.write(target, value);
|
||
}
|
||
tracing::trace!(
|
||
base = format_args!("{base_index:#x}"),
|
||
count,
|
||
write_one,
|
||
"gpu: Type0 reg write run"
|
||
);
|
||
total_dwords
|
||
}
|
||
|
||
fn handle_type1(
|
||
&mut self,
|
||
mem: &dyn MemoryAccess,
|
||
reg_index_1: u32,
|
||
reg_index_2: u32,
|
||
) -> u32 {
|
||
let a_addr = self.ring.addr_at_offset(1).unwrap();
|
||
let b_addr = self.ring.addr_at_offset(2).unwrap();
|
||
let a = mem.read_u32(a_addr);
|
||
let b = mem.read_u32(b_addr);
|
||
self.register_file.write(reg_index_1, a);
|
||
self.register_file.write(reg_index_2, b);
|
||
tracing::trace!(
|
||
r1 = format_args!("{reg_index_1:#x}"),
|
||
r2 = format_args!("{reg_index_2:#x}"),
|
||
"gpu: Type1 dual reg write"
|
||
);
|
||
3
|
||
}
|
||
|
||
// ── Type-3 dispatch ───────────────────────────────────────────────────
|
||
|
||
fn handle_type3(
|
||
&mut self,
|
||
mem: &dyn MemoryAccess,
|
||
opcode: u8,
|
||
count: u32,
|
||
predicated: bool,
|
||
total_dwords: u32,
|
||
) -> Type3Result {
|
||
metrics::counter!("gpu.packet", "opcode" => pm4::type3_opcode_name(opcode)).increment(1);
|
||
tracing::trace!(
|
||
opcode = format_args!("{opcode:#x}"),
|
||
name = pm4::type3_opcode_name(opcode),
|
||
count,
|
||
predicated,
|
||
"gpu: Type3"
|
||
);
|
||
// If predicated and the bin mask/select combo evaluates to "skip",
|
||
// consume the whole packet (including data dwords) and move on. We
|
||
// don't emulate binning so bin_mask & bin_select is always 0 → we
|
||
// keep predicated packets in simplest form: execute them anyway. Most
|
||
// games don't use binning on Xenos. Observed in canary:
|
||
// `pm4_command_processor_implement.h:440-460`.
|
||
let _ = predicated;
|
||
|
||
match opcode {
|
||
pm4::PM4_NOP
|
||
| pm4::PM4_WAIT_FOR_IDLE
|
||
| pm4::PM4_CONTEXT_UPDATE
|
||
| pm4::PM4_INVALIDATE_STATE
|
||
| pm4::PM4_ME_INIT
|
||
| pm4::PM4_VIZ_QUERY
|
||
| pm4::PM4_SET_SHADER_BASES => {
|
||
// Classify-and-skip. State side effects (if any) are deferred.
|
||
}
|
||
pm4::PM4_INDIRECT_BUFFER | pm4::PM4_INDIRECT_BUFFER_PFD => {
|
||
self.stats.indirect_buffer_jumps += 1;
|
||
let ib_ptr = self.read_payload(mem, 1);
|
||
let ib_size = self.read_payload(mem, 2);
|
||
// Advance past the IB header + payload before recursing so
|
||
// the return location is correct.
|
||
self.ring.advance_read(total_dwords);
|
||
self.writeback_read_ptr(mem);
|
||
// Push current ring, switch to IB view.
|
||
let caller = self.ring;
|
||
self.ib_stack.push(caller);
|
||
self.ring = RingBufferView {
|
||
base: ib_ptr & !3,
|
||
size_dwords: ib_size,
|
||
read_offset_dwords: 0,
|
||
write_offset_dwords: ib_size, // IB is fully-written at jump time
|
||
rptr_writeback_addr: 0,
|
||
rptr_writeback_block_dwords: 0,
|
||
};
|
||
tracing::debug!(
|
||
ib_ptr = format_args!("{ib_ptr:#010x}"),
|
||
ib_size,
|
||
"gpu: jump to indirect buffer"
|
||
);
|
||
return Type3Result::Consumed(0); // we already advanced
|
||
}
|
||
pm4::PM4_WAIT_REG_MEM => {
|
||
// Canary layout (pm4_command_processor_implement.h:699-755):
|
||
// payload[0] = wait_info (bit 4 = is_memory, bits 2:0 = cmp)
|
||
// payload[1] = poll address (register idx OR memory addr; bottom 2 bits = endian for memory)
|
||
// payload[2] = ref value
|
||
// payload[3] = mask
|
||
// payload[4] = wait (sleep hint, ignored)
|
||
let wait_info = self.read_payload(mem, 1);
|
||
let poll_addr_raw = self.read_payload(mem, 2);
|
||
let reference = self.read_payload(mem, 3);
|
||
let mask = self.read_payload(mem, 4);
|
||
let is_memory = (wait_info & 0x10) != 0;
|
||
let cmp = WaitCmp::from_wait_info(wait_info);
|
||
let poll_addr = if is_memory {
|
||
poll_addr_raw & !3
|
||
} else {
|
||
poll_addr_raw
|
||
};
|
||
let block = GpuBlock::WaitRegMem {
|
||
poll_addr,
|
||
is_memory,
|
||
reference,
|
||
mask,
|
||
cmp,
|
||
};
|
||
if block.is_satisfied(mem, &self.register_file) {
|
||
// Condition already true; proceed past this packet.
|
||
tracing::trace!(?block, "gpu: WAIT_REG_MEM immediately satisfied");
|
||
} else {
|
||
self.stats.wait_reg_mem_blocks += 1;
|
||
tracing::debug!(?block, "gpu: WAIT_REG_MEM parking");
|
||
self.pending_block = Some(block);
|
||
return Type3Result::Blocked { rewind_to_header: true };
|
||
}
|
||
}
|
||
pm4::PM4_REG_RMW => {
|
||
// payload[0] = rmw_info (bit 31 = and_from_reg, bit 30 = or_from_reg)
|
||
// payload[1] = and mask (or register index)
|
||
// payload[2] = or mask (or register index)
|
||
let rmw_info = self.read_payload(mem, 1);
|
||
let and_or_reg = (rmw_info & 0x8000_0000) != 0;
|
||
let or_from_reg = (rmw_info & 0x4000_0000) != 0;
|
||
let reg_index = rmw_info & 0x1FFF;
|
||
let p2 = self.read_payload(mem, 2);
|
||
let p3 = self.read_payload(mem, 3);
|
||
let and_mask = if and_or_reg {
|
||
self.register_file.read(p2 & 0x1FFF)
|
||
} else {
|
||
p2
|
||
};
|
||
let or_mask = if or_from_reg {
|
||
self.register_file.read(p3 & 0x1FFF)
|
||
} else {
|
||
p3
|
||
};
|
||
let cur = self.register_file.read(reg_index);
|
||
let new_value = (cur & and_mask) | or_mask;
|
||
self.register_file.write(reg_index, new_value);
|
||
tracing::trace!(
|
||
reg = format_args!("{reg_index:#x}"),
|
||
cur = format_args!("{cur:#x}"),
|
||
new = format_args!("{new_value:#x}"),
|
||
"gpu: REG_RMW"
|
||
);
|
||
}
|
||
pm4::PM4_REG_TO_MEM => {
|
||
// payload[0] = reg_index, payload[1] = mem addr
|
||
let reg_index = self.read_payload(mem, 1) & 0x1FFF;
|
||
let dst = self.read_payload(mem, 2) & !3;
|
||
let value = self.register_file.read(reg_index);
|
||
mem.write_u32(dst, value);
|
||
tracing::trace!(
|
||
reg = format_args!("{reg_index:#x}"),
|
||
dst = format_args!("{dst:#010x}"),
|
||
value = format_args!("{value:#x}"),
|
||
"gpu: REG_TO_MEM"
|
||
);
|
||
}
|
||
pm4::PM4_MEM_WRITE => {
|
||
// payload[0] = dst, payload[1..=count-1] = values
|
||
let mut dst = self.read_payload(mem, 1) & !3;
|
||
for i in 2..=count {
|
||
let val = self.read_payload(mem, i);
|
||
mem.write_u32(dst, val);
|
||
dst = dst.wrapping_add(4);
|
||
}
|
||
}
|
||
pm4::PM4_COND_WRITE => {
|
||
// payload[0] = wait_info, [1] = poll addr, [2] = ref, [3] = mask,
|
||
// [4] = write addr/reg, [5] = write data
|
||
let wait_info = self.read_payload(mem, 1);
|
||
let poll_raw = self.read_payload(mem, 2);
|
||
let reference = self.read_payload(mem, 3);
|
||
let mask = self.read_payload(mem, 4);
|
||
let is_memory = (wait_info & 0x10) != 0;
|
||
let cmp = WaitCmp::from_wait_info(wait_info);
|
||
let poll_addr = if is_memory { poll_raw & !3 } else { poll_raw };
|
||
let cur_raw = if is_memory {
|
||
mem.read_u32(poll_addr)
|
||
} else {
|
||
self.register_file.read(poll_addr)
|
||
};
|
||
if cmp.evaluate(cur_raw & mask, reference) {
|
||
let write_addr = self.read_payload(mem, 5);
|
||
let write_data = self.read_payload(mem, 6);
|
||
if (wait_info & 0x100) != 0 {
|
||
mem.write_u32(write_addr & !3, write_data);
|
||
} else {
|
||
self.register_file
|
||
.write(write_addr & 0x1FFF, write_data);
|
||
}
|
||
}
|
||
}
|
||
pm4::PM4_EVENT_WRITE => {
|
||
// payload[0] = initiator (written to VGT_EVENT_INITIATOR).
|
||
let initiator = self.read_payload(mem, 1);
|
||
self.register_file
|
||
.write(reg::VGT_EVENT_INITIATOR, initiator & 0x3F);
|
||
self.handle_event_initiator(initiator & 0x3F, mem);
|
||
tracing::trace!(initiator = format_args!("{:#x}", initiator & 0x3F), "gpu: EVENT_WRITE");
|
||
}
|
||
pm4::PM4_EVENT_WRITE_SHD => {
|
||
// payload[0] = initiator (bit 31: write counter, else write `value`)
|
||
// payload[1] = address, payload[2] = value
|
||
let initiator = self.read_payload(mem, 1);
|
||
let address = self.read_payload(mem, 2);
|
||
let value = self.read_payload(mem, 3);
|
||
self.register_file
|
||
.write(reg::VGT_EVENT_INITIATOR, initiator & 0x3F);
|
||
self.handle_event_initiator(initiator & 0x3F, mem);
|
||
let data = if (initiator & 0x8000_0000) != 0 {
|
||
self.swap_counter as u32
|
||
} else {
|
||
value
|
||
};
|
||
// M1.8: fenced write. The CPU thread busy-polls this
|
||
// address as a GPU completion fence. The Release fence
|
||
// emitted here pairs with `read_u32_fence`'s Acquire on
|
||
// the polling side: any earlier writes the worker
|
||
// performed (RPTR writeback, resolve target writes,
|
||
// etc.) are visible to the CPU once it sees the new
|
||
// fence value.
|
||
mem.write_u32_fence(address & !3, data);
|
||
tracing::trace!(
|
||
addr = format_args!("{:#010x}", address & !3),
|
||
value = format_args!("{data:#x}"),
|
||
"gpu: EVENT_WRITE_SHD"
|
||
);
|
||
}
|
||
pm4::PM4_EVENT_WRITE_EXT => {
|
||
// payload[0] = initiator, [1] = address. Writes 6 u16 extents
|
||
// (min/max x/y/z) — we're not tracking scissors yet, so write zeros.
|
||
let initiator = self.read_payload(mem, 1);
|
||
let address = self.read_payload(mem, 2) & !3;
|
||
self.register_file
|
||
.write(reg::VGT_EVENT_INITIATOR, initiator & 0x3F);
|
||
self.handle_event_initiator(initiator & 0x3F, mem);
|
||
for i in 0..6u32 {
|
||
mem.write_u16(address + i * 2, 0);
|
||
}
|
||
}
|
||
pm4::PM4_EVENT_WRITE_ZPD => {
|
||
// Occlusion query writeback — always write zeros (no query).
|
||
let initiator = self.read_payload(mem, 1);
|
||
self.register_file
|
||
.write(reg::VGT_EVENT_INITIATOR, initiator & 0x3F);
|
||
self.handle_event_initiator(initiator & 0x3F, mem);
|
||
}
|
||
pm4::PM4_DRAW_INDX | pm4::PM4_DRAW_INDX_2 => {
|
||
self.stats.draws_seen += 1;
|
||
// Canary (`pm4_command_processor_implement.h:1128-1151`):
|
||
// DRAW_INDX: payload[0] = viz_query, payload[1] = vgt_draw_initiator,
|
||
// [2] = dma_base (if source=DMA), [3] = dma_size
|
||
// DRAW_INDX_2: payload[0] = vgt_draw_initiator (indices follow inline).
|
||
let (vgt, dma_base, dma_size) = if opcode == pm4::PM4_DRAW_INDX {
|
||
let _viz = self.read_payload(mem, 1);
|
||
let vgt = self.read_payload(mem, 2);
|
||
let (db, ds) = if count >= 4 {
|
||
(Some(self.read_payload(mem, 3)), Some(self.read_payload(mem, 4)))
|
||
} else {
|
||
(None, None)
|
||
};
|
||
(vgt, db, ds)
|
||
} else {
|
||
(self.read_payload(mem, 1), None, None)
|
||
};
|
||
let mut ds = draw_state::extract(&self.register_file, vgt, dma_base, dma_size);
|
||
ds.vs_blob_key = self.active_vs_key;
|
||
ds.ps_blob_key = self.active_ps_key;
|
||
let processed = primitive::process(ds.primitive, ds.vertex_count, None);
|
||
metrics::counter!(
|
||
"gpu.draw",
|
||
"prim" => format!("{:?}", ds.primitive),
|
||
)
|
||
.increment(1);
|
||
if processed.rejected {
|
||
metrics::counter!("gpu.draw.rejected").increment(1);
|
||
}
|
||
// P4: update the render-target cache with every bound RT
|
||
// from this draw. Each bind either inserts a new key or
|
||
// refreshes an existing descriptor's bind_count. `msaa` is
|
||
// still hardcoded to 1× because we don't yet decode
|
||
// `PA_SC_AA_CONFIG`; P4b can add that.
|
||
let msaa = crate::render_target_cache::MsaaSamples::X1;
|
||
let mut viewport_height = ds.viewport.scale_y.abs() * 2.0;
|
||
if viewport_height <= 0.0 {
|
||
viewport_height = 720.0;
|
||
}
|
||
// 16 samples per tile row (64-sample 8×8 macroblocks pack
|
||
// 16 vertical samples per EDRAM tile).
|
||
let rows_of_tiles = (viewport_height as u32).div_ceil(16);
|
||
for (i, ci_opt) in ds.color_info.iter().enumerate() {
|
||
if let Some(ci) = ci_opt {
|
||
let pitch32 = ds.scissor.br_x.div_ceil(32);
|
||
let key = crate::render_target_cache::RenderTargetKey {
|
||
base_tiles: ci.base_tiles,
|
||
pitch_tiles_at_32bpp: pitch32,
|
||
msaa_samples: msaa,
|
||
is_depth: false,
|
||
resource_format: ci.format & 0xF,
|
||
};
|
||
let rt_idx = self.rt_cache.bind(key, self.stats.draws_seen as u32);
|
||
self.rt_cache.claim_tiles(rt_idx, rows_of_tiles as u16);
|
||
metrics::counter!(
|
||
"gpu.rt.bind",
|
||
"slot" => format!("{i}"),
|
||
"fmt" => format!("{}", ci.format & 0xF),
|
||
)
|
||
.increment(1);
|
||
}
|
||
}
|
||
if let Some(depth) = ds.depth_info {
|
||
let pitch32 = ds.scissor.br_x.div_ceil(32);
|
||
let key = crate::render_target_cache::RenderTargetKey {
|
||
base_tiles: depth.base_tiles,
|
||
pitch_tiles_at_32bpp: pitch32,
|
||
msaa_samples: msaa,
|
||
is_depth: true,
|
||
resource_format: depth.format & 0xF,
|
||
};
|
||
let rt_idx = self.rt_cache.bind(key, self.stats.draws_seen as u32);
|
||
self.rt_cache.claim_tiles(rt_idx, rows_of_tiles as u16);
|
||
}
|
||
self.stats.unique_render_targets = self.rt_cache.len() as u64;
|
||
tracing::debug!(
|
||
opcode = format_args!("{opcode:#x}"),
|
||
prim = ?ds.primitive,
|
||
verts = ds.vertex_count,
|
||
?processed.topology,
|
||
rewritten = processed.rewritten_indices.is_some(),
|
||
"gpu: DRAW_INDX captured"
|
||
);
|
||
self.last_draw = Some(ds);
|
||
self.last_primitive = Some(processed);
|
||
}
|
||
pm4::PM4_SET_CONSTANT | pm4::PM4_SET_SHADER_CONSTANTS => {
|
||
// payload[0] = offset_type — bits[10:0] index, bits[23:16] type
|
||
let offset_type = self.read_payload(mem, 1);
|
||
let index = offset_type & 0x7FF;
|
||
let const_type = (offset_type >> 16) & 0xFF;
|
||
let base = match const_type {
|
||
0 => CONST_BASE_ALU,
|
||
1 => CONST_BASE_FETCH,
|
||
2 => CONST_BASE_BOOL,
|
||
3 => CONST_BASE_LOOP,
|
||
4 => CONST_BASE_REGISTERS,
|
||
_ => CONST_BASE_ALU, // defensive default
|
||
};
|
||
for i in 0..(count - 1) {
|
||
let v = self.read_payload(mem, 2 + i);
|
||
self.register_file.write(base + index + i, v);
|
||
}
|
||
}
|
||
pm4::PM4_SET_CONSTANT2 => {
|
||
// payload[0] = 16-bit index; subsequent payloads write consecutive regs.
|
||
let index = self.read_payload(mem, 1) & 0xFFFF;
|
||
for i in 0..(count - 1) {
|
||
let v = self.read_payload(mem, 2 + i);
|
||
self.register_file.write(index + i, v);
|
||
}
|
||
}
|
||
pm4::PM4_LOAD_ALU_CONSTANT => {
|
||
// payload[0] = source mem addr, [1] = offset_type, [2] = size_dwords
|
||
let src = self.read_payload(mem, 1) & !3;
|
||
let offset_type = self.read_payload(mem, 2);
|
||
let size_dwords = self.read_payload(mem, 3);
|
||
let index = offset_type & 0x7FF;
|
||
let const_type = (offset_type >> 16) & 0xFF;
|
||
let base = match const_type {
|
||
0 => CONST_BASE_ALU,
|
||
1 => CONST_BASE_FETCH,
|
||
2 => CONST_BASE_BOOL,
|
||
3 => CONST_BASE_LOOP,
|
||
4 => CONST_BASE_REGISTERS,
|
||
_ => CONST_BASE_ALU,
|
||
};
|
||
for i in 0..size_dwords {
|
||
let v = mem.read_u32(src + i * 4);
|
||
self.register_file.write(base + index + i, v);
|
||
}
|
||
}
|
||
pm4::PM4_IM_LOAD | pm4::PM4_IM_LOAD_IMMEDIATE => {
|
||
// Canary (pm4_command_processor_implement.h:1271-1330):
|
||
// IM_LOAD payload: [0] addr_type, [1] start_size
|
||
// IM_LOAD_IMMEDIATE payload: [0] shader_type, [1] start_size, [2..count] = microcode
|
||
let shader_type = self.read_payload(mem, 1) as u8 & 0x3;
|
||
let start_size = self.read_payload(mem, 2);
|
||
let size_dwords = start_size & 0xFFFF;
|
||
let blob = if opcode == pm4::PM4_IM_LOAD_IMMEDIATE {
|
||
let mut v = Vec::with_capacity(size_dwords as usize);
|
||
for i in 0..size_dwords {
|
||
v.push(self.read_payload(mem, 3 + i));
|
||
}
|
||
v
|
||
} else {
|
||
let addr = self.read_payload(mem, 1) & !3;
|
||
let mut v = Vec::with_capacity(size_dwords as usize);
|
||
for i in 0..size_dwords {
|
||
v.push(mem.read_u32(addr + i * 4));
|
||
}
|
||
v
|
||
};
|
||
// For IM_LOAD the payload already carries an address that
|
||
// uniquely identifies this shader in guest memory, so the
|
||
// full `addr_type` dword (address | stage bits) makes a
|
||
// good cache key. For IM_LOAD_IMMEDIATE payload[0] is just
|
||
// the 2-bit shader_type — without a content-derived key
|
||
// every immediate upload would collide on 0 or 1 and
|
||
// thrash a single slot. Fold the microcode through a
|
||
// stable FNV-1a hash so per-content dedup still works.
|
||
let key = if opcode == pm4::PM4_IM_LOAD_IMMEDIATE {
|
||
fnv1a_u32_dwords(shader_type as u32, &blob)
|
||
} else {
|
||
self.read_payload(mem, 1)
|
||
};
|
||
self.insert_shader_blob(
|
||
key,
|
||
ShaderBlob {
|
||
shader_type,
|
||
dwords: blob,
|
||
},
|
||
);
|
||
// P3b M1: record which blob is now "active" for the
|
||
// current stage. The uber-shader dispatch (xenia-ui) reads
|
||
// `active_vs_key`/`active_ps_key` at draw time to upload
|
||
// the right microcode. `shader_type`: 0 = vertex, 1 = pixel
|
||
// (per Xenos `ShaderType`).
|
||
match shader_type {
|
||
0 => self.active_vs_key = Some(key),
|
||
1 => self.active_ps_key = Some(key),
|
||
_ => {}
|
||
}
|
||
metrics::counter!(
|
||
"gpu.shader.blob_seen",
|
||
"stage" => if shader_type == 0 { "vs" } else { "ps" },
|
||
)
|
||
.increment(1);
|
||
tracing::debug!(
|
||
shader_type,
|
||
size_dwords,
|
||
key = format_args!("{key:#x}"),
|
||
"gpu: IM_LOAD (shader blob cached)"
|
||
);
|
||
}
|
||
pm4::PM4_SET_BIN_MASK_LO => {
|
||
self.bin_mask = (self.bin_mask & 0xFFFF_FFFF_0000_0000)
|
||
| (self.read_payload(mem, 1) as u64);
|
||
}
|
||
pm4::PM4_SET_BIN_MASK_HI => {
|
||
self.bin_mask = (self.bin_mask & 0x0000_0000_FFFF_FFFF)
|
||
| ((self.read_payload(mem, 1) as u64) << 32);
|
||
}
|
||
pm4::PM4_SET_BIN_MASK => {
|
||
let lo = self.read_payload(mem, 1) as u64;
|
||
let hi = self.read_payload(mem, 2) as u64;
|
||
self.bin_mask = (hi << 32) | lo;
|
||
}
|
||
pm4::PM4_SET_BIN_SELECT_LO => {
|
||
self.bin_select = (self.bin_select & 0xFFFF_FFFF_0000_0000)
|
||
| (self.read_payload(mem, 1) as u64);
|
||
}
|
||
pm4::PM4_SET_BIN_SELECT_HI => {
|
||
self.bin_select = (self.bin_select & 0x0000_0000_FFFF_FFFF)
|
||
| ((self.read_payload(mem, 1) as u64) << 32);
|
||
}
|
||
pm4::PM4_SET_BIN_SELECT => {
|
||
let lo = self.read_payload(mem, 1) as u64;
|
||
let hi = self.read_payload(mem, 2) as u64;
|
||
self.bin_select = (hi << 32) | lo;
|
||
}
|
||
pm4::PM4_INTERRUPT => {
|
||
let cpu_mask = self.read_payload(mem, 1);
|
||
self.stats.interrupts_emitted += 1;
|
||
self.pending_interrupts.push(PendingInterrupt {
|
||
source: InterruptSource::CommandProcessor,
|
||
cpu_mask,
|
||
});
|
||
tracing::debug!(
|
||
cpu_mask = format_args!("{cpu_mask:#x}"),
|
||
"gpu: PM4_INTERRUPT queued"
|
||
);
|
||
}
|
||
pm4::PM4_XE_SWAP => {
|
||
// Payload: [0] signature, [1] frontbuffer_phys, [2] width, [3] height
|
||
let _signature = self.read_payload(mem, 1);
|
||
let frontbuffer_phys = self.read_payload(mem, 2);
|
||
let width = self.read_payload(mem, 3);
|
||
let height = self.read_payload(mem, 4);
|
||
self.notify_xe_swap(frontbuffer_phys, width, height);
|
||
}
|
||
_ => {
|
||
// Unknown opcode — log once per opcode but don't stall.
|
||
tracing::warn!(
|
||
opcode = format_args!("{opcode:#x}"),
|
||
count,
|
||
"gpu: unhandled Type3 opcode"
|
||
);
|
||
}
|
||
}
|
||
Type3Result::Consumed(total_dwords)
|
||
}
|
||
|
||
/// Read dword at payload-relative offset `i` (where `i=0` is the header).
|
||
fn read_payload(&self, mem: &dyn MemoryAccess, i: u32) -> u32 {
|
||
let addr = self.ring.addr_at_offset(i).unwrap();
|
||
mem.read_u32(addr)
|
||
}
|
||
|
||
/// Drain up to `max_packets` (used by the kernel's VdSwap handler when we
|
||
/// don't yet have MMIO-triggered draining). Returns the number of
|
||
/// packets consumed.
|
||
pub fn drain(&mut self, mem: &dyn MemoryAccess, max_packets: u32) -> u32 {
|
||
let mut n = 0;
|
||
for _ in 0..max_packets {
|
||
match self.execute_one(mem) {
|
||
ExecOutcome::Stepped { .. } => n += 1,
|
||
ExecOutcome::Idle | ExecOutcome::Blocked => break,
|
||
}
|
||
}
|
||
n
|
||
}
|
||
|
||
/// Drain until the ring's read offset reaches `target_wptr` (modulo ring
|
||
/// size) or `execute_one` returns Idle/Blocked. Mirrors canary's
|
||
/// `WorkerThreadMain` (xenia-canary `command_processor.cc` ExecutePrimaryBuffer)
|
||
/// which loops on `read_ptr_index_ != write_ptr_index` with no packet
|
||
/// budget. `time_budget` bounds wall-clock so a pathological packet
|
||
/// (e.g. an EVENT_WRITE that perpetually re-blocks) cannot spin the
|
||
/// inline path; pass 900 ms to match the threaded `DrainFence` deadline.
|
||
/// Returns the number of packets consumed.
|
||
pub fn drain_until_wptr(
|
||
&mut self,
|
||
mem: &dyn MemoryAccess,
|
||
target_wptr: u32,
|
||
time_budget: Duration,
|
||
) -> u32 {
|
||
if self.ring.size_dwords == 0 {
|
||
return 0;
|
||
}
|
||
let target = target_wptr % self.ring.size_dwords;
|
||
let deadline = Instant::now() + time_budget;
|
||
let mut n = 0u32;
|
||
while self.ring.read_offset_dwords != target {
|
||
if Instant::now() >= deadline {
|
||
// Deadline exhaustion is the *expected* outcome under
|
||
// `--parallel` workloads (Sylpheed boot queues millions
|
||
// of game-batched IBs the inline drain can't chew
|
||
// through in 900 ms). Logged at debug because warn-level
|
||
// would fire on every vd_swap. Callers can re-read the
|
||
// ring read pointer to detect partial drain if they
|
||
// care.
|
||
tracing::debug!(
|
||
target,
|
||
rptr = self.ring.read_offset_dwords,
|
||
consumed = n,
|
||
"gpu: drain_until_wptr time-budget exhausted"
|
||
);
|
||
break;
|
||
}
|
||
match self.execute_one(mem) {
|
||
ExecOutcome::Stepped { .. } => {
|
||
n += 1;
|
||
// Mirror the threaded `DrainFence` handler at
|
||
// handle.rs:553-570: re-sync after every packet so
|
||
// any concurrent guest WPTR write (under `--parallel`)
|
||
// folds into the local ring view before the next
|
||
// `is_ready` check. Without this the local
|
||
// write_offset is a snapshot of the moment we entered
|
||
// the drain, which is fine for a target-WPTR drain
|
||
// but wrong if downstream packets (e.g. an indirect
|
||
// buffer's nested ring) need an updated view.
|
||
self.sync_with_mmio();
|
||
}
|
||
ExecOutcome::Idle | ExecOutcome::Blocked => break,
|
||
}
|
||
}
|
||
n
|
||
}
|
||
}
|
||
|
||
impl Default for GpuSystem {
|
||
fn default() -> Self {
|
||
Self::new()
|
||
}
|
||
}
|
||
|
||
/// Named Xenos registers referenced by this crate. The complete table lives
/// in `xenia-canary/src/xenia/gpu/registers.h`.
pub mod reg {
    //! Every constant here is a Xenos *register index* — the number listed
    //! in canary's `register_table.inc`, i.e. the byte offset inside the
    //! aperture divided by 4. The MMIO aperture at `0x7FC8_0000` places
    //! each register at `APERTURE_BASE + index * 4`; MMIO callbacks recover
    //! the index via `(addr & 0xFFFF) / 4` and compare it with these
    //! constants.

    // ── Command-processor ring buffer ────────────────────────────────────

    /// Ring buffer base address (`XE_GPU_REG_CP_RB_BASE`).
    pub const CP_RB_BASE: u32 = 0x01C0;
    /// Ring buffer control (`XE_GPU_REG_CP_RB_CNTL`).
    pub const CP_RB_CNTL: u32 = 0x01C1;
    /// Address the read pointer is mirrored to (`XE_GPU_REG_CP_RB_RPTR_ADDR`).
    pub const CP_RB_RPTR_ADDR: u32 = 0x01C3;
    /// Read pointer, GPU → CPU (`XE_GPU_REG_CP_RB_RPTR`).
    pub const CP_RB_RPTR: u32 = 0x01C4;
    /// Write pointer, CPU → GPU; writing it has an MMIO side effect
    /// (`XE_GPU_REG_CP_RB_WPTR`).
    pub const CP_RB_WPTR: u32 = 0x01C5;

    // ── Interrupts ───────────────────────────────────────────────────────

    /// Interrupt status bits (`XE_GPU_REG_CP_INT_STATUS`).
    pub const CP_INT_STATUS: u32 = 0x01F3;
    /// Write-to-acknowledge interrupt bits (`XE_GPU_REG_CP_INT_ACK`).
    pub const CP_INT_ACK: u32 = 0x01F4;
    /// `XE_GPU_REG_D1MODE_VBLANK_VLINE_STATUS` (Canary
    /// `register_table.inc:1126`). Bit 0 = VBLANK_INT_OCCURRED.
    pub const D1MODE_VBLANK_VLINE_STATUS: u32 = 0x1951;

    // ── Events / coherency ───────────────────────────────────────────────

    /// Set by `EVENT_WRITE` packets (`XE_GPU_REG_VGT_EVENT_INITIATOR`).
    pub const VGT_EVENT_INITIATOR: u32 = 0x21F9;
    /// Coherency bits (`XE_GPU_REG_COHER_STATUS_HOST`, Canary
    /// `register_table.inc:530`).
    pub const COHER_STATUS_HOST: u32 = 0x0A31;
}
|
||
|
||
/// 32-bit FNV-1a over the little-endian bytes of `seed` followed by the
/// little-endian bytes of every dword in `dwords`.
///
/// Used to derive a stable, content-based cache key for
/// `IM_LOAD_IMMEDIATE` shader blobs, where the guest supplies no natural
/// address to key on. This is a non-cryptographic hash; distinct inputs can
/// still map to the same 32-bit key.
fn fnv1a_u32_dwords(seed: u32, dwords: &[u32]) -> u32 {
    const FNV_OFFSET: u32 = 0x811C_9DC5;
    const FNV_PRIME: u32 = 0x0100_0193;

    std::iter::once(seed)
        .chain(dwords.iter().copied())
        .flat_map(u32::to_le_bytes)
        .fold(FNV_OFFSET, |state, byte| {
            // FNV-1a step: xor the byte in first, then multiply.
            (state ^ u32::from(byte)).wrapping_mul(FNV_PRIME)
        })
}
|
||
|
||
/// Outcome reported by the internal Type-3 packet handler.
///
/// Separates "the packet was consumed" from "the packet is blocked and the
/// read pointer must not advance".
enum Type3Result {
    /// The packet was handled. The payload is the number of dwords the
    /// caller should account for; the indirect-buffer handler reports `0`
    /// because it advances the read pointer itself.
    Consumed(u32),
    /// The packet could not complete yet (e.g. an unsatisfied
    /// `WAIT_REG_MEM`); the read pointer must not advance.
    /// NOTE(review): `rewind_to_header` is interpreted by the caller, which
    /// is not visible here — confirm its exact meaning there.
    Blocked { rewind_to_header: bool },
}
|
||
|
||
#[cfg(test)]
|
||
mod tests {
|
||
use super::*;
|
||
use xenia_memory::GuestMemory;
|
||
use xenia_memory::page_table::MemoryProtect;
|
||
|
||
fn build_mem() -> GuestMemory {
|
||
let mut mem = GuestMemory::new().unwrap();
|
||
let rw = MemoryProtect::READ | MemoryProtect::WRITE;
|
||
mem.alloc(0x4000_0000, 0x4000, rw).unwrap();
|
||
mem
|
||
}
|
||
|
||
#[test]
|
||
fn ready_when_ring_has_pending() {
|
||
let mut gpu = GpuSystem::new();
|
||
let mem = build_mem();
|
||
assert!(!gpu.is_ready(&mem));
|
||
gpu.initialize_ring_buffer(0x4000_0000, 10); // 1024 bytes = 256 dwords
|
||
assert!(!gpu.is_ready(&mem));
|
||
gpu.extend_write_ptr(4);
|
||
assert!(gpu.is_ready(&mem));
|
||
}
|
||
|
||
#[test]
|
||
fn type2_nop_advances_read_pointer() {
|
||
let mut gpu = GpuSystem::new();
|
||
let mut mem = build_mem();
|
||
// 256 dwords ring at 0x40000000
|
||
gpu.initialize_ring_buffer(0x4000_0000, 10);
|
||
// Push 3 Type-2 NOPs
|
||
for i in 0..3u32 {
|
||
mem.write_u32(0x4000_0000 + i * 4, 0x8000_0000);
|
||
}
|
||
gpu.extend_write_ptr(3);
|
||
for _ in 0..3 {
|
||
match gpu.execute_one(&mut mem) {
|
||
ExecOutcome::Stepped { dwords_consumed } => assert_eq!(dwords_consumed, 1),
|
||
other => panic!("unexpected {:?}", other),
|
||
}
|
||
}
|
||
assert_eq!(gpu.ring.read_offset_dwords, 3);
|
||
assert_eq!(gpu.stats.packets_executed, 3);
|
||
}
|
||
|
||
#[test]
|
||
fn type0_reg_run_writes_register_file() {
|
||
let mut gpu = GpuSystem::new();
|
||
let mut mem = build_mem();
|
||
gpu.initialize_ring_buffer(0x4000_0000, 10);
|
||
// Type0 header: 2 dwords, base_index=0x100 → write_one=0 → count field = 1 (count-1)
|
||
let hdr = (1u32 << 16) | 0x100;
|
||
mem.write_u32(0x4000_0000, hdr);
|
||
mem.write_u32(0x4000_0004, 0xDEAD_BEEF);
|
||
mem.write_u32(0x4000_0008, 0xCAFE_BABE);
|
||
gpu.extend_write_ptr(3);
|
||
assert!(matches!(gpu.execute_one(&mut mem), ExecOutcome::Stepped { .. }));
|
||
assert_eq!(gpu.register_file.read(0x100), 0xDEAD_BEEF);
|
||
assert_eq!(gpu.register_file.read(0x101), 0xCAFE_BABE);
|
||
}
|
||
|
||
#[test]
|
||
fn wait_reg_mem_blocks_then_unblocks_when_mem_changes() {
|
||
let mut gpu = GpuSystem::new();
|
||
let mut mem = build_mem();
|
||
gpu.initialize_ring_buffer(0x4000_0000, 10);
|
||
// WAIT_REG_MEM: wait until *0x40001000 == 0x42
|
||
// header
|
||
let hdr = (3u32 << 30) | ((5u32 - 1) << 16) | ((pm4::PM4_WAIT_REG_MEM as u32) << 8);
|
||
mem.write_u32(0x4000_0000, hdr);
|
||
// wait_info: is_memory=1 (bit 4), cmp=equal (bits 2:0 = 2)
|
||
mem.write_u32(0x4000_0004, 0x12);
|
||
mem.write_u32(0x4000_0008, 0x4000_1000);
|
||
mem.write_u32(0x4000_000C, 0x42);
|
||
mem.write_u32(0x4000_0010, 0xFFFF_FFFF);
|
||
mem.write_u32(0x4000_0014, 0);
|
||
gpu.extend_write_ptr(6);
|
||
// First exec: poll addr reads 0 → blocked.
|
||
assert_eq!(gpu.execute_one(&mut mem), ExecOutcome::Blocked);
|
||
assert_eq!(gpu.ring.read_offset_dwords, 0, "read ptr must not advance while blocked");
|
||
// Make the wait satisfied.
|
||
mem.write_u32(0x4000_1000, 0x42);
|
||
match gpu.execute_one(&mut mem) {
|
||
ExecOutcome::Stepped { dwords_consumed } => {
|
||
// The WAIT_REG_MEM packet is 1 (header) + 5 (count) = 6 dwords.
|
||
assert_eq!(dwords_consumed, 6);
|
||
}
|
||
other => panic!("expected Stepped after wait satisfied, got {:?}", other),
|
||
}
|
||
assert_eq!(gpu.ring.read_offset_dwords, 6);
|
||
}
|
||
|
||
#[test]
|
||
fn mem_write_writes_all_payload_dwords() {
|
||
let mut gpu = GpuSystem::new();
|
||
let mut mem = build_mem();
|
||
gpu.initialize_ring_buffer(0x4000_0000, 10);
|
||
// MEM_WRITE: count=3 → 1 header + 1 dst + 2 data
|
||
let hdr = (3u32 << 30) | ((3u32 - 1) << 16) | ((pm4::PM4_MEM_WRITE as u32) << 8);
|
||
mem.write_u32(0x4000_0000, hdr);
|
||
mem.write_u32(0x4000_0004, 0x4000_1000); // dst
|
||
mem.write_u32(0x4000_0008, 0x1111_1111);
|
||
mem.write_u32(0x4000_000C, 0x2222_2222);
|
||
gpu.extend_write_ptr(4);
|
||
assert!(matches!(gpu.execute_one(&mut mem), ExecOutcome::Stepped { .. }));
|
||
assert_eq!(mem.read_u32(0x4000_1000), 0x1111_1111);
|
||
assert_eq!(mem.read_u32(0x4000_1004), 0x2222_2222);
|
||
}
|
||
|
||
#[test]
|
||
fn mmio_write_to_cp_rb_wptr_reflects_into_ring() {
|
||
use std::sync::atomic::Ordering;
|
||
let mut gpu = GpuSystem::new();
|
||
let mem = build_mem();
|
||
gpu.initialize_ring_buffer(0x4000_0000, 10);
|
||
// Guest writes wptr=8 via MMIO.
|
||
gpu.mmio.cp_rb_wptr.store(8, Ordering::Relaxed);
|
||
// Before sync, ring has no pending work.
|
||
assert!(!gpu.is_ready(&mem));
|
||
gpu.sync_with_mmio();
|
||
assert_eq!(gpu.ring.write_offset_dwords, 8);
|
||
assert!(gpu.is_ready(&mem));
|
||
// After sync, rptr is mirrored back to mmio for the guest to read.
|
||
assert_eq!(gpu.mmio.cp_rb_rptr.load(Ordering::Relaxed), 0);
|
||
}
|
||
|
||
/// End-to-end: feed two DRAW_INDX_2 packets through `execute_one` and
|
||
/// verify the GPU system reports the expected `draws_seen` / `last_draw`
|
||
/// state that the UI's Xenos pipeline consumes. Acts as the "draw
|
||
/// dispatch integration" check mentioned in the P3 verification plan.
|
||
#[test]
|
||
fn successive_draws_accumulate_in_stats() {
|
||
let mut gpu = GpuSystem::new();
|
||
let mut mem = build_mem();
|
||
gpu.initialize_ring_buffer(0x4000_0000, 10);
|
||
let mk_draw = |addr: u32, vgt: u32, mem: &xenia_memory::GuestMemory| {
|
||
let hdr = (3u32 << 30) | ((1u32 - 1) << 16) | ((pm4::PM4_DRAW_INDX_2 as u32) << 8);
|
||
mem.write_u32(addr, hdr);
|
||
mem.write_u32(addr + 4, vgt);
|
||
};
|
||
// Draw #1: TriangleList, 6 verts.
|
||
mk_draw(0x4000_0000, (6u32 << 16) | (2 << 6) | 4, &mut mem);
|
||
// Draw #2: TriangleStrip, 4 verts.
|
||
mk_draw(0x4000_0008, (4u32 << 16) | (2 << 6) | 6, &mut mem);
|
||
gpu.extend_write_ptr(4);
|
||
assert!(matches!(gpu.execute_one(&mut mem), ExecOutcome::Stepped { .. }));
|
||
assert!(matches!(gpu.execute_one(&mut mem), ExecOutcome::Stepped { .. }));
|
||
assert_eq!(gpu.stats.draws_seen, 2);
|
||
let ds = gpu.last_draw.expect("last_draw set");
|
||
assert_eq!(ds.primitive, crate::draw_state::PrimitiveType::TriangleStrip);
|
||
assert_eq!(ds.vertex_count, 4);
|
||
}
|
||
|
||
/// A single DRAW_INDX_2 packet must bump `stats.draws_seen` and be captured
/// both in `last_draw` (decoded primitive type and vertex count) and in
/// `last_primitive` (host topology, not rejected).
#[test]
fn draw_indx_2_captures_last_draw() {
    let mut gpu = GpuSystem::new();
    let mut mem = build_mem();
    gpu.initialize_ring_buffer(0x4000_0000, 10);
    // DRAW_INDX_2 packet with 1 payload dword = vgt_draw_initiator:
    // prim=4 (TriangleList), source=2 (auto), count=3 verts.
    let vgt = (3u32 << 16) | (2 << 6) | 4;
    // The header's count field (bits 29:16) holds payload dwords - 1, so a
    // single payload dword encodes as 0. Encode it correctly in one step
    // rather than writing a wrong count and patching it afterwards.
    let hdr = (3u32 << 30) | (0u32 << 16) | ((pm4::PM4_DRAW_INDX_2 as u32) << 8);
    mem.write_u32(0x4000_0000, hdr);
    mem.write_u32(0x4000_0004, vgt);
    gpu.extend_write_ptr(2);
    assert!(matches!(
        gpu.execute_one(&mut mem),
        ExecOutcome::Stepped { .. }
    ));
    assert_eq!(gpu.stats.draws_seen, 1);
    let ds = gpu.last_draw.expect("last_draw set");
    assert_eq!(
        ds.primitive,
        crate::draw_state::PrimitiveType::TriangleList
    );
    assert_eq!(ds.vertex_count, 3);
    let p = gpu.last_primitive.as_ref().expect("last_primitive set");
    assert_eq!(p.topology, crate::primitive::HostTopology::TriangleList);
    assert!(!p.rejected);
}
|
||
|
||
/// P8: once the shader-blob cache grows past `SHADER_BLOB_CAP`, the FIFO
/// eviction must drop the oldest blob that is not currently active, while
/// any blob referenced by an active key stays resident.
#[test]
fn shader_blob_cap_evicts_oldest() {
    let mut gpu = GpuSystem::new();

    // Pin a sentinel key as the active vertex shader and insert its blob
    // first, making it the oldest entry in the cache.
    let active_key = u32::MAX;
    gpu.active_vs_key = Some(active_key);
    gpu.insert_shader_blob(
        active_key,
        ShaderBlob {
            shader_type: 0,
            dwords: vec![0xAA; 4],
        },
    );

    // Insert `CAP + 10` unique keys (starting at 1_000 to avoid colliding
    // with the active-key sentinel); every insert fires the eviction path
    // once len > CAP.
    let first_key = 1_000u32;
    let end_key = first_key + SHADER_BLOB_CAP as u32 + 10;
    for key in first_key..end_key {
        let blob = ShaderBlob {
            shader_type: 0,
            dwords: vec![key; 2],
        };
        gpu.insert_shader_blob(key, blob);
    }

    assert!(gpu.shader_blobs.len() <= SHADER_BLOB_CAP);
    // The active key must have survived every eviction round.
    assert!(gpu.shader_blobs.contains_key(&active_key));
    // At least one of the ten earliest non-active keys must be gone.
    let any_early_key_evicted =
        (first_key..first_key + 10).any(|key| !gpu.shader_blobs.contains_key(&key));
    assert!(
        any_early_key_evicted,
        "expected at least one of the first 10 keys to be evicted, \
         got shader_blobs.len() = {}",
        gpu.shader_blobs.len()
    );
}
|
||
|
||
/// Uploads a vertex shader and then a pixel shader inline via
/// `IM_LOAD_IMMEDIATE`, followed by a draw. The captured draw state must
/// carry whichever keys the executor minted for the two uploads. Under the
/// content-hashed key scheme the concrete values are derived from the
/// microcode, so the test only requires that both keys are set and that
/// they differ from each other (the two blobs have different bytes).
#[test]
fn im_load_records_active_blob_and_draw_carries_it() {
    let mut gpu = GpuSystem::new();
    let mut mem = build_mem();
    gpu.initialize_ring_buffer(0x4000_0000, 10);

    // IM_LOAD_IMMEDIATE header: 4 data dwords (shader_type, start_size,
    // 2 code dwords), so the count field is data_count - 1 = 3. The VS and
    // PS uploads have the same size and therefore share this header.
    let hdr_load = (3u32 << 30) | ((4u32 - 1) << 16) | ((pm4::PM4_IM_LOAD_IMMEDIATE as u32) << 8);
    // DRAW_INDX_2 header: 1 data dword, count field = 0.
    let hdr_draw = (3u32 << 30) | (0u32 << 16) | ((pm4::PM4_DRAW_INDX_2 as u32) << 8);
    let vgt = (3u32 << 16) | (2 << 6) | 4;

    // The three packets, laid out back to back from the ring base.
    let stream = [
        // VS upload at 0x4000_0000 (5 dwords).
        hdr_load,
        0, // shader_type=0 (vertex)
        2, // start_size: size=2
        0xAAAA_AAAA,
        0xBBBB_BBBB,
        // PS upload at 0x4000_0014 (5 dwords).
        hdr_load,
        1, // shader_type=1 (pixel)
        2,
        0xCCCC_CCCC,
        0xDDDD_DDDD,
        // Draw at 0x4000_0028 (2 dwords).
        hdr_draw,
        vgt,
    ];
    for (addr, dword) in (0x4000_0000u32..).step_by(4).zip(stream) {
        mem.write_u32(addr, dword);
    }

    // Total dwords consumed by the 3 packets: 5 + 5 + 2 = 12.
    gpu.extend_write_ptr(12);
    // One `execute_one` step per packet.
    for _ in 0..3 {
        assert!(matches!(gpu.execute_one(&mut mem), ExecOutcome::Stepped { .. }));
    }

    let vs_key = gpu.active_vs_key.expect("vs key set by IM_LOAD_IMMEDIATE");
    let ps_key = gpu.active_ps_key.expect("ps key set by IM_LOAD_IMMEDIATE");
    assert_ne!(vs_key, ps_key, "VS and PS keys must be distinct");

    // The draw must have snapshotted both active keys.
    let draw = gpu.last_draw.expect("DRAW_INDX_2 captured");
    assert_eq!(draw.vs_blob_key, Some(vs_key));
    assert_eq!(draw.ps_blob_key, Some(ps_key));
}
|
||
|
||
/// Regression guard for the pre-content-hash keying bug: two different
/// vertex shaders uploaded through `IM_LOAD_IMMEDIATE` used to share key
/// `0` (the shader_type dword), so the second upload overwrote the first
/// in `shader_blobs`. With FNV-1a over the microcode, blobs with different
/// contents get different keys and the cache keeps both.
#[test]
fn im_load_immediate_distinct_microcode_does_not_collide() {
    let mut gpu = GpuSystem::new();
    let mut mem = build_mem();
    gpu.initialize_ring_buffer(0x4000_0000, 10);

    // Both uploads carry 4 data dwords, so they share one header
    // (count field = 4 - 1).
    let hdr = (3u32 << 30) | ((4u32 - 1) << 16) | ((pm4::PM4_IM_LOAD_IMMEDIATE as u32) << 8);

    // Two packets laid out back to back from the ring base.
    let stream = [
        // VS shader A at 0x4000_0000.
        hdr,
        0, // shader_type = vertex
        2, // size = 2
        0x1111_1111,
        0x2222_2222,
        // VS shader B at 0x4000_0014 — same stage, different microcode.
        hdr,
        0,
        2,
        0x3333_3333,
        0x4444_4444,
    ];
    for (addr, dword) in (0x4000_0000u32..).step_by(4).zip(stream) {
        mem.write_u32(addr, dword);
    }

    // Two packets of five dwords each.
    gpu.extend_write_ptr(10);
    for _ in 0..2 {
        assert!(matches!(gpu.execute_one(&mut mem), ExecOutcome::Stepped { .. }));
    }

    assert_eq!(
        gpu.shader_blobs.len(),
        2,
        "two distinct VS shaders must not collide on the same cache key"
    );
}
|
||
|
||
/// P4: an `EVENT_WRITE` packet whose initiator is `15` (TILE_FLUSH) has to
/// be dispatched to the resolve handler. The evidence is a populated
/// `last_resolve` plus `stats.resolves_total` reaching one.
#[test]
fn tile_flush_event_records_resolve() {
    let mut gpu = GpuSystem::new();
    let mut mem = build_mem();
    gpu.initialize_ring_buffer(0x4000_0000, 10);

    // Seed the RB_COPY_* registers with recognisable values so the captured
    // ResolveInfo can be checked field by field.
    let dest_base = 0xDEAD_0000;
    let dest_pitch = (720u32 << 16) | 1280u32;
    // copy_dest_format=6 (k_8_8_8_8), copy_dest_endian=0.
    let dest_info = 6u32 << 7;
    // Bit 20: copy_command=1; bit 8: color_clear_enable.
    let copy_control = (1u32 << 20) | (1u32 << 8);
    gpu.register_file.write(draw_state::reg::RB_COPY_DEST_BASE, dest_base);
    gpu.register_file.write(draw_state::reg::RB_COPY_DEST_PITCH, dest_pitch);
    gpu.register_file.write(draw_state::reg::RB_COPY_DEST_INFO, dest_info);
    gpu.register_file.write(draw_state::reg::RB_COPY_CONTROL, copy_control);

    // PM4_EVENT_WRITE: 1 data dword — the initiator.
    let event_hdr = (3u32 << 30) | (0u32 << 16) | ((pm4::PM4_EVENT_WRITE as u32) << 8);
    let tile_flush = 15;
    mem.write_u32(0x4000_0000, event_hdr);
    mem.write_u32(0x4000_0004, tile_flush);
    gpu.extend_write_ptr(2);
    assert!(matches!(gpu.execute_one(&mut mem), ExecOutcome::Stepped { .. }));

    assert_eq!(gpu.stats.resolves_total, 1);
    let resolve = gpu.last_resolve.expect("TILE_FLUSH captured resolve");
    // `dest_base` is masked to the Xenon 29-bit physical range at decode
    // time: `0xDEAD_0000 & 0x1FFF_FFFF = 0x1EAD_0000`.
    assert_eq!(resolve.dest_base, 0x1EAD_0000);
    assert_eq!(resolve.dest_pitch_pixels, 1280);
    assert_eq!(resolve.dest_height_pixels, 720);
    assert_eq!(resolve.dest_format, 6);
    assert_eq!(resolve.copy_command, 1);
    assert!(resolve.color_clear_enable);
}
|
||
|
||
/// P4: issuing a DRAW_INDX* packet while a color target is bound must
/// register that target in `rt_cache`, so downstream stages (HUD, resolve)
/// can look the RT up.
#[test]
fn draw_indx_populates_rt_cache() {
    let mut gpu = GpuSystem::new();
    let mut mem = build_mem();
    gpu.initialize_ring_buffer(0x4000_0000, 10);

    // color0 enabled + format=2 (k_8_8_8_8 or similar), base=0x10.
    let color_info_0 = (2u32 << 16) | 0x10;
    // Non-zero scissor so pitch32 calc is meaningful.
    let scissor_br = (720u32 << 16) | 1280u32;
    gpu.register_file.write(draw_state::reg::RB_MODECONTROL, 0x1);
    gpu.register_file.write(draw_state::reg::RB_COLOR_INFO_0, color_info_0);
    gpu.register_file.write(draw_state::reg::PA_SC_WINDOW_SCISSOR_BR, scissor_br);

    // One DRAW_INDX_2 packet: header (count field = 0) + vgt draw initiator.
    let draw_hdr = (3u32 << 30) | (0u32 << 16) | ((pm4::PM4_DRAW_INDX_2 as u32) << 8);
    let vgt = (3u32 << 16) | (2 << 6) | 4;
    mem.write_u32(0x4000_0000, draw_hdr);
    mem.write_u32(0x4000_0004, vgt);
    gpu.extend_write_ptr(2);
    assert!(matches!(gpu.execute_one(&mut mem), ExecOutcome::Stepped { .. }));

    // Exactly one render target should now be tracked and counted.
    assert_eq!(gpu.rt_cache.len(), 1);
    assert_eq!(gpu.stats.unique_render_targets, 1);
}
|
||
|
||
/// Executing a `PM4_XE_SWAP` packet must record the swap in `last_swap`
/// (frame index, frontbuffer address, dimensions) and count it in
/// `stats.swaps_seen`.
#[test]
fn xe_swap_records_notification() {
    let mut gpu = GpuSystem::new();
    let mut mem = build_mem();
    gpu.initialize_ring_buffer(0x4000_0000, 10);

    // Header with 4 payload dwords (count field = 4 - 1), followed by the
    // payload itself.
    let swap_hdr = (3u32 << 30) | ((4u32 - 1) << 16) | ((pm4::PM4_XE_SWAP as u32) << 8);
    let packet = [
        swap_hdr,
        pm4::SWAP_SIGNATURE,
        0xCAFE_0000, // asserted below as frontbuffer_phys
        1280,        // asserted below as width
        720,         // asserted below as height
    ];
    for (addr, dword) in (0x4000_0000u32..).step_by(4).zip(packet) {
        mem.write_u32(addr, dword);
    }
    gpu.extend_write_ptr(5);
    assert!(matches!(gpu.execute_one(&mut mem), ExecOutcome::Stepped { .. }));

    let recorded = gpu.last_swap.unwrap();
    assert_eq!(recorded.frame_index, 1);
    assert_eq!(recorded.frontbuffer_phys, 0xCAFE_0000);
    assert_eq!(recorded.width, 1280);
    assert_eq!(recorded.height, 720);
    assert_eq!(gpu.stats.swaps_seen, 1);
}
|
||
}
|