xenia-rs/crates/xenia-gpu/src/gpu_system.rs

//! Xenos GPU system: register file + primary ring buffer + PM4 executor.
//!
//! Design notes mirror the approved plan's P2 slice:
//!
//! - Runs on the same host thread as the CPU interpreter. Sequential access
//!   to `GuestMemory` — no locks, no sharing.
//! - One packet per [`GpuSystem::execute_one`] call. The scheduler calls this
//!   once per round when `is_ready` returns true. When the packet is a
//!   `WAIT_REG_MEM` whose condition isn't yet satisfied, we transition to
//!   [`GpuState::Blocked`] and the scheduler will re-poll us.
//! - Non-draw opcodes execute for real (register/memory writes, event
//!   writebacks). Draws (`DRAW_INDX*`, `XE_SWAP`) are classified but not
//!   rendered yet; they surface state (via spans + the swap hook) for later
//!   phases to consume.
//!
//! Opcode reference: `xenia-canary/src/xenia/gpu/pm4_command_processor_implement.h`.

use std::collections::HashMap;
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, AtomicU32, Ordering};
use std::time::{Duration, Instant};

use xenia_memory::MemoryAccess;

use crate::draw_state::{self, DrawState};
use crate::pm4::{self, PacketKind};
use crate::primitive::{self, ProcessedPrimitive};
use crate::register_file::RegisterFile;
use crate::ring_view::RingBufferView;

/// The guest-virtual window that physical allocations are committed into.
/// `xenia-kernel`'s `heap_alloc` bumps its cursor through `0x4000_0000..=
/// 0x6FFF_FFFF` and commits the host backing for `MmAllocatePhysicalMemoryEx`
/// there, so this write-combine mirror is the canonical home of physical DRAM.
/// Keep in sync with `KernelState::heap_cursor`'s initial value.
///
/// [`physical_to_backing`] ORs this base onto the low 29 bits
/// (`addr & 0x1FFF_FFFF`) of every physical-mirror address it re-projects.
pub const PHYSICAL_BACKING_BASE: u32 = 0x4000_0000;

/// Map a guest *physical* address onto the guest-virtual window where its
/// host backing is actually committed.
///
/// Physical addresses reach the emulator through the Vd/GPU ABI and inside
/// PM4 pointers (`INDIRECT_BUFFER`, `WAIT_REG_MEM`-memory, `MEM_WRITE`,
/// `EVENT_WRITE*`, `IM_LOAD`, …).
///
/// # Background
///
/// The Xbox 360 exposes its 512 MB of physical DRAM through several virtual
/// mirror windows that differ only in cache policy: bare physical
/// (`0x0xxxxxxx`), write-combine (`0x4xxxxxxx`), and the cached
/// `0xA/0xC/0xExxxxxxx` mirrors. They all alias `addr & 0x1FFF_FFFF`; on real
/// hardware (and in xenia-canary, via overlapping `mmap`s) they are literally
/// the same bytes.
///
/// This emulator has a single flat `membase`, and
/// `MmAllocatePhysicalMemoryEx` commits physical backing in the write-combine
/// `0x4xxxxxxx` window. The guest masks its allocation base down to *bare
/// physical* before handing it to `VdInitializeRingBuffer` /
/// `VdEnableRingBufferRPtrWriteBack`, and PM4 pointers are bare-physical too.
/// A flat `membase + phys` access would therefore land on a never-committed,
/// zero-filled page rather than the committed `0x4xxxxxxx` backing — the GPU
/// decoded zero PM4 headers and never ran the real command stream.
///
/// # Behaviour
///
/// Any address whose top nibble selects a physical mirror (`0x0`, `0x1`,
/// `0x4`, `0xA..=0xF`) is rewritten to
/// `PHYSICAL_BACKING_BASE | (addr & 0x1FFF_FFFF)`, which is the page
/// `heap_alloc` actually backed regardless of which mirror the guest used
/// (and leaves a `0x4xxxxxxx` address unchanged). Every other address is
/// returned untouched. The projection follows `heap_alloc`'s placement rather
/// than a guess — if that window ever moves, `PHYSICAL_BACKING_BASE` must
/// move with it.
///
/// # Scope
///
/// Apply this only at the GPU/Vd boundary, where addresses arrive in their
/// bare-physical form — NOT on the CPU's flat load/store path. The guest CPU
/// already reaches its allocations through the `0x4xxxxxxx` base, and
/// non-physical guest-virtual addresses (image `0x82xxxxxx`, stacks
/// `0x7xxxxxxx`) must stay flat.
#[inline]
pub fn physical_to_backing(addr: u32) -> u32 {
    // Offset of a byte within the 512 MB physical space shared by all mirrors.
    const MIRROR_OFFSET_MASK: u32 = 0x1FFF_FFFF;

    // The top nibble identifies the 256 MB window the address falls in.
    let window = addr >> 28;
    let is_physical_mirror = matches!(window, 0x0 | 0x1 | 0x4 | 0xA..=0xF);

    if is_physical_mirror {
        PHYSICAL_BACKING_BASE | (addr & MIRROR_OFFSET_MASK)
    } else {
        addr
    }
}

/// Highest guest page-version found across the `[base, base+len)` span.
///
/// The span is walked one 4 KiB page at a time through the `MemoryAccess`
/// trait's `page_version`. The end of the span saturates at `u32::MAX`, and a
/// zero `len` still probes the single page that contains `base`.
///
/// The concrete heap exposes an inherent `max_page_version(base, len)`, but
/// the draw handler only holds `&dyn MemoryAccess` (which carries the coarser
/// `page_version(addr)` accessor). This is byte-equivalent to
/// `heap::max_page_version` and stays a pure function of the per-page write
/// counters (no wall-clock), so texture-decode timing remains deterministic.
fn span_max_version(mem: &dyn MemoryAccess, base: u32, len: u32) -> u64 {
    const PAGE_SIZE: u32 = 0x1000;
    const PAGE_MASK: u32 = !(PAGE_SIZE - 1);

    // Address of the last byte in the span, clamped to the top of the
    // 32-bit address space.
    let final_byte = base.saturating_add(len.saturating_sub(1));
    let first_page = base & PAGE_MASK;
    let final_page = final_byte & PAGE_MASK;

    // `first_page <= final_page` always holds, so at least one page is probed.
    (first_page..=final_page)
        .step_by(PAGE_SIZE as usize)
        .map(|page| mem.page_version(page))
        .fold(0, u64::max)
}

/// Cached Xenos microcode blob, produced by `PM4_IM_LOAD*` packets.
///
/// Instances are stored in `GpuSystem::shader_blobs` and inserted through
/// `GpuSystem::insert_shader_blob`.
#[derive(Debug, Clone)]
pub struct ShaderBlob {
    /// Shader stage this blob belongs to: 0 = vertex, 1 = pixel.
    pub shader_type: u8,
    /// The microcode words themselves.
    /// NOTE(review): presumably copied verbatim from the `IM_LOAD*` payload —
    /// confirm byte order against the load handler.
    pub dwords: Vec<u32>,
}

/// P8 — upper bound on the shader-blob cache (`GpuSystem.shader_blobs`).
/// Canary uses a similar FIFO ceiling; our number is deliberately generous
/// because blobs are small (a few KiB each at most) and misses force a
/// re-upload from the guest the next time `IM_LOAD*` fires. 256 is enough
/// for every shipping game's peak working set, per canary's traces.
///
/// Enforced by `GpuSystem::insert_shader_blob`, which evicts the oldest
/// non-active keys once the map holds more than this many entries.
/// `GpuSystem::new` pre-sizes the FIFO to `SHADER_BLOB_CAP + 1` entries.
pub const SHADER_BLOB_CAP: usize = 256;

/// PM4 `WAIT_REG_MEM` condition. Bits 0..=2 of the wait_info dword encode the
/// comparison (per `xenia-canary/src/xenia/gpu/pm4_command_processor_implement.h:685-696`).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum WaitCmp {
    /// value < ref
    Less,
    /// value <= ref
    LessEq,
    /// value == ref
    Equal,
    /// value != ref
    NotEqual,
    /// value >= ref
    GreaterEq,
    /// value > ref
    Greater,
    /// Always matches — selector bit 7 is hard-wired to 1.
    Always,
    /// Never matches — `wait_info & 7 == 0` selects bit 0 of canary's
    /// selector word, which is always zero.
    Never,
}

impl WaitCmp {
    /// Decode the comparison from the low 3 bits of `wait_info`, following
    /// canary's `MatchValueAndRef`
    /// (`pm4_command_processor_implement.h:685-696`).
    ///
    /// Canary builds a selector word
    /// `((value<ref)<<1) | ((value<=ref)<<2) | ((value==ref)<<3) |
    /// ((value!=ref)<<4) | ((value>=ref)<<5) | ((value>ref)<<6) | (1<<7)` and
    /// tests `(selector >> (wait_info & 7)) & 1`, so `wait_info & 7` is a bit
    /// position: 1=Less, 2=LessEq, 3=Equal, 4=NotEqual, 5=GreaterEq,
    /// 6=Greater, 7=always-true, 0=never (bit 0 is always clear). All higher
    /// bits of `wait_info` are ignored.
    ///
    /// GPUBUG: the prior mapping was off by one (it started at `0 => Less`),
    /// so `wait_info & 7 == 3` decoded as `NotEqual` instead of `Equal`. That
    /// inverted the standard CP coherency wait
    /// (`WAIT_REG_MEM COHER_STATUS_HOST, Equal 0`): the GPU parked forever on
    /// the first INDIRECT_BUFFER and never reached any draw.
    pub fn from_wait_info(wait_info: u32) -> Self {
        // Indexed by selector bit position; slot 0 is the always-clear bit.
        const BY_SELECTOR_BIT: [WaitCmp; 8] = [
            WaitCmp::Never,
            WaitCmp::Less,
            WaitCmp::LessEq,
            WaitCmp::Equal,
            WaitCmp::NotEqual,
            WaitCmp::GreaterEq,
            WaitCmp::Greater,
            WaitCmp::Always,
        ];
        // The mask keeps the index within 0..=7, so this cannot go out of bounds.
        BY_SELECTOR_BIT[(wait_info & 0x7) as usize]
    }

    /// Apply the comparison to `value` (left-hand side) and `reference`
    /// (right-hand side), both treated as unsigned 32-bit integers.
    pub fn evaluate(self, value: u32, reference: u32) -> bool {
        let ordering = value.cmp(&reference);
        match self {
            WaitCmp::Less => ordering.is_lt(),
            WaitCmp::LessEq => ordering.is_le(),
            WaitCmp::Equal => ordering.is_eq(),
            WaitCmp::NotEqual => ordering.is_ne(),
            WaitCmp::GreaterEq => ordering.is_ge(),
            WaitCmp::Greater => ordering.is_gt(),
            WaitCmp::Always => true,
            WaitCmp::Never => false,
        }
    }
}

/// Reason the GPU is currently parked. Mirrors the CPU-side scheduler
/// `BlockReason` shape. Currently only `WaitRegMem` — more to come in later
/// phases (interrupts, timestamp waits).
#[derive(Debug, Clone)]
pub enum GpuBlock {
    /// Parked on a `WAIT_REG_MEM` packet whose condition has not held yet.
    WaitRegMem {
        /// Address to poll: a guest memory address when `is_memory` is set,
        /// otherwise a register-file index.
        poll_addr: u32,
        /// Selects the source of the polled value (memory vs. register file).
        is_memory: bool,
        /// Right-hand side of the comparison.
        reference: u32,
        /// AND-mask applied to the polled value before comparing.
        mask: u32,
        /// Comparison operator.
        cmp: WaitCmp,
    },
}

impl GpuBlock {
    /// Probe the wait condition once, without modifying any state.
    ///
    /// Reads the current value from `mem` or `reg_file` (depending on
    /// `is_memory`), masks it, and compares it against the reference.
    /// Returns `true` if the condition holds and the GPU should be unparked.
    pub fn is_satisfied(&self, mem: &dyn MemoryAccess, reg_file: &RegisterFile) -> bool {
        match *self {
            GpuBlock::WaitRegMem {
                poll_addr,
                is_memory,
                reference,
                mask,
                cmp,
            } => {
                let polled = match is_memory {
                    true => mem.read_u32(poll_addr),
                    false => reg_file.read(poll_addr),
                };
                // Only the polled value is masked; the reference is used as-is.
                cmp.evaluate(polled & mask, reference)
            }
        }
    }
}

/// Public notification the CP emits when the guest presents a frame. The
/// kernel `vd_swap` path gates on this to push a `SwapInfo` to the host UI.
///
/// The most recent one is kept in `GpuSystem::last_swap`.
#[derive(Debug, Clone, Copy, Default)]
pub struct SwapNotification {
    /// Index of the presented frame.
    /// NOTE(review): presumably taken from `GpuSystem::swap_counter` — confirm
    /// in the `XE_SWAP` handler.
    pub frame_index: u64,
    /// Address of the front buffer being presented; the name suggests a guest
    /// *physical* address — TODO confirm whether it has been re-projected.
    pub frontbuffer_phys: u32,
    /// Front-buffer width (presumably in pixels).
    pub width: u32,
    /// Front-buffer height (presumably in pixels).
    pub height: u32,
}

/// GPU interrupt (fired by `PM4_INTERRUPT`). The kernel dispatches this to
/// the guest callback registered by `VdSetGraphicsInterruptCallback`.
///
/// Queued in `GpuSystem`'s private `pending_interrupts` list until drained.
#[derive(Debug, Clone, Copy)]
pub struct PendingInterrupt {
    /// Which part of the GPU raised the interrupt.
    pub source: InterruptSource,
    /// NOTE(review): presumably a bitmask of the guest CPUs that should
    /// receive the interrupt — confirm against the `PM4_INTERRUPT` handler.
    pub cpu_mask: u32,
}

/// Origin of a [`PendingInterrupt`].
#[derive(Debug, Clone, Copy)]
pub enum InterruptSource {
    /// Raised by the command processor.
    /// NOTE(review): presumably the `PM4_INTERRUPT` path — confirm at the emit site.
    CommandProcessor,
    /// Raised in connection with a frame swap — confirm at the emit site.
    Swap,
}

/// Per-run counters for observability.
///
/// All counters start at zero (`Default`) and are only ever incremented.
#[derive(Debug, Clone, Default)]
pub struct GpuStats {
    /// Packets fully consumed by `GpuSystem::execute_one` (bumped once per
    /// `ExecOutcome::Stepped`; blocked or idle calls do not count).
    pub packets_executed: u64,
    /// Draw packets observed — presumably bumped by the `DRAW_INDX*` handlers.
    pub draws_seen: u64,
    /// Swap packets observed — presumably bumped by the `XE_SWAP` handler.
    pub swaps_seen: u64,
    /// Interrupts queued for the guest — presumably one per `PM4_INTERRUPT`.
    pub interrupts_emitted: u64,
    /// Times a `WAIT_REG_MEM` parked the GPU — confirm at the handler.
    pub wait_reg_mem_blocks: u64,
    /// `INDIRECT_BUFFER` packets followed — confirm at the handler.
    pub indirect_buffer_jumps: u64,
    /// P4: count of EDRAM→memory resolves triggered by `TILE_FLUSH` events
    /// (event code 15). Non-zero means the game is committing rendered
    /// pixels to the frontbuffer / a texture.
    pub resolves_total: u64,
    /// Resolves whose byte copy path ran and wrote at least one sample to
    /// guest memory. Delta against `resolves_total` indicates how many
    /// resolves were skipped for an unsupported format / MSAA mode / 3D
    /// destination. A supported resolve that wrote zero samples is counted
    /// neither here nor in `resolves_skipped_total`.
    pub resolves_copied_total: u64,
    /// Resolves that were skipped by [`crate::resolve::copy_to_memory`] due
    /// to an unsupported format path. Logged at `warn` so the reason is
    /// visible.
    pub resolves_skipped_total: u64,
    /// Total number of 32bpp samples written into guest memory across all
    /// successful resolves. Useful for sanity-checking that a big splash
    /// frame actually made it out (e.g. 1280×720 = 921_600 samples).
    pub resolve_samples_written: u64,
    /// P4: unique render-target keys seen (as managed by the internal
    /// `RenderTargetCache`). Useful HUD metric for multi-target workloads.
    pub unique_render_targets: u64,
}

/// Result of one packet step, as returned by `GpuSystem::execute_one`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ExecOutcome {
    /// Consumed one packet; GPU remains Ready. `dwords_consumed` is the
    /// number of dwords the read pointer advanced by, header included.
    Stepped { dwords_consumed: u32 },
    /// Nothing to do right now.
    Idle,
    /// Parked on a sync primitive; `GpuSystem::pending_block` has details.
    /// Returned both when a packet newly parks the GPU and when a probe of an
    /// existing park finds its condition still unsatisfied.
    Blocked,
}

/// Base of the ALU constant window.
/// NOTE(review): meaning inferred from the name only; the spacing to
/// `CONST_BASE_FETCH` gives it 0x800 dwords — confirm against the
/// `PM4_SET_CONSTANT` handler.
pub const CONST_BASE_ALU: u32 = 0x4000;
/// Shader fetch constants: the game uploads them via `PM4_SET_CONSTANT` type=1
/// into a 256-dword region. Games then reference them by index when binding
/// textures / vertex buffers. (The 0x100-dword gap up to `CONST_BASE_BOOL`
/// matches that 256-dword size.)
pub const CONST_BASE_FETCH: u32 = 0x4800;
/// Base of the boolean constant window (8 dwords up to `CONST_BASE_LOOP`) —
/// inferred from the name; confirm against the handler.
pub const CONST_BASE_BOOL: u32 = 0x4900;
/// Base of the loop constant window — inferred from the name; confirm
/// against the handler.
pub const CONST_BASE_LOOP: u32 = 0x4908;
/// Base used for the plain-register form of `PM4_SET_CONSTANT` — inferred
/// from the name; confirm against the handler.
pub const CONST_BASE_REGISTERS: u32 = 0x2000;

/// Atomic mailbox for the handful of GPU registers that CROSS the MMIO
/// boundary. Guests write into the `0x7FC80000` register aperture; those
/// writes run through [`crate::mmio_region`] and land in these atomics.
/// Inside `execute_one` / the scheduler's per-round GPU hook we sample them
/// to sync `ring.write_offset_dwords`, reflect progress back to the guest,
/// etc.
///
/// Only the registers listed here need atomic cross-thread access (plus the
/// worker wake-up flag and thread handle). Everything else lives in
/// [`GpuSystem::register_file`] which is CPU-thread-local.
///
/// Every field is an `Arc`, so cloning a `GpuMmio` produces another handle to
/// the *same* cells rather than an independent copy.
#[derive(Debug, Clone)]
pub struct GpuMmio {
    /// `CP_RB_WPTR` — guest writes dword offset of the write pointer.
    pub cp_rb_wptr: Arc<AtomicU32>,
    /// `CP_RB_RPTR` — we mirror our internal `ring.read_offset_dwords` here
    /// so guests polling the register see progress.
    pub cp_rb_rptr: Arc<AtomicU32>,
    /// `CP_INT_STATUS` — bit set when an interrupt is pending.
    pub cp_int_status: Arc<AtomicU32>,
    /// `CP_INT_ACK` — guest clears the bit after handling.
    pub cp_int_ack: Arc<AtomicU32>,
    /// `D1MODE_VBLANK_VLINE_STATUS` (register `0x1951`, MMIO offset `0x6544`).
    /// Bit 0 = `VBLANK_INT_OCCURRED` — set by the GPU when vblank fires,
    /// cleared by the guest via write-1-to-clear. Sylpheed's vsync callback
    /// gates *all* its work on reading bit 0 as set: `lwz; rlwinm. r,r,0,31,31;
    /// bc 12,2,skip`. Without this bit toggling across vsyncs the callback
    /// always skips, so the PKEVENT that feeds the render dispatcher
    /// (user_data + 0x3B28) never gets signaled and the worker loops
    /// forever.
    pub d1mode_vblank_vline_status: Arc<AtomicU32>,
    /// M1.7 parker — set by producers (guest WPTR writes, shutdown) so
    /// the GPU worker thread does not park when work is pending. The
    /// worker swaps to `false` on entering its park decision and
    /// re-checks predicates; if a producer raced between the swap and
    /// the actual `park_timeout`, the producer's `unpark()` returns the
    /// park immediately via std's token semantics. Inline mode never
    /// reads this; the cost is one extra atomic store per WPTR write.
    pub wake_pending: Arc<AtomicBool>,
    /// Handle to the GPU worker thread, populated by `GpuWorker::run` on
    /// startup. The MMIO write callback for `CP_RB_WPTR` `unpark()`s it
    /// after every guest WPTR write so the worker proceeds without
    /// waiting for its `park_timeout`. `None` in inline mode (no worker
    /// to wake), in which case the unpark site is a one-mutex-lock
    /// no-op.
    pub worker_thread: Arc<std::sync::Mutex<Option<std::thread::Thread>>>,
}

impl GpuMmio {
    /// Create a mailbox with every register reading zero, no wake pending and
    /// no worker thread registered. Each field gets its own allocation.
    pub fn new() -> Self {
        // Every register cell starts out as an independently allocated zero.
        let zeroed_register = || Arc::new(AtomicU32::new(0));
        Self {
            cp_rb_wptr: zeroed_register(),
            cp_rb_rptr: zeroed_register(),
            cp_int_status: zeroed_register(),
            cp_int_ack: zeroed_register(),
            d1mode_vblank_vline_status: zeroed_register(),
            wake_pending: Arc::new(AtomicBool::new(false)),
            worker_thread: Arc::new(std::sync::Mutex::new(None)),
        }
    }
}

impl Default for GpuMmio {
    /// Same as [`GpuMmio::new`].
    fn default() -> Self {
        GpuMmio::new()
    }
}

/// Live GPU system. One instance per `KernelState`.
///
/// Holds the register file, the active ring (plus the stack of rings saved by
/// indirect-buffer nesting), and the per-draw / per-frame state that later
/// phases and the host UI consume.
pub struct GpuSystem {
    /// Register file, local to the thread that runs the executor.
    pub register_file: RegisterFile,
    /// The ring currently being executed: the primary ring, or the innermost
    /// indirect buffer while one is running.
    pub ring: RingBufferView,
    /// Stack of saved rings for `PM4_INDIRECT_BUFFER` nesting. The active
    /// ring is always `ring`; when an IB packet arrives, we push `ring` onto
    /// this stack and replace `ring` with the IB view. On IB completion
    /// (read pointer catches up to size), we pop. While any IB is running the
    /// primary ring is the bottom (first) entry.
    ib_stack: Vec<RingBufferView>,
    /// Cached shader blobs keyed by the raw CP register address that loaded them.
    pub shader_blobs: HashMap<u32, ShaderBlob>,
    /// P8 — FIFO of blob keys for bounded eviction. On `IM_LOAD*` the
    /// new key is pushed to the back; if the blob count exceeds
    /// [`SHADER_BLOB_CAP`], the front is popped and removed from
    /// `shader_blobs`. Prevents long-running guests from growing the
    /// cache without bound. The two *active* keys (`active_vs_key` +
    /// `active_ps_key`) are never evicted — `insert_shader_blob` rotates them
    /// to the back of the queue instead of removing them.
    pub shader_blob_order: std::collections::VecDeque<u32>,
    /// Monotonic frame counter (bumped on `PM4_XE_SWAP`).
    pub swap_counter: u64,
    /// Most recent swap notification; the kernel polls this after `execute_one`
    /// to decide whether to push a UI swap event.
    pub last_swap: Option<SwapNotification>,
    /// Queue of interrupts not yet delivered to the guest. Private so that
    /// callers go through [`Self::take_pending_interrupts`] — M1 step 6
    /// then redirects this drain into a `crossbeam_channel::Sender` without
    /// re-touching every call site.
    pending_interrupts: Vec<PendingInterrupt>,
    /// Current stall reason, if any.
    pub pending_block: Option<GpuBlock>,
    /// Observability counters; see [`GpuStats`].
    pub stats: GpuStats,
    /// For the 64-bit bin mask/select we split hi/lo writes.
    pub bin_mask: u64,
    /// 64-bit bin select; see `bin_mask`.
    pub bin_select: u64,
    /// Shared-atomic mailbox for the `0x7FC80000` MMIO aperture. Cloned into
    /// the [`crate::mmio_region`] callbacks; clone-ownership keeps the bus
    /// side and the executor side in sync without locks.
    pub mmio: GpuMmio,
    /// Most recent draw state extracted at a `PM4_DRAW_INDX*` packet. The
    /// uber-shader pipeline in P3+ reads this to build its wgpu draw call.
    pub last_draw: Option<DrawState>,
    /// Most recent processed primitive — index rewrite + host topology
    /// decision. Separate from `last_draw` because its `rewritten_indices`
    /// may be large and callers may want to drop it after consumption.
    pub last_primitive: Option<ProcessedPrimitive>,
    /// Key in `shader_blobs` of the currently-active vertex shader. Set by
    /// `PM4_IM_LOAD[_IMMEDIATE]` for `ShaderType::kVertex` and read at
    /// `PM4_DRAW_INDX*` time so the host side can upload the matching
    /// microcode bytes before dispatching.
    pub active_vs_key: Option<u32>,
    /// Key in `shader_blobs` of the currently-active pixel shader. Set by
    /// `PM4_IM_LOAD[_IMMEDIATE]` for `ShaderType::kPixel`.
    pub active_ps_key: Option<u32>,
    /// P4: EDRAM tile-ownership bookkeeping + per-key RT descriptors. Updated
    /// at each `PM4_DRAW_INDX*` (`bind` + `claim_tiles`) and queried by
    /// `TILE_FLUSH` event handling to decide resolve sources.
    pub rt_cache: crate::render_target_cache::RenderTargetCache,
    /// P4: most recent `ResolveInfo` observed when `TILE_FLUSH` fired. The UI
    /// bridge surfaces this in the HUD so users can tell when a game is
    /// resolving to the frontbuffer versus an off-screen target.
    pub last_resolve: Option<crate::draw_state::ResolveInfo>,
    /// P5: CPU-side decoded-texture cache (shared across draws within a
    /// frame; trimmed implicitly by insertion). `ensure_cached` hits this
    /// on every texture-fetch resolution; the UI thread sees the decoded
    /// bytes via `UiBridge::publish_texture`.
    pub texture_cache: crate::texture_cache::TextureCache,
    /// P5b: textures decoded at the most recent `PM4_DRAW_INDX*`, keyed off
    /// the *active* pixel shader's real `tfetch` fetch-constant slots (not a
    /// hardcoded slot). `vd_swap` publishes the first of these to the UI so
    /// the replay binds the texture the draw actually samples. Cleared and
    /// repopulated each draw; empty when the active PS issues no `tfetch`.
    pub last_draw_textures: Vec<(crate::texture_cache::TextureKey, Vec<u8>)>,
    /// 10 MiB shadow of the Xenos EDRAM. Written by clear-resolves and
    /// (future) host-render-target readback; read by the resolve byte-copy
    /// path that writes tiled pixels into guest memory. Allocated once at
    /// `GpuSystem::new` and lives for the whole GPU lifetime — no
    /// per-frame churn.
    pub edram: crate::edram::ShadowEdram,
    /// UI-only: when `Some`, every `PM4_DRAW_INDX*` appends a
    /// [`crate::draw_capture::DrawCapture`] here so the host UI can replay the
    /// real guest geometry. `None` in headless/deterministic mode — the
    /// `--gpu-inline` golden never enables this, so capture is entirely inert
    /// for `check`. Drained (taken) by `vd_swap` at each present.
    pub frame_captures: Option<Vec<crate::draw_capture::DrawCapture>>,
}

impl GpuSystem {
    pub fn new() -> Self {
        Self {
            register_file: RegisterFile::new(),
            ring: RingBufferView::new(),
            ib_stack: Vec::new(),
            shader_blobs: HashMap::new(),
            shader_blob_order: std::collections::VecDeque::with_capacity(SHADER_BLOB_CAP + 1),
            swap_counter: 0,
            last_swap: None,
            pending_interrupts: Vec::new(),
            pending_block: None,
            stats: GpuStats::default(),
            bin_mask: 0,
            bin_select: 0,
            mmio: GpuMmio::new(),
            last_draw: None,
            last_primitive: None,
            active_vs_key: None,
            active_ps_key: None,
            rt_cache: crate::render_target_cache::RenderTargetCache::new(),
            last_resolve: None,
            texture_cache: crate::texture_cache::TextureCache::new(),
            last_draw_textures: Vec::new(),
            edram: crate::edram::ShadowEdram::new(),
            frame_captures: None,
        }
    }

    /// Turn on per-draw geometry capture for the host UI by installing an
    /// empty capture list. Calling it again keeps the existing list (and any
    /// captures already in it), so it is idempotent. Inert (and never called)
    /// in headless/deterministic mode.
    pub fn enable_frame_capture(&mut self) {
        // Only a `None` slot is filled; an existing list is left untouched.
        let _ = self.frame_captures.get_or_insert_with(Vec::new);
    }

    /// P8 — store a shader blob under `key` and keep the cache bounded at
    /// [`SHADER_BLOB_CAP`] entries so long-running games can't grow
    /// `shader_blobs` forever.
    ///
    /// A key seen for the first time is appended to the FIFO; re-loading an
    /// existing key replaces the blob but keeps its queue position. While the
    /// cache is over the cap, the oldest key is evicted — except that the
    /// currently-active VS/PS keys are never evicted: when one reaches the
    /// front of the queue it is rotated to the back and the search continues.
    /// If every queued key turns out to be active, eviction stops and the cap
    /// is temporarily exceeded.
    fn insert_shader_blob(&mut self, key: u32, blob: ShaderBlob) {
        // `insert` returns the previous value, so `None` means a new key.
        let first_seen = self.shader_blobs.insert(key, blob).is_none();
        if first_seen {
            self.shader_blob_order.push_back(key);
            metrics::counter!("gpu.shader.blob_seen").increment(1);
        }

        while self.shader_blobs.len() > SHADER_BLOB_CAP {
            // Look at each queued key at most once per eviction round.
            let mut attempts_left = self.shader_blob_order.len();
            let mut victim = None;
            while attempts_left > 0 {
                attempts_left -= 1;
                let Some(oldest) = self.shader_blob_order.pop_front() else {
                    break;
                };
                let is_active =
                    [self.active_vs_key, self.active_ps_key].contains(&Some(oldest));
                if is_active {
                    // Protected: send it to the back and try the next one.
                    self.shader_blob_order.push_back(oldest);
                } else {
                    self.shader_blobs.remove(&oldest);
                    victim = Some(oldest);
                    break;
                }
            }

            if victim.is_none() {
                // All remaining blobs are active — can't evict, stop.
                break;
            }
            metrics::counter!("gpu.shader.blob_evicted").increment(1);
        }
    }

    /// P4: shared event handler invoked by every `PM4_EVENT_WRITE*` variant.
    ///
    /// `event_code` is the low 6 bits of the initiator word (see canary's
    /// `xenos::Event` enum). Only `TILE_FLUSH` (code 15, the resolve trigger)
    /// does anything here; every other code returns immediately.
    ///
    /// For `TILE_FLUSH` the live `RB_*` register state is decoded into a
    /// `ResolveInfo`, any enabled clear values are painted into the shadow
    /// EDRAM, and bytes are then copied from the source render target into
    /// guest memory at `RB_COPY_DEST_BASE`. That last step is what `VdSwap`
    /// needs to see non-empty front-buffer pixels — see
    /// `memory/project_xenia_rs_edram_resolve_gap.md` for the history of this
    /// path. The decoded info is kept in `last_resolve` and the resolve
    /// counters in `stats` are updated.
    fn handle_event_initiator(&mut self, event_code: u32, mem: &dyn MemoryAccess) {
        const EVENT_TILE_FLUSH: u32 = 15;
        if event_code != EVENT_TILE_FLUSH {
            return;
        }

        let info =
            draw_state::ResolveInfo::from_register_file_and_memory(&self.register_file, mem);
        self.stats.resolves_total += 1;
        metrics::counter!(
            "gpu.resolve",
            "src" => info.copy_src_select.to_string(),
            "fmt" => info.dest_format.to_string(),
            "cmd" => info.copy_command.to_string(),
        )
        .increment(1);
        tracing::info!(
            src = info.copy_src_select,
            dst_base = format_args!("{:#010x}", info.dest_base),
            w = info.coords.width,
            h = info.coords.height,
            pitch = info.dest_pitch_pixels,
            fmt = info.dest_format,
            endian = info.dest_endian,
            clear_color = info.color_clear_enable,
            clear_depth = info.depth_clear_enable,
            "gpu: TILE_FLUSH resolve"
        );

        // Resolve rectangle in *sample* coordinates: pixel coordinates scaled
        // up by `sample_count_log2_x/y` — for 1x MSAA (Sylpheed) this is the
        // identity. Kept lazy so it is only computed when a clear runs.
        let sample_rect = || {
            let c = &info.coords;
            (
                c.x0 << c.sample_count_log2_x,
                c.y0 << c.sample_count_log2_y,
                c.width << c.sample_count_log2_x,
                c.height << c.sample_count_log2_y,
            )
        };
        let pitch_tiles = info.surface_pitch_tiles;

        // Clears are painted into the shadow EDRAM *before* the copy. Games
        // often issue a clear-then-resolve as a single TILE_FLUSH: the EDRAM
        // is filled with `RB_COLOR_CLEAR` by the clear part, and that's what
        // the copy part reads.
        let clears_color = info.color_clear_enable
            && matches!(info.source, draw_state::ResolveSource::Color(_))
            && pitch_tiles > 0;
        if clears_color {
            let (sx, sy, sw, sh) = sample_rect();
            if info.source_is_64bpp {
                // 64bpp clears use both `RB_COLOR_CLEAR_LO` (low 32 bits) and
                // `RB_COLOR_CLEAR` (high 32 bits) per Canary
                // `draw_util.cc:1302-1303`.
                self.edram.fill_rect_64bpp(
                    info.source_base_tiles,
                    pitch_tiles,
                    sx,
                    sy,
                    sw,
                    sh,
                    info.color_clear_value_lo,
                    info.color_clear_value,
                );
            } else {
                // 32bpp clears ignore the lo word entirely.
                self.edram.fill_rect_32bpp(
                    info.source_base_tiles,
                    pitch_tiles,
                    sx,
                    sy,
                    sw,
                    sh,
                    info.color_clear_value,
                );
            }
        }

        let clears_depth = info.depth_clear_enable && pitch_tiles > 0;
        if clears_depth {
            let (sx, sy, sw, sh) = sample_rect();
            // Depth tiles live at RB_DEPTH_INFO.depth_base (low 12 bits)
            // regardless of which source this resolve selects.
            let depth_base =
                (self.register_file.read(draw_state::reg::RB_DEPTH_INFO) & 0xFFF) as u16;
            self.edram.fill_rect_32bpp(
                depth_base,
                pitch_tiles,
                sx,
                sy,
                sw,
                sh,
                info.depth_clear_value,
            );
        }

        // Byte copy into guest memory, then account for the outcome.
        let copy = crate::resolve::copy_to_memory(&info, &self.edram, mem);
        match (copy.supported, copy.samples_written) {
            (false, _) => self.stats.resolves_skipped_total += 1,
            (true, written) if written > 0 => {
                self.stats.resolves_copied_total += 1;
                self.stats.resolve_samples_written += written as u64;
            }
            // Supported, but nothing was written: counted in neither bucket.
            (true, _) => {}
        }

        self.last_resolve = Some(info);
    }

    /// Exchange ring pointers with the MMIO atomic mailbox. Call once at the
    /// top of the per-round GPU hook: the guest may have advanced
    /// `CP_RB_WPTR` since the last run, and our read pointer is reflected
    /// back into the mirror register so the guest sees progress.
    ///
    /// `CP_RB_WPTR` governs ONLY the primary ring. While an indirect buffer
    /// is executing, the active `self.ring` is a fixed linear sub-stream and
    /// the primary ring sits at the bottom of the IB stack — applying the
    /// (primary) write pointer to the IB would corrupt its extent (e.g.
    /// `wptr % ib_size`) and strand the GPU mid-buffer. The same primary ring
    /// supplies the read pointer that is mirrored to `CP_RB_RPTR`.
    ///
    /// GPUBUG-006: under `--parallel`, the producer (the guest CP_RB_WPTR
    /// MMIO write) uses `Release` to publish prior ring-memory writes;
    /// the consumer here must `Acquire`-load to pair correctly. With
    /// Relaxed-on-load, ring-memory writes that the guest performed
    /// before bumping WPTR could be reordered past our subsequent reads
    /// — leading to garbage PM4 packet contents. The producer side at
    /// `mmio_region.rs:78` already uses Release; the consumer's Relaxed
    /// was the missing half. Symmetrically, the RPTR mirror store
    /// publishes our read progress to the guest and benefits from a
    /// Release.
    pub fn sync_with_mmio(&mut self) {
        let guest_wptr = self.mmio.cp_rb_wptr.load(Ordering::Acquire);

        // Bottom of the IB stack if an indirect buffer is running, else the
        // active ring itself.
        let primary_ring = match self.ib_stack.first_mut() {
            Some(saved_primary) => saved_primary,
            None => &mut self.ring,
        };

        // A zero-sized ring has not been set up yet; leave it alone rather
        // than divide by zero.
        let ring_size = primary_ring.size_dwords;
        if ring_size != 0 && primary_ring.write_offset_dwords != guest_wptr {
            primary_ring.write_offset_dwords = guest_wptr % ring_size;
        }

        // Mirror the *primary* read pointer (Release pairs with any
        // guest-side Acquire-load of CP_RB_RPTR for ring writeback
        // bookkeeping).
        self.mmio
            .cp_rb_rptr
            .store(primary_ring.read_offset_dwords, Ordering::Release);
    }

    /// True iff `execute_one` is expected to make progress without blocking.
    ///
    /// While parked, readiness is exactly "the wait condition now holds".
    /// Otherwise the GPU is ready when the active ring or any saved caller
    /// ring on the IB stack has pending work: an exhausted IB still needs
    /// `execute_one` to pop back and resume the primary ring, whose WPTR may
    /// have since advanced.
    pub fn is_ready(&self, mem: &dyn MemoryAccess) -> bool {
        match &self.pending_block {
            Some(parked) => parked.is_satisfied(mem, &self.register_file),
            None => std::iter::once(&self.ring)
                .chain(self.ib_stack.iter())
                .any(|ring| ring.has_pending()),
        }
    }

    /// Execute exactly one PM4 packet. Returns [`ExecOutcome::Idle`] when
    /// there's nothing to do, [`ExecOutcome::Blocked`] if a sync primitive
    /// stalls the GPU, otherwise [`ExecOutcome::Stepped`] with the number of
    /// dwords consumed (counting the header).
    ///
    /// Steps, in order:
    ///
    /// 0. If parked, re-probe the wait condition; stay `Blocked` unless it
    ///    now holds.
    /// 1. Unwind every exhausted indirect buffer until a ring with pending
    ///    work is active. `Idle` is returned only when the active ring *and*
    ///    every saved caller ring are exhausted, which keeps the result
    ///    consistent with [`Self::is_ready`] (which also looks at the whole
    ///    IB stack).
    /// 2. Decode the packet at the read pointer and dispatch it by type.
    /// 3. Advance the read pointer, write it back, and bump
    ///    `stats.packets_executed`.
    ///
    /// # Panics
    ///
    /// Panics if the active ring reports pending work but cannot produce the
    /// address of its next dword — that would be a broken ring invariant.
    pub fn execute_one(&mut self, mem: &dyn MemoryAccess) -> ExecOutcome {
        // 0) If currently parked, probe the condition and either wake up or stay blocked.
        if let Some(block) = self.pending_block.clone() {
            // Re-service the CP coherency handshake on each probe so a
            // COHER_STATUS_HOST wait can clear (canary does this in its WAIT
            // loop body, not just at entry).
            if let GpuBlock::WaitRegMem { poll_addr, is_memory: false, .. } = &block {
                self.make_coherent(*poll_addr);
            }
            if !block.is_satisfied(mem, &self.register_file) {
                return ExecOutcome::Blocked;
            }
            tracing::debug!(?block, "gpu: wait satisfied — resuming");
            self.pending_block = None;
        }

        // 1) End of the current ring: pop back through the IB stack until a
        //    caller with pending work is found. Popping a single level is not
        //    enough — when an IB was the last packet of its own caller, that
        //    caller is exhausted too, and stopping there would report `Idle`
        //    while a ring further down still has packets queued.
        while !self.ring.has_pending() {
            let Some(caller) = self.ib_stack.pop() else {
                return ExecOutcome::Idle;
            };
            self.ring = caller;
        }

        // 2) Fetch + decode the header at the read pointer.
        let header_addr = self
            .ring
            .addr_at_offset(0)
            .expect("ring with pending work must expose the address of its next dword");
        let header_word = mem.read_u32(header_addr);
        let packet = pm4::decode(header_word);
        tracing::trace!(
            header = format_args!("{header_word:#010x}"),
            addr = format_args!("{header_addr:#010x}"),
            ?packet.kind,
            "gpu: packet"
        );
        let consumed = match packet.kind {
            PacketKind::Type0 { base_index, count, write_one } => {
                self.handle_type0(mem, base_index, count, write_one, packet.total_dwords)
            }
            PacketKind::Type1 { reg_index_1, reg_index_2 } => {
                self.handle_type1(mem, reg_index_1, reg_index_2)
            }
            // Type-2 packets are a single dword with nothing to do.
            PacketKind::Type2 => 1,
            PacketKind::Type3 {
                opcode,
                count,
                predicated,
            } => match self.handle_type3(mem, opcode, count, predicated, packet.total_dwords) {
                Type3Result::Consumed(n) => n,
                // Re-park on this packet so the resume path re-reads it. The
                // read pointer has not moved yet, so "rewinding to the
                // header" requires no work regardless of the flag's value.
                Type3Result::Blocked {
                    rewind_to_header: _rewind_to_header,
                } => return ExecOutcome::Blocked,
            },
        };

        // 3) Commit progress.
        self.ring.advance_read(consumed);
        self.writeback_read_ptr(mem);
        self.stats.packets_executed += 1;
        ExecOutcome::Stepped {
            dwords_consumed: consumed,
        }
    }

    /// First-Pixels M2b — direct, kernel-side commit point for `VdSwap`.
    ///
    /// Before M2b the kernel's `vd_swap` wrote a synthetic `PM4_XE_SWAP`
    /// packet at the guest-provided `buffer_ptr` and bumped our ring `wptr`
    /// by 64 dwords, hoping the drain would find it. It never did: the drain
    /// reads forward from `ring.base + rptr * 4`, not from the game's
    /// out-of-band `buffer_ptr`, so 512 ring packets ran across 1 B guest
    /// instructions while `swaps_seen` stayed at 0.
    ///
    /// `VdSwap` is the commit point by definition, so the event does not need
    /// to be laundered through the ring. The kernel-side handler calls this
    /// directly; the in-stream `PM4_XE_SWAP` opcode path still serves the
    /// (rare) title that emits the packet through its own ring writes.
    ///
    /// Bumps the swap statistics and frame counter, and records the new
    /// frontbuffer description in `last_swap`.
    pub fn notify_xe_swap(&mut self, frontbuffer_phys: u32, width: u32, height: u32) {
        let frame = self.swap_counter.wrapping_add(1);
        self.swap_counter = frame;
        self.stats.swaps_seen += 1;
        self.last_swap = Some(SwapNotification {
            frame_index: frame,
            frontbuffer_phys,
            width,
            height,
        });
        // iterate-2T: deliberately NO CP swap-complete interrupt from here.
        // Canary's `VdSwap`/PM4_XE_SWAP path raises none; swap-complete CP
        // interrupts originate ONLY from in-stream `PM4_INTERRUPT` packets,
        // which are naturally ordered after D3D has armed the swap-callback
        // slot. The pre-2T out-of-band interrupt arrived while that slot still
        // held the `0xBADF00D` placeholder and tripped the graphics ISR's
        // "Unanticipated CPU_INTERRUPT" assert.
        tracing::info!(
            frame,
            fb = format_args!("{frontbuffer_phys:#010x}"),
            width,
            height,
            "gpu: XE_SWAP (kernel-direct)"
        );
    }

    /// Called by `VdInitializeRingBuffer` to give us the primary ring.
    pub fn initialize_ring_buffer(&mut self, base: u32, size_log2: u32) {
        // Canary `CommandProcessor::InitializeRingBuffer` (command_processor.cc:
        // 436): `primary_buffer_size_ = 1 << (size_log2 + 3)` *bytes*. The
        // `VdInitializeRingBuffer` `r4` argument is log2(size-in-quadwords),
        // so the byte size is `1 << (size_log2 + 3)` (× 8 bytes/quadword), i.e.
        // `1 << (size_log2 + 1)` dwords. (Sylpheed passes size_log2=12 →
        // 32768 bytes / 8192 dwords; the previous `1 << size_log2` undersized
        // the ring 8× and desynced WPTR wrap math from the guest.)
        let size_bytes = 1u32 << size_log2.saturating_add(3).min(31);
        // The guest hands us a bare *physical* ring base; project it onto the
        // committed backing window so ring reads hit real PM4 packets (see
        // `physical_to_backing`).
        let base = physical_to_backing(base);
        self.ring.base = base;
        self.ring.size_dwords = size_bytes / 4;
        self.ring.indirect = false;
        self.ring.read_offset_dwords = 0;
        // `write_offset` is driven by the guest — start at 0 so the ring
        // appears empty until MMIO writes advance it.
        self.ring.write_offset_dwords = 0;
        tracing::info!(
            base = format_args!("{base:#010x}"),
            size_bytes,
            size_dwords = self.ring.size_dwords,
            "gpu: ring initialized"
        );
    }

    /// Records where the guest wants `read_offset_dwords` mirrored; invoked
    /// for `VdEnableRingBufferRPtrWriteBack`.
    ///
    /// `addr` is a bare guest *physical* address. The guest polls the same
    /// allocation through its `0x4xxxxxxx` base, so the address is projected
    /// onto the backing window to make our RPtr store land on the page the
    /// guest actually reads. `block_log2` is stored as `1 << block_log2`
    /// dwords, with the shift clamped to 31.
    pub fn enable_rptr_writeback(&mut self, addr: u32, block_log2: u32) {
        let backing_addr = physical_to_backing(addr);
        let block_dwords = 1u32 << block_log2.min(31);
        self.ring.rptr_writeback_addr = backing_addr;
        self.ring.rptr_writeback_block_dwords = block_dwords;
        tracing::info!(
            addr = format_args!("{backing_addr:#010x}"),
            block_dwords,
            "gpu: rptr writeback enabled"
        );
    }

    /// Hands the queued interrupts to the caller and leaves the internal
    /// queue empty.
    ///
    /// The kernel calls this once per scheduler round and forwards each entry
    /// to `interrupts.queue_interrupt`.
    ///
    /// M1 step 6 replaces the `Vec`-backed queue with a
    /// `crossbeam_channel::Sender<PendingInterrupt>`. Because step 2 routes
    /// every external reader through this one accessor, that swap stays a
    /// localized change with no call-site edits.
    ///
    /// Implemented with `std::mem::take`, so the queue is moved out rather
    /// than copied.
    pub fn take_pending_interrupts(&mut self) -> Vec<PendingInterrupt> {
        let queue = &mut self.pending_interrupts;
        std::mem::take(queue)
    }

    /// Reports whether at least one interrupt is waiting in the queue.
    ///
    /// Lets a caller skip the drain entirely on scheduler rounds where
    /// nothing was queued.
    pub fn has_pending_interrupts(&self) -> bool {
        self.pending_interrupts.first().is_some()
    }

    /// Moves the active ring's write offset to `new_wptr_dwords`, wrapped
    /// modulo the ring size in dwords.
    ///
    /// The argument is the new logical write pointer, not a delta. Does
    /// nothing while the ring size is 0.
    ///
    /// NOTE(review): earlier notes name `VdSwap` (after reserving 64 dwords)
    /// and `CP_RB_WPTR` MMIO writes as the intended callers — confirm against
    /// the current call sites.
    pub fn extend_write_ptr(&mut self, new_wptr_dwords: u32) {
        // `checked_rem` yields `None` only for a zero divisor, i.e. a ring
        // with no size yet.
        if let Some(wrapped) = new_wptr_dwords.checked_rem(self.ring.size_dwords) {
            self.ring.write_offset_dwords = wrapped;
        }
    }

    /// Completes a pending CP coherency request, following canary's
    /// `CommandProcessor::MakeCoherent` (`command_processor.cc:801-838`).
    ///
    /// The guest asks for a vertex/texture-cache flush by writing
    /// `COHER_STATUS_HOST` with its status bit (bit 31) set, then spins on
    /// `WAIT_REG_MEM COHER_STATUS_HOST, Equal 0`. There is no host cache to
    /// flush here (memory is shared, coherency is implicit), so finishing the
    /// request just means clearing the register, which lets the wait pass.
    ///
    /// A no-op unless `poll_addr` is `COHER_STATUS_HOST` *and* the status bit
    /// is set, which makes it safe to call on every register WAIT probe.
    fn make_coherent(&mut self, poll_addr: u32) {
        const STATUS_BIT: u32 = 0x8000_0000;
        // The register is only read when the poll target matches.
        let flush_requested = poll_addr == reg::COHER_STATUS_HOST
            && self.register_file.read(reg::COHER_STATUS_HOST) & STATUS_BIT != 0;
        if flush_requested {
            self.register_file.write(reg::COHER_STATUS_HOST, 0);
        }
    }

    /// Mirrors a CP scratch-register write into guest memory, after canary's
    /// `CommandProcessor::HandleSpecialRegisterWrite`
    /// (`command_processor.cc:545-552`).
    ///
    /// Meant to run after a register write. It acts only when `index` is one
    /// of the eight `SCRATCH_REG{n}` (`0x0578..=0x057F`) **and** bit `n` of
    /// `SCRATCH_UMSK` is set. In that case `value` is also stored through
    /// `mem.write_u32` at `SCRATCH_ADDR + n*4`, after projecting that guest
    /// physical address onto the backing window.
    ///
    /// Sylpheed arms its CP swap-complete interrupt callback this way: it
    /// points `SCRATCH_ADDR` at the GPU command-block descriptor
    /// (`[gfx+10772]`, runtime `0x0b1d5000`), sets `SCRATCH_UMSK` bit 4, then
    /// issues a Type-0 write of the callback PC `0x824ce2b8` to `SCRATCH_REG4`
    /// (`0x057C`). The mirror lands at descriptor+16 (`0x4b1d5010`), which the
    /// graphics ISR (`sub_824BE9A0`) loads via `[[gfx+10772]+16]` and
    /// `bcctrl`s to fire the callback. Without the mirror the slot stayed
    /// NULL, the ISR skipped the callback, the swap counter never advanced,
    /// and the title's per-frame manager re-fired once then plateaued.
    fn scratch_register_writeback(&self, mem: &dyn MemoryAccess, index: u32, value: u32) {
        let scratch_window = reg::SCRATCH_REG0..=reg::SCRATCH_REG7;
        if !scratch_window.contains(&index) {
            return;
        }
        // Which of the scratch registers was written (0-based).
        let slot = index - reg::SCRATCH_REG0;
        let unmasked = (self.register_file.read(reg::SCRATCH_UMSK) >> slot) & 1 != 0;
        if unmasked {
            let table_base = self.register_file.read(reg::SCRATCH_ADDR);
            let target = physical_to_backing(table_base.wrapping_add(slot * 4));
            mem.write_u32(target, value);
        }
    }

    /// Mirrors the ring's current read offset to the guest-registered
    /// writeback address.
    ///
    /// Skipped when no writeback address is registered (address 0) or the
    /// ring is not initialized. M1.8 uses the fenced store: with the GPU on
    /// its own host thread, the CPU polls this RPTR mirror to learn how far
    /// the ring has been consumed, and the Release fence makes earlier packet
    /// effects (memory writes, register-file updates the guest later reads
    /// over MMIO) happen-before the CPU-visible RPTR bump.
    fn writeback_read_ptr(&mut self, mem: &dyn MemoryAccess) {
        let mirror_addr = self.ring.rptr_writeback_addr;
        if mirror_addr == 0 || !self.ring.is_initialized() {
            return;
        }
        mem.write_u32_fence(mirror_addr, self.ring.read_offset_dwords);
    }

    // ── Type-0/1 handlers ─────────────────────────────────────────────────

    /// Executes a Type-0 packet: a run of register writes whose values
    /// follow the header in the ring.
    ///
    /// `count` values are read from ring offsets `1..=count`. With
    /// `write_one` set every value goes to `base_index`; otherwise the k-th
    /// value (0-based) goes to `base_index + k`. Each write is also offered
    /// to the scratch-register memory mirror.
    ///
    /// Returns `total_dwords` unchanged, as the number of dwords to consume.
    ///
    /// Panics if the ring cannot produce an address for one of the payload
    /// offsets.
    fn handle_type0(
        &mut self,
        mem: &dyn MemoryAccess,
        base_index: u32,
        count: u32,
        write_one: bool,
        total_dwords: u32,
    ) -> u32 {
        // Offset 0 is the header, so payload dword k lives at offset k + 1.
        for ring_offset in 1..=count {
            let source = self.ring.addr_at_offset(ring_offset).unwrap();
            let value = mem.read_u32(source);
            let register = match write_one {
                true => base_index,
                false => base_index + (ring_offset - 1),
            };
            self.register_file.write(register, value);
            self.scratch_register_writeback(mem, register, value);
        }
        tracing::trace!(
            base = format_args!("{base_index:#x}"),
            count,
            write_one,
            "gpu: Type0 reg write run"
        );
        total_dwords
    }

    /// Executes a Type-1 packet: two register writes whose values sit in the
    /// two dwords after the header.
    ///
    /// Both registers are written first; only then is each write offered to
    /// the scratch-register memory mirror, in the same order.
    ///
    /// Always returns 3 (header plus two data dwords).
    ///
    /// Panics if the ring cannot produce an address for either data dword.
    fn handle_type1(
        &mut self,
        mem: &dyn MemoryAccess,
        reg_index_1: u32,
        reg_index_2: u32,
    ) -> u32 {
        let first_addr = self.ring.addr_at_offset(1).unwrap();
        let second_addr = self.ring.addr_at_offset(2).unwrap();
        let writes = [
            (reg_index_1, mem.read_u32(first_addr)),
            (reg_index_2, mem.read_u32(second_addr)),
        ];
        // Two passes on purpose: the mirror consults the register file, so it
        // must observe both registers already updated.
        for (register, value) in writes {
            self.register_file.write(register, value);
        }
        for (register, value) in writes {
            self.scratch_register_writeback(mem, register, value);
        }
        tracing::trace!(
            r1 = format_args!("{reg_index_1:#x}"),
            r2 = format_args!("{reg_index_2:#x}"),
            "gpu: Type1 dual reg write"
        );
        3
    }

    // ── Type-3 dispatch ───────────────────────────────────────────────────

    fn handle_type3(
        &mut self,
        mem: &dyn MemoryAccess,
        opcode: u8,
        count: u32,
        predicated: bool,
        total_dwords: u32,
    ) -> Type3Result {
        metrics::counter!("gpu.packet", "opcode" => pm4::type3_opcode_name(opcode)).increment(1);
        tracing::trace!(
            opcode = format_args!("{opcode:#x}"),
            name = pm4::type3_opcode_name(opcode),
            count,
            predicated,
            "gpu: Type3"
        );
        // If predicated and the bin mask/select combo evaluates to "skip",
        // consume the whole packet (including data dwords) and move on. We
        // don't emulate binning so bin_mask & bin_select is always 0 → we
        // keep predicated packets in simplest form: execute them anyway. Most
        // games don't use binning on Xenos. Observed in canary:
        // `pm4_command_processor_implement.h:440-460`.
        let _ = predicated;

        match opcode {
            pm4::PM4_NOP
            | pm4::PM4_WAIT_FOR_IDLE
            | pm4::PM4_CONTEXT_UPDATE
            | pm4::PM4_INVALIDATE_STATE
            | pm4::PM4_ME_INIT
            | pm4::PM4_VIZ_QUERY
            | pm4::PM4_SET_SHADER_BASES => {
                // Classify-and-skip. State side effects (if any) are deferred.
            }
            pm4::PM4_INDIRECT_BUFFER | pm4::PM4_INDIRECT_BUFFER_PFD => {
                self.stats.indirect_buffer_jumps += 1;
                // The IB pointer is a guest *physical* address — project it
                // onto the committed backing window (see `physical_to_backing`).
                let ib_ptr = physical_to_backing(self.read_payload(mem, 1));
                let ib_size = self.read_payload(mem, 2);
                // Advance past the IB header + payload before recursing so
                // the return location is correct.
                self.ring.advance_read(total_dwords);
                self.writeback_read_ptr(mem);
                // Push current ring, switch to IB view.
                let caller = self.ring;
                self.ib_stack.push(caller);
                self.ring = RingBufferView {
                    base: ib_ptr & !3,
                    size_dwords: ib_size,
                    read_offset_dwords: 0,
                    write_offset_dwords: ib_size, // IB is fully-written at jump time
                    rptr_writeback_addr: 0,
                    rptr_writeback_block_dwords: 0,
                    // Linear sub-stream: drain [0, ib_size) then pop. Never
                    // wraps, and `sync_with_mmio`'s CP_RB_WPTR must not touch
                    // it (canary executes IBs through a separate reader).
                    indirect: true,
                };
                tracing::debug!(
                    ib_ptr = format_args!("{ib_ptr:#010x}"),
                    ib_size,
                    "gpu: jump to indirect buffer"
                );
                return Type3Result::Consumed(0); // we already advanced
            }
            pm4::PM4_WAIT_REG_MEM => {
                // Canary layout (pm4_command_processor_implement.h:699-755):
                //   payload[0] = wait_info (bit 4 = is_memory, bits 2:0 = cmp)
                //   payload[1] = poll address (register idx OR memory addr; bottom 2 bits = endian for memory)
                //   payload[2] = ref value
                //   payload[3] = mask
                //   payload[4] = wait (sleep hint, ignored)
                let wait_info = self.read_payload(mem, 1);
                let poll_addr_raw = self.read_payload(mem, 2);
                let reference = self.read_payload(mem, 3);
                let mask = self.read_payload(mem, 4);
                let is_memory = (wait_info & 0x10) != 0;
                let cmp = WaitCmp::from_wait_info(wait_info);
                let poll_addr = if is_memory {
                    // Physical memory poll address → committed backing.
                    physical_to_backing(poll_addr_raw & !3)
                } else {
                    poll_addr_raw
                };
                let block = GpuBlock::WaitRegMem {
                    poll_addr,
                    is_memory,
                    reference,
                    mask,
                    cmp,
                };
                // A WAIT polling COHER_STATUS_HOST is the CP coherency
                // handshake: service it now so the status bit clears (see
                // `make_coherent`), exactly as canary does in its WAIT loop.
                if !is_memory {
                    self.make_coherent(poll_addr);
                }
                if block.is_satisfied(mem, &self.register_file) {
                    // Condition already true; proceed past this packet.
                    tracing::trace!(?block, "gpu: WAIT_REG_MEM immediately satisfied");
                } else {
                    self.stats.wait_reg_mem_blocks += 1;
                    tracing::debug!(?block, "gpu: WAIT_REG_MEM parking");
                    self.pending_block = Some(block);
                    return Type3Result::Blocked { rewind_to_header: true };
                }
            }
            pm4::PM4_REG_RMW => {
                // payload[0] = rmw_info (bit 31 = and_from_reg, bit 30 = or_from_reg)
                // payload[1] = and mask (or register index)
                // payload[2] = or mask (or register index)
                let rmw_info = self.read_payload(mem, 1);
                let and_or_reg = (rmw_info & 0x8000_0000) != 0;
                let or_from_reg = (rmw_info & 0x4000_0000) != 0;
                let reg_index = rmw_info & 0x1FFF;
                let p2 = self.read_payload(mem, 2);
                let p3 = self.read_payload(mem, 3);
                let and_mask = if and_or_reg {
                    self.register_file.read(p2 & 0x1FFF)
                } else {
                    p2
                };
                let or_mask = if or_from_reg {
                    self.register_file.read(p3 & 0x1FFF)
                } else {
                    p3
                };
                let cur = self.register_file.read(reg_index);
                let new_value = (cur & and_mask) | or_mask;
                self.register_file.write(reg_index, new_value);
                tracing::trace!(
                    reg = format_args!("{reg_index:#x}"),
                    cur = format_args!("{cur:#x}"),
                    new = format_args!("{new_value:#x}"),
                    "gpu: REG_RMW"
                );
            }
            pm4::PM4_REG_TO_MEM => {
                // payload[0] = reg_index, payload[1] = mem addr
                let reg_index = self.read_payload(mem, 1) & 0x1FFF;
                let dst = physical_to_backing(self.read_payload(mem, 2) & !3);
                let value = self.register_file.read(reg_index);
                mem.write_u32(dst, value);
                tracing::trace!(
                    reg = format_args!("{reg_index:#x}"),
                    dst = format_args!("{dst:#010x}"),
                    value = format_args!("{value:#x}"),
                    "gpu: REG_TO_MEM"
                );
            }
            pm4::PM4_MEM_WRITE => {
                // payload[0] = dst, payload[1..=count-1] = values
                let mut dst = physical_to_backing(self.read_payload(mem, 1) & !3);
                for i in 2..=count {
                    let val = self.read_payload(mem, i);
                    mem.write_u32(dst, val);
                    dst = dst.wrapping_add(4);
                }
            }
            pm4::PM4_COND_WRITE => {
                // payload[0] = wait_info, [1] = poll addr, [2] = ref, [3] = mask,
                // [4] = write addr/reg, [5] = write data
                let wait_info = self.read_payload(mem, 1);
                let poll_raw = self.read_payload(mem, 2);
                let reference = self.read_payload(mem, 3);
                let mask = self.read_payload(mem, 4);
                let is_memory = (wait_info & 0x10) != 0;
                let cmp = WaitCmp::from_wait_info(wait_info);
                let poll_addr = if is_memory { physical_to_backing(poll_raw & !3) } else { poll_raw };
                let cur_raw = if is_memory {
                    mem.read_u32(poll_addr)
                } else {
                    self.register_file.read(poll_addr)
                };
                if cmp.evaluate(cur_raw & mask, reference) {
                    let write_addr = self.read_payload(mem, 5);
                    let write_data = self.read_payload(mem, 6);
                    if (wait_info & 0x100) != 0 {
                        mem.write_u32(physical_to_backing(write_addr & !3), write_data);
                    } else {
                        self.register_file
                            .write(write_addr & 0x1FFF, write_data);
                    }
                }
            }
            pm4::PM4_EVENT_WRITE => {
                // payload[0] = initiator (written to VGT_EVENT_INITIATOR).
                let initiator = self.read_payload(mem, 1);
                self.register_file
                    .write(reg::VGT_EVENT_INITIATOR, initiator & 0x3F);
                self.handle_event_initiator(initiator & 0x3F, mem);
                tracing::trace!(initiator = format_args!("{:#x}", initiator & 0x3F), "gpu: EVENT_WRITE");
            }
            pm4::PM4_EVENT_WRITE_SHD => {
                // payload[0] = initiator (bit 31: write counter, else write `value`)
                // payload[1] = address, payload[2] = value
                let initiator = self.read_payload(mem, 1);
                let address = physical_to_backing(self.read_payload(mem, 2));
                let value = self.read_payload(mem, 3);
                self.register_file
                    .write(reg::VGT_EVENT_INITIATOR, initiator & 0x3F);
                self.handle_event_initiator(initiator & 0x3F, mem);
                let data = if (initiator & 0x8000_0000) != 0 {
                    self.swap_counter as u32
                } else {
                    value
                };
                // M1.8: fenced write. The CPU thread busy-polls this
                // address as a GPU completion fence. The Release fence
                // emitted here pairs with `read_u32_fence`'s Acquire on
                // the polling side: any earlier writes the worker
                // performed (RPTR writeback, resolve target writes,
                // etc.) are visible to the CPU once it sees the new
                // fence value.
                mem.write_u32_fence(address & !3, data);
                tracing::trace!(
                    addr = format_args!("{:#010x}", address & !3),
                    value = format_args!("{data:#x}"),
                    "gpu: EVENT_WRITE_SHD"
                );
            }
            pm4::PM4_EVENT_WRITE_EXT => {
                // payload[0] = initiator, [1] = address. Writes 6 u16 extents
                // (min/max x/y/z) — we're not tracking scissors yet, so write zeros.
                let initiator = self.read_payload(mem, 1);
                let address = physical_to_backing(self.read_payload(mem, 2) & !3);
                self.register_file
                    .write(reg::VGT_EVENT_INITIATOR, initiator & 0x3F);
                self.handle_event_initiator(initiator & 0x3F, mem);
                for i in 0..6u32 {
                    mem.write_u16(address + i * 2, 0);
                }
            }
            pm4::PM4_EVENT_WRITE_ZPD => {
                // Occlusion query writeback — always write zeros (no query).
                let initiator = self.read_payload(mem, 1);
                self.register_file
                    .write(reg::VGT_EVENT_INITIATOR, initiator & 0x3F);
                self.handle_event_initiator(initiator & 0x3F, mem);
            }
            pm4::PM4_DRAW_INDX | pm4::PM4_DRAW_INDX_2 => {
                self.stats.draws_seen += 1;
                // Canary (`pm4_command_processor_implement.h:1128-1151`):
                //   DRAW_INDX: payload[0] = viz_query, payload[1] = vgt_draw_initiator,
                //              [2] = dma_base (if source=DMA), [3] = dma_size
                //   DRAW_INDX_2: payload[0] = vgt_draw_initiator (indices follow inline).
                let (vgt, dma_base, dma_size) = if opcode == pm4::PM4_DRAW_INDX {
                    let _viz = self.read_payload(mem, 1);
                    let vgt = self.read_payload(mem, 2);
                    let (db, ds) = if count >= 4 {
                        (Some(self.read_payload(mem, 3)), Some(self.read_payload(mem, 4)))
                    } else {
                        (None, None)
                    };
                    (vgt, db, ds)
                } else {
                    (self.read_payload(mem, 1), None, None)
                };
                let mut ds = draw_state::extract(&self.register_file, vgt, dma_base, dma_size);
                ds.vs_blob_key = self.active_vs_key;
                ds.ps_blob_key = self.active_ps_key;
                let processed = primitive::process(ds.primitive, ds.vertex_count, None);
                metrics::counter!(
                    "gpu.draw",
                    "prim" => format!("{:?}", ds.primitive),
                )
                .increment(1);
                if processed.rejected {
                    metrics::counter!("gpu.draw.rejected").increment(1);
                }
                // P4: update the render-target cache with every bound RT
                // from this draw. Each bind either inserts a new key or
                // refreshes an existing descriptor's bind_count. `msaa` is
                // still hardcoded to 1× because we don't yet decode
                // `PA_SC_AA_CONFIG`; P4b can add that.
                let msaa = crate::render_target_cache::MsaaSamples::X1;
                let mut viewport_height = ds.viewport.scale_y.abs() * 2.0;
                if viewport_height <= 0.0 {
                    viewport_height = 720.0;
                }
                // 16 samples per tile row (64-sample 8×8 macroblocks pack
                // 16 vertical samples per EDRAM tile).
                let rows_of_tiles = (viewport_height as u32).div_ceil(16);
                for (i, ci_opt) in ds.color_info.iter().enumerate() {
                    if let Some(ci) = ci_opt {
                        let pitch32 = ds.scissor.br_x.div_ceil(32);
                        let key = crate::render_target_cache::RenderTargetKey {
                            base_tiles: ci.base_tiles,
                            pitch_tiles_at_32bpp: pitch32,
                            msaa_samples: msaa,
                            is_depth: false,
                            resource_format: ci.format & 0xF,
                        };
                        let rt_idx = self.rt_cache.bind(key, self.stats.draws_seen as u32);
                        self.rt_cache.claim_tiles(rt_idx, rows_of_tiles as u16);
                        metrics::counter!(
                            "gpu.rt.bind",
                            "slot" => format!("{i}"),
                            "fmt" => format!("{}", ci.format & 0xF),
                        )
                        .increment(1);
                    }
                }
                if let Some(depth) = ds.depth_info {
                    let pitch32 = ds.scissor.br_x.div_ceil(32);
                    let key = crate::render_target_cache::RenderTargetKey {
                        base_tiles: depth.base_tiles,
                        pitch_tiles_at_32bpp: pitch32,
                        msaa_samples: msaa,
                        is_depth: true,
                        resource_format: depth.format & 0xF,
                    };
                    let rt_idx = self.rt_cache.bind(key, self.stats.draws_seen as u32);
                    self.rt_cache.claim_tiles(rt_idx, rows_of_tiles as u16);
                }
                self.stats.unique_render_targets = self.rt_cache.len() as u64;
                tracing::debug!(
                    opcode = format_args!("{opcode:#x}"),
                    prim = ?ds.primitive,
                    verts = ds.vertex_count,
                    ?processed.topology,
                    rewritten = processed.rewritten_indices.is_some(),
                    "gpu: DRAW_INDX captured"
                );
                self.last_draw = Some(ds);
                let host_vertex_count = processed.host_vertex_count;
                self.last_primitive = Some(processed);

                // iterate-3O: UI-only per-draw geometry capture. Resolves the
                // real guest vertex window behind this draw (from the active
                // VS's vertex-fetch constant) so the host UI can replay the
                // actual splash geometry instead of synthetic shapes. Entirely
                // inert in headless/deterministic mode (`frame_captures` is
                // `None`), so the `--gpu-inline` golden is unaffected.
                if self.frame_captures.is_some() {
                    let vs_key = self.active_vs_key.unwrap_or(0);
                    let ps_key = self.active_ps_key.unwrap_or(0);
                    let parsed_vs = self
                        .active_vs_key
                        .and_then(|k| self.shader_blobs.get(&k))
                        .map(|b| crate::ucode::parse_shader(&b.dwords));
                    let (idx_src, idx_size) = match ds.index_source {
                        crate::draw_state::IndexSource::Dma { index_size, .. } => {
                            (ds.index_source, index_size)
                        }
                        crate::draw_state::IndexSource::Immediate { index_size } => {
                            (ds.index_source, index_size)
                        }
                        crate::draw_state::IndexSource::AutoIndex => {
                            (ds.index_source, crate::draw_state::IndexSize::Sixteen)
                        }
                    };
                    let cap = crate::draw_capture::build(
                        self.stats.draws_seen as u32,
                        ds.primitive,
                        host_vertex_count,
                        idx_src,
                        idx_size,
                        vs_key,
                        ps_key,
                        parsed_vs.as_ref(),
                        &self.register_file,
                        mem,
                    );
                    if let Some(caps) = self.frame_captures.as_mut() {
                        // Bound the per-frame list so a runaway frame can't grow
                        // host memory without limit; keep the most recent.
                        const MAX_CAPS: usize = 4096;
                        if caps.len() >= MAX_CAPS {
                            caps.remove(0);
                        }
                        caps.push(cap);
                    }
                }

                // P5b: decode the textures the *active pixel shader* actually
                // samples. Parse the bound PS, collect its `tfetch`
                // fetch-constant slots, read each 6-dword fetch constant from
                // the register file, and decode+cache it. `vd_swap` publishes
                // the result. Empty for flat (no-tfetch) shaders — the
                // dominant case on Sylpheed's current splash, where this stays
                // inert until the textured logo draw is reached.
                self.last_draw_textures.clear();
                if let Some(ps_key) = self.active_ps_key {
                    // Collect slots under an immutable borrow of `shader_blobs`,
                    // then drop it before mutating `texture_cache`.
                    let slots: Vec<u8> = match self.shader_blobs.get(&ps_key) {
                        Some(blob) => {
                            let parsed = crate::ucode::parse_shader(&blob.dwords);
                            crate::shader_metrics::tfetch_slots(&parsed)
                        }
                        None => Vec::new(),
                    };
                    for slot in slots {
                        let mut fetch6 = [0u32; 6];
                        for (k, w) in fetch6.iter_mut().enumerate() {
                            *w = self
                                .register_file
                                .read(CONST_BASE_FETCH + slot as u32 * 6 + k as u32);
                        }
                        let Some(mut key) = crate::texture_cache::decode_fetch_constant(fetch6) else {
                            continue;
                        };
                        // The Xenos texture fetch constant carries a guest
                        // *physical* base address (`base >> 12`). On the Xbox
                        // 360 the GPU reads the unified physical memory; the
                        // CPU writes the (decompressed) texels through its
                        // cached-physical aperture, which ours backs at the
                        // committed `0x4000_0000` window. Map the physical
                        // base onto that backing window so the GPU samples the
                        // bytes the guest actually wrote — exactly as the
                        // vertex-fetch path does (`draw_capture.rs`) and as
                        // canary reads textures through its GPU shared memory
                        // (= physical). Without this the decode reads the
                        // low VA `0x0dbee000` (always zero) instead of the
                        // filled `0x4dbee000`, flattening every disk-asset
                        // texture (e.g. the publisher logo `E59B2B3D`).
                        key.base_address = physical_to_backing(key.base_address);
                        let bi = key.format.block_info();
                        let span_bytes = (key.pitch_texels as u32)
                            * (key.height as u32)
                            * (bi.bytes_per_block as u32)
                            / (bi.block_w as u32);
                        let version = span_max_version(mem, key.base_address, span_bytes.max(4));
                        match self.texture_cache.ensure_cached(key, version, mem) {
                            Ok(entry) => {
                                self.last_draw_textures.push((entry.key, entry.bytes.clone()));
                                metrics::counter!(
                                    "gpu.texture.decode",
                                    "fmt" => format!("{:?}", key.format),
                                )
                                .increment(1);
                            }
                            Err(e) => {
                                metrics::counter!(
                                    "gpu.texture.reject",
                                    "reason" => format!("{e:?}"),
                                )
                                .increment(1);
                            }
                        }
                    }
                }

                // iterate-3T: attach this draw's decoded textures to the just-
                // captured draw so the UI can bind the real artwork per-draw
                // (keyed off the active PS's real tfetch slots) instead of a
                // single last-draw `primary_texture`. UI-only (`frame_captures`
                // is `None` headless); does not touch the deterministic core.
                if !self.last_draw_textures.is_empty()
                    && let Some(caps) = self.frame_captures.as_mut()
                    && let Some(last) = caps.last_mut()
                {
                    last.textures = self.last_draw_textures.clone();
                }
            }
            pm4::PM4_SET_CONSTANT | pm4::PM4_SET_SHADER_CONSTANTS => {
                // payload[0] = offset_type — bits[10:0] index, bits[23:16] type
                let offset_type = self.read_payload(mem, 1);
                let index = offset_type & 0x7FF;
                let const_type = (offset_type >> 16) & 0xFF;
                let base = match const_type {
                    0 => CONST_BASE_ALU,
                    1 => CONST_BASE_FETCH,
                    2 => CONST_BASE_BOOL,
                    3 => CONST_BASE_LOOP,
                    4 => CONST_BASE_REGISTERS,
                    _ => CONST_BASE_ALU, // defensive default
                };
                for i in 0..(count - 1) {
                    let v = self.read_payload(mem, 2 + i);
                    self.register_file.write(base + index + i, v);
                }
            }
            pm4::PM4_SET_CONSTANT2 => {
                // payload[0] = 16-bit index; subsequent payloads write consecutive regs.
                let index = self.read_payload(mem, 1) & 0xFFFF;
                for i in 0..(count - 1) {
                    let v = self.read_payload(mem, 2 + i);
                    self.register_file.write(index + i, v);
                }
            }
            pm4::PM4_LOAD_ALU_CONSTANT => {
                // payload[0] = source mem addr, [1] = offset_type, [2] = size_dwords
                let src = physical_to_backing(self.read_payload(mem, 1) & !3);
                let offset_type = self.read_payload(mem, 2);
                let size_dwords = self.read_payload(mem, 3);
                let index = offset_type & 0x7FF;
                let const_type = (offset_type >> 16) & 0xFF;
                let base = match const_type {
                    0 => CONST_BASE_ALU,
                    1 => CONST_BASE_FETCH,
                    2 => CONST_BASE_BOOL,
                    3 => CONST_BASE_LOOP,
                    4 => CONST_BASE_REGISTERS,
                    _ => CONST_BASE_ALU,
                };
                for i in 0..size_dwords {
                    let v = mem.read_u32(src + i * 4);
                    self.register_file.write(base + index + i, v);
                }
            }
            pm4::PM4_IM_LOAD | pm4::PM4_IM_LOAD_IMMEDIATE => {
                // Canary (pm4_command_processor_implement.h:1271-1330):
                //   IM_LOAD payload: [0] addr_type, [1] start_size
                //   IM_LOAD_IMMEDIATE payload: [0] shader_type, [1] start_size, [2..count] = microcode
                let shader_type = self.read_payload(mem, 1) as u8 & 0x3;
                let start_size = self.read_payload(mem, 2);
                let size_dwords = start_size & 0xFFFF;
                let blob = if opcode == pm4::PM4_IM_LOAD_IMMEDIATE {
                    let mut v = Vec::with_capacity(size_dwords as usize);
                    for i in 0..size_dwords {
                        v.push(self.read_payload(mem, 3 + i));
                    }
                    v
                } else {
                    let addr = physical_to_backing(self.read_payload(mem, 1) & !3);
                    let mut v = Vec::with_capacity(size_dwords as usize);
                    for i in 0..size_dwords {
                        v.push(mem.read_u32(addr + i * 4));
                    }
                    v
                };
                // For IM_LOAD the payload already carries an address that
                // uniquely identifies this shader in guest memory, so the
                // full `addr_type` dword (address | stage bits) makes a
                // good cache key. For IM_LOAD_IMMEDIATE payload[0] is just
                // the 2-bit shader_type — without a content-derived key
                // every immediate upload would collide on 0 or 1 and
                // thrash a single slot. Fold the microcode through a
                // stable FNV-1a hash so per-content dedup still works.
                let key = if opcode == pm4::PM4_IM_LOAD_IMMEDIATE {
                    fnv1a_u32_dwords(shader_type as u32, &blob)
                } else {
                    self.read_payload(mem, 1)
                };
                self.insert_shader_blob(
                    key,
                    ShaderBlob {
                        shader_type,
                        dwords: blob,
                    },
                );
                // P3b M1: record which blob is now "active" for the
                // current stage. The uber-shader dispatch (xenia-ui) reads
                // `active_vs_key`/`active_ps_key` at draw time to upload
                // the right microcode. `shader_type`: 0 = vertex, 1 = pixel
                // (per Xenos `ShaderType`).
                match shader_type {
                    0 => self.active_vs_key = Some(key),
                    1 => self.active_ps_key = Some(key),
                    _ => {}
                }
                metrics::counter!(
                    "gpu.shader.blob_seen",
                    "stage" => if shader_type == 0 { "vs" } else { "ps" },
                )
                .increment(1);
                tracing::debug!(
                    shader_type,
                    size_dwords,
                    key = format_args!("{key:#x}"),
                    "gpu: IM_LOAD (shader blob cached)"
                );
            }
            pm4::PM4_SET_BIN_MASK_LO => {
                self.bin_mask = (self.bin_mask & 0xFFFF_FFFF_0000_0000)
                    | (self.read_payload(mem, 1) as u64);
            }
            pm4::PM4_SET_BIN_MASK_HI => {
                self.bin_mask = (self.bin_mask & 0x0000_0000_FFFF_FFFF)
                    | ((self.read_payload(mem, 1) as u64) << 32);
            }
            pm4::PM4_SET_BIN_MASK => {
                let lo = self.read_payload(mem, 1) as u64;
                let hi = self.read_payload(mem, 2) as u64;
                self.bin_mask = (hi << 32) | lo;
            }
            pm4::PM4_SET_BIN_SELECT_LO => {
                self.bin_select = (self.bin_select & 0xFFFF_FFFF_0000_0000)
                    | (self.read_payload(mem, 1) as u64);
            }
            pm4::PM4_SET_BIN_SELECT_HI => {
                self.bin_select = (self.bin_select & 0x0000_0000_FFFF_FFFF)
                    | ((self.read_payload(mem, 1) as u64) << 32);
            }
            pm4::PM4_SET_BIN_SELECT => {
                let lo = self.read_payload(mem, 1) as u64;
                let hi = self.read_payload(mem, 2) as u64;
                self.bin_select = (hi << 32) | lo;
            }
            pm4::PM4_INTERRUPT => {
                let cpu_mask = self.read_payload(mem, 1);
                self.stats.interrupts_emitted += 1;
                self.pending_interrupts.push(PendingInterrupt {
                    source: InterruptSource::CommandProcessor,
                    cpu_mask,
                });
                tracing::debug!(
                    cpu_mask = format_args!("{cpu_mask:#x}"),
                    "gpu: PM4_INTERRUPT queued"
                );
            }
            pm4::PM4_XE_SWAP => {
                // Payload: [0] signature, [1] frontbuffer_phys, [2] width, [3] height
                let _signature = self.read_payload(mem, 1);
                let frontbuffer_phys = self.read_payload(mem, 2);
                let width = self.read_payload(mem, 3);
                let height = self.read_payload(mem, 4);
                self.notify_xe_swap(frontbuffer_phys, width, height);
            }
            _ => {
                // Unknown opcode — warn (on every occurrence; this arm has no
                // per-opcode dedup) but don't stall.
                tracing::warn!(
                    opcode = format_args!("{opcode:#x}"),
                    count,
                    "gpu: unhandled Type3 opcode"
                );
            }
        }
        Type3Result::Consumed(total_dwords)
    }

    /// Fetch one dword of the packet currently being decoded.
    ///
    /// `i` is the dword offset handed to `self.ring.addr_at_offset`; callers
    /// use `i = 0` for the packet header, so the first payload dword is
    /// `i = 1`.
    ///
    /// # Panics
    /// Panics if `addr_at_offset(i)` does not yield an address.
    fn read_payload(&self, mem: &dyn MemoryAccess, i: u32) -> u32 {
        mem.read_u32(self.ring.addr_at_offset(i).unwrap())
    }

    /// Execute at most `max_packets` packets (used by the kernel's VdSwap
    /// handler when we don't yet have MMIO-triggered draining), stopping
    /// early the first time `execute_one` reports `Idle` or `Blocked`.
    ///
    /// Returns how many packets were actually consumed.
    pub fn drain(&mut self, mem: &dyn MemoryAccess, max_packets: u32) -> u32 {
        let mut consumed = 0;
        while consumed < max_packets {
            // Anything other than a completed step ends the drain.
            if !matches!(self.execute_one(mem), ExecOutcome::Stepped { .. }) {
                break;
            }
            consumed += 1;
        }
        consumed
    }

    /// Drain until the ring's read offset reaches `target_wptr` (modulo ring
    /// size) or `execute_one` returns Idle/Blocked. Mirrors canary's
    /// `WorkerThreadMain` (xenia-canary `command_processor.cc` ExecutePrimaryBuffer)
    /// which loops on `read_ptr_index_ != write_ptr_index` with no packet
    /// budget. `time_budget` bounds wall-clock so a pathological packet
    /// (e.g. an EVENT_WRITE that perpetually re-blocks) cannot spin the
    /// inline path; pass 900 ms to match the threaded `DrainFence` deadline.
    ///
    /// A `time_budget` so large that `now + time_budget` is not representable
    /// (e.g. `Duration::MAX`) is treated as "no deadline" instead of
    /// panicking.
    ///
    /// Returns the number of packets consumed; `0` immediately when the ring
    /// has not been sized yet (`size_dwords == 0`).
    pub fn drain_until_wptr(
        &mut self,
        mem: &dyn MemoryAccess,
        target_wptr: u32,
        time_budget: Duration,
    ) -> u32 {
        // An unsized ring would make the modulo below divide by zero.
        if self.ring.size_dwords == 0 {
            return 0;
        }
        let target = target_wptr % self.ring.size_dwords;
        // `Instant + Duration` panics on overflow; `checked_add` yields `None`
        // instead, which we interpret as an unbounded budget.
        let deadline = Instant::now().checked_add(time_budget);
        let mut n = 0u32;
        while self.ring.read_offset_dwords != target {
            if deadline.is_some_and(|d| Instant::now() >= d) {
                // Deadline exhaustion is the *expected* outcome under
                // `--parallel` workloads (Sylpheed boot queues millions
                // of game-batched IBs the inline drain can't chew
                // through in 900 ms). Logged at debug because warn-level
                // would fire on every vd_swap. Callers can re-read the
                // ring read pointer to detect partial drain if they
                // care.
                tracing::debug!(
                    target,
                    rptr = self.ring.read_offset_dwords,
                    consumed = n,
                    "gpu: drain_until_wptr time-budget exhausted"
                );
                break;
            }
            match self.execute_one(mem) {
                ExecOutcome::Stepped { .. } => {
                    n += 1;
                    // Mirror the threaded `DrainFence` handler at
                    // handle.rs:553-570: re-sync after every packet so
                    // any concurrent guest WPTR write (under `--parallel`)
                    // folds into the local ring view before the next
                    // `is_ready` check. Without this the local
                    // write_offset is a snapshot of the moment we entered
                    // the drain, which is fine for a target-WPTR drain
                    // but wrong if downstream packets (e.g. an indirect
                    // buffer's nested ring) need an updated view.
                    self.sync_with_mmio();
                }
                ExecOutcome::Idle | ExecOutcome::Blocked => break,
            }
        }
        n
    }
}

impl Default for GpuSystem {
    fn default() -> Self {
        Self::new()
    }
}

/// Subset of Xenos registers we reference by name. Full table at
/// `xenia-canary/src/xenia/gpu/registers.h`.
pub mod reg {
    //! All values below are Xenos *register indices* (the number you find in
    //! canary's `register_table.inc`, i.e. the byte offset within the
    //! aperture divided by 4). The MMIO aperture at `0x7FC8_0000` maps each
    //! register at `APERTURE_BASE + index * 4`; the MMIO callbacks recover
    //! the index with `(addr & 0xFFFF) / 4` before matching against these
    //! constants.
    //!
    //! Worked example: `CP_RB_WPTR` has index `0x01C5`, so it sits at
    //! `0x7FC8_0000 + 0x01C5 * 4 = 0x7FC8_0714`.

    /// `XE_GPU_REG_CP_RB_BASE` — ring buffer base address.
    pub const CP_RB_BASE: u32 = 0x01C0;
    /// `XE_GPU_REG_CP_RB_CNTL` — ring buffer control.
    pub const CP_RB_CNTL: u32 = 0x01C1;
    /// `XE_GPU_REG_CP_RB_RPTR_ADDR` — where to mirror read pointer.
    pub const CP_RB_RPTR_ADDR: u32 = 0x01C3;
    /// `XE_GPU_REG_CP_RB_RPTR` — read pointer (GPU → CPU).
    pub const CP_RB_RPTR: u32 = 0x01C4;
    /// `XE_GPU_REG_CP_RB_WPTR` — write pointer (CPU → GPU); MMIO side effect.
    pub const CP_RB_WPTR: u32 = 0x01C5;
    /// `XE_GPU_REG_CP_INT_STATUS` — interrupt status bits.
    pub const CP_INT_STATUS: u32 = 0x01F3;
    /// `XE_GPU_REG_CP_INT_ACK` — write-to-ack interrupt bits.
    pub const CP_INT_ACK: u32 = 0x01F4;
    /// `XE_GPU_REG_D1MODE_VBLANK_VLINE_STATUS` (Canary register_table.inc:1126).
    /// Bit 0 = VBLANK_INT_OCCURRED.
    pub const D1MODE_VBLANK_VLINE_STATUS: u32 = 0x1951;
    /// `XE_GPU_REG_D1MODE_VIEWPORT_SIZE` / `AVIVO_D1MODE_VIEWPORT_SIZE`
    /// (Canary `register_table.inc:1134`). Packs the active display resolution
    /// as `(width << 16) | height` with 12-bit fields. The guest's
    /// swap-complete interrupt callback (`sub_824CE2B8`) divides by the low
    /// 12 bits (`height`) as a refresh-pacing term, so a 0 read makes its
    /// `twi` divide-by-zero guard trap and abort the ISR before it clears the
    /// swap-acknowledge fence. Canary returns a fixed value for this register
    /// from `GraphicsSystem::ReadRegister` (graphics_system.cc:311). Note that
    /// the constant declared here is only the register *index*; the value to
    /// return on reads is not defined in this module.
    pub const D1MODE_VIEWPORT_SIZE: u32 = 0x1961;
    /// `XE_GPU_REG_VGT_EVENT_INITIATOR` — set by EVENT_WRITE.
    pub const VGT_EVENT_INITIATOR: u32 = 0x21F9;
    /// `XE_GPU_REG_COHER_STATUS_HOST` — coherency bits
    /// (Canary `register_table.inc:530`).
    pub const COHER_STATUS_HOST: u32 = 0x0A31;
    /// `XE_GPU_REG_SCRATCH_UMSK` — bitmask of which `SCRATCH_REG{n}` writes are
    /// mirrored to memory (Canary `register_table.inc:139`).
    pub const SCRATCH_UMSK: u32 = 0x01DC;
    /// `XE_GPU_REG_SCRATCH_ADDR` — base physical address of the scratch
    /// writeback block (Canary `register_table.inc:141`).
    pub const SCRATCH_ADDR: u32 = 0x01DD;
    /// `XE_GPU_REG_SCRATCH_REG0` — first of 8 CP scratch registers
    /// (`0x0578..=0x057F`, Canary `register_table.inc:331-338`).
    pub const SCRATCH_REG0: u32 = 0x0578;
    /// `XE_GPU_REG_SCRATCH_REG7` — last CP scratch register.
    pub const SCRATCH_REG7: u32 = 0x057F;
}

/// 32-bit FNV-1a over the little-endian bytes of `seed` followed by the
/// little-endian bytes of every entry of `dwords`, in order.
///
/// Used to mint a stable, content-derived cache key for
/// `IM_LOAD_IMMEDIATE` shader blobs, where the guest supplies no natural
/// address to key on.
fn fnv1a_u32_dwords(seed: u32, dwords: &[u32]) -> u32 {
    const OFFSET_BASIS: u32 = 0x811C_9DC5;
    const PRIME: u32 = 0x0100_0193;
    std::iter::once(seed)
        .chain(dwords.iter().copied())
        .flat_map(|word| word.to_le_bytes())
        // FNV-1a step: xor the octet in first, then multiply by the prime.
        .fold(OFFSET_BASIS, |acc, octet| {
            (acc ^ u32::from(octet)).wrapping_mul(PRIME)
        })
}

/// Internal Type-3 handler result. Distinguishes "consumed a packet (by N
/// dwords)" from "blocked; don't advance read ptr".
enum Type3Result {
    /// The packet was handled; the payload is the number of dwords consumed
    /// (the handler returns its `total_dwords` here).
    Consumed(u32),
    /// The packet could not complete yet and must not be consumed.
    /// NOTE(review): `rewind_to_header` presumably tells the caller whether to
    /// reset the read position back to the packet header before retrying —
    /// confirm against the caller, which is not visible from this definition.
    Blocked { rewind_to_header: bool },
}

#[cfg(test)]
mod tests {
    use super::*;
    use xenia_memory::GuestMemory;
    use xenia_memory::page_table::MemoryProtect;

    /// Fresh guest memory with a single read/write region of `0x4000` bytes
    /// committed at `0x4000_0000`, shared by every test below.
    fn build_mem() -> GuestMemory {
        let mut guest = GuestMemory::new().unwrap();
        guest
            .alloc(
                0x4000_0000,
                0x4000,
                MemoryProtect::READ | MemoryProtect::WRITE,
            )
            .unwrap();
        guest
    }

    /// `is_ready` stays false until a ring exists *and* the write pointer is
    /// ahead of the read pointer.
    #[test]
    fn ready_when_ring_has_pending() {
        let guest = build_mem();
        let mut cp = GpuSystem::new();

        // No ring configured yet.
        assert!(!cp.is_ready(&guest));

        // Ring configured but empty.
        cp.initialize_ring_buffer(0x4000_0000, 10); // 1024 bytes = 256 dwords
        assert!(!cp.is_ready(&guest));

        // Pending dwords make the GPU schedulable.
        cp.extend_write_ptr(4);
        assert!(cp.is_ready(&guest));
    }

    /// Three Type-2 NOPs are each consumed as a one-dword packet.
    #[test]
    fn type2_nop_advances_read_pointer() {
        const TYPE2_NOP: u32 = 0x8000_0000;

        let mut gpu = GpuSystem::new();
        let mut mem = build_mem();
        // 256 dwords ring at 0x40000000
        gpu.initialize_ring_buffer(0x4000_0000, 10);
        // Fill the first three ring slots with Type-2 NOPs.
        for slot in 0..3u32 {
            mem.write_u32(0x4000_0000 + slot * 4, TYPE2_NOP);
        }
        gpu.extend_write_ptr(3);

        for _ in 0..3 {
            let outcome = gpu.execute_one(&mut mem);
            match outcome {
                ExecOutcome::Stepped { dwords_consumed } => assert_eq!(dwords_consumed, 1),
                other => panic!("unexpected {:?}", other),
            }
        }

        assert_eq!(gpu.ring.read_offset_dwords, 3);
        assert_eq!(gpu.stats.packets_executed, 3);
    }

    /// A Type-0 register run writes consecutive payload dwords into
    /// consecutive registers starting at the header's base index.
    #[test]
    fn type0_reg_run_writes_register_file() {
        let mut gpu = GpuSystem::new();
        let mut mem = build_mem();
        gpu.initialize_ring_buffer(0x4000_0000, 10);

        // Type0 header: 2 dwords, base_index=0x100 → write_one=0 → count field = 1 (count-1)
        let header = (1u32 << 16) | 0x100;
        let packet = [header, 0xDEAD_BEEF, 0xCAFE_BABE];
        for (slot, &word) in packet.iter().enumerate() {
            mem.write_u32(0x4000_0000 + slot as u32 * 4, word);
        }
        gpu.extend_write_ptr(3);

        assert!(matches!(gpu.execute_one(&mut mem), ExecOutcome::Stepped { .. }));
        assert_eq!(gpu.register_file.read(0x100), 0xDEAD_BEEF);
        assert_eq!(gpu.register_file.read(0x101), 0xCAFE_BABE);
    }

    /// Mirrors Sylpheed's CP swap-callback arming: SCRATCH_ADDR points at a
    /// descriptor, SCRATCH_UMSK enables bit 4, and a Type-0 write of the
    /// callback PC into SCRATCH_REG4 (0x57C) must land at SCRATCH_ADDR + 16,
    /// while SCRATCH_REG3 (mask bit clear) must not be mirrored at all.
    #[test]
    fn scratch_reg_write_mirrors_to_memory_when_umsk_enabled() {
        const SCRATCH_REG3: u32 = 0x057B;

        let mut gpu = GpuSystem::new();
        let mut mem = build_mem();
        gpu.initialize_ring_buffer(0x4000_0000, 10);

        // SCRATCH_ADDR = 0x4000_1000 (physical-mirror identity); SCRATCH_UMSK
        // has only bit 4 set, so SCRATCH_REG4 mirrors and SCRATCH_REG3 does not.
        gpu.register_file.write(reg::SCRATCH_ADDR, 0x4000_1000);
        gpu.register_file.write(reg::SCRATCH_UMSK, 1 << 4);

        // Type0 write run, base = SCRATCH_REG3 (0x57B), two registers:
        //   0x11111111 → SCRATCH_REG3 (UMSK bit 3 clear)
        //   0x824CE2B8 → SCRATCH_REG4 (UMSK bit 4 set → ADDR + 4*4 = +16)
        let header = (1u32 << 16) | SCRATCH_REG3;
        let packet = [header, 0x1111_1111, 0x824C_E2B8];
        for (slot, &word) in packet.iter().enumerate() {
            mem.write_u32(0x4000_0000 + slot as u32 * 4, word);
        }
        gpu.extend_write_ptr(3);
        assert!(matches!(gpu.execute_one(&mut mem), ExecOutcome::Stepped { .. }));

        let reg3_mirror = mem.read_u32(0x4000_1000 + 12);
        let reg4_mirror = mem.read_u32(0x4000_1000 + 16);
        assert_eq!(reg3_mirror, 0, "reg3 must not mirror");
        assert_eq!(
            reg4_mirror,
            0x824C_E2B8,
            "reg4 must mirror to SCRATCH_ADDR+16"
        );
    }

    /// A memory-polling `WAIT_REG_MEM` blocks (without advancing the read
    /// pointer) until the polled dword matches, then consumes the packet.
    #[test]
    fn wait_reg_mem_blocks_then_unblocks_when_mem_changes() {
        let mut gpu = GpuSystem::new();
        let mut mem = build_mem();
        gpu.initialize_ring_buffer(0x4000_0000, 10);

        // WAIT_REG_MEM: wait until *0x40001000 == 0x42
        let header = (3u32 << 30) | ((5u32 - 1) << 16) | ((pm4::PM4_WAIT_REG_MEM as u32) << 8);
        let packet = [
            header,
            // wait_info: is_memory=1 (bit 4), cmp=equal (bits 2:0 = 3, per canary's
            // MatchValueAndRef selector: 1=Less, 2=LessEq, 3=Equal, …).
            0x13,
            0x4000_1000,
            0x42,
            0xFFFF_FFFF,
            0,
        ];
        for (slot, &word) in packet.iter().enumerate() {
            mem.write_u32(0x4000_0000 + slot as u32 * 4, word);
        }
        gpu.extend_write_ptr(6);

        // First exec: poll addr reads 0 → blocked.
        assert_eq!(gpu.execute_one(&mut mem), ExecOutcome::Blocked);
        assert_eq!(gpu.ring.read_offset_dwords, 0, "read ptr must not advance while blocked");

        // Make the wait satisfied.
        mem.write_u32(0x4000_1000, 0x42);
        let retried = gpu.execute_one(&mut mem);
        match retried {
            // The WAIT_REG_MEM packet is 1 (header) + 5 (count) = 6 dwords.
            ExecOutcome::Stepped { dwords_consumed } => assert_eq!(dwords_consumed, 6),
            other => panic!("expected Stepped after wait satisfied, got {:?}", other),
        }
        assert_eq!(gpu.ring.read_offset_dwords, 6);
    }

    /// `MEM_WRITE` stores every data dword at consecutive addresses starting
    /// at the destination carried in the first payload dword.
    #[test]
    fn mem_write_writes_all_payload_dwords() {
        let mut gpu = GpuSystem::new();
        let mut mem = build_mem();
        gpu.initialize_ring_buffer(0x4000_0000, 10);

        // MEM_WRITE: count=3 → 1 header + 1 dst + 2 data
        let header = (3u32 << 30) | ((3u32 - 1) << 16) | ((pm4::PM4_MEM_WRITE as u32) << 8);
        let packet = [
            header,
            0x4000_1000, // dst
            0x1111_1111,
            0x2222_2222,
        ];
        for (slot, &word) in packet.iter().enumerate() {
            mem.write_u32(0x4000_0000 + slot as u32 * 4, word);
        }
        gpu.extend_write_ptr(4);

        assert!(matches!(gpu.execute_one(&mut mem), ExecOutcome::Stepped { .. }));
        assert_eq!(mem.read_u32(0x4000_1000), 0x1111_1111);
        assert_eq!(mem.read_u32(0x4000_1004), 0x2222_2222);
    }

    /// A guest MMIO write to `cp_rb_wptr` only becomes visible to the ring
    /// after `sync_with_mmio`, which also publishes the read pointer back.
    #[test]
    fn mmio_write_to_cp_rb_wptr_reflects_into_ring() {
        use std::sync::atomic::Ordering;

        let guest = build_mem();
        let mut cp = GpuSystem::new();
        cp.initialize_ring_buffer(0x4000_0000, 10);

        // Guest writes wptr=8 via MMIO.
        cp.mmio.cp_rb_wptr.store(8, Ordering::Relaxed);
        // Before sync, ring has no pending work.
        assert!(!cp.is_ready(&guest));

        cp.sync_with_mmio();
        assert_eq!(cp.ring.write_offset_dwords, 8);
        assert!(cp.is_ready(&guest));

        // After sync, rptr is mirrored back to mmio for the guest to read.
        let mirrored_rptr = cp.mmio.cp_rb_rptr.load(Ordering::Relaxed);
        assert_eq!(mirrored_rptr, 0);
    }

    /// End-to-end "draw dispatch integration" check from the P3 verification
    /// plan: two back-to-back DRAW_INDX_2 packets go through `execute_one`,
    /// after which `draws_seen` counts both and `last_draw` (the state the
    /// UI's Xenos pipeline consumes) describes the second one.
    #[test]
    fn successive_draws_accumulate_in_stats() {
        let mut gpu = GpuSystem::new();
        let mut mem = build_mem();
        gpu.initialize_ring_buffer(0x4000_0000, 10);

        // Both packets share the same header: one payload dword each.
        let header = (3u32 << 30) | ((1u32 - 1) << 16) | ((pm4::PM4_DRAW_INDX_2 as u32) << 8);
        let stream = [
            // Draw #1: TriangleList, 6 verts.
            header,
            (6u32 << 16) | (2 << 6) | 4,
            // Draw #2: TriangleStrip, 4 verts.
            header,
            (4u32 << 16) | (2 << 6) | 6,
        ];
        for (slot, &word) in stream.iter().enumerate() {
            mem.write_u32(0x4000_0000 + slot as u32 * 4, word);
        }
        gpu.extend_write_ptr(4);

        for _ in 0..2 {
            assert!(matches!(gpu.execute_one(&mut mem), ExecOutcome::Stepped { .. }));
        }

        assert_eq!(gpu.stats.draws_seen, 2);
        let ds = gpu.last_draw.expect("last_draw set");
        assert_eq!(ds.primitive, crate::draw_state::PrimitiveType::TriangleStrip);
        assert_eq!(ds.vertex_count, 4);
    }

    /// A single auto-indexed `DRAW_INDX_2` must bump `draws_seen` and populate
    /// both `last_draw` (decoded draw state) and `last_primitive` (host
    /// topology, not rejected).
    #[test]
    fn draw_indx_2_captures_last_draw() {
        let mut gpu = GpuSystem::new();
        let mut mem = build_mem();
        gpu.initialize_ring_buffer(0x4000_0000, 10);
        // DRAW_INDX_2 packet with 1 payload dword = vgt_draw_initiator:
        //   prim=4 (TriangleList), source=2 (auto), count=3 verts.
        let vgt = (3u32 << 16) | (2 << 6) | 4;
        // Header count field = (payload dwords - 1); one payload dword → 0.
        // Built directly, the same way the sibling draw tests build it,
        // rather than encoding a wrong count and masking it back out.
        let hdr = (3u32 << 30) | ((1u32 - 1) << 16) | ((pm4::PM4_DRAW_INDX_2 as u32) << 8);
        mem.write_u32(0x4000_0000, hdr);
        mem.write_u32(0x4000_0004, vgt);
        gpu.extend_write_ptr(2);
        assert!(matches!(
            gpu.execute_one(&mut mem),
            ExecOutcome::Stepped { .. }
        ));
        assert_eq!(gpu.stats.draws_seen, 1);
        let ds = gpu.last_draw.expect("last_draw set");
        assert_eq!(
            ds.primitive,
            crate::draw_state::PrimitiveType::TriangleList
        );
        assert_eq!(ds.vertex_count, 3);
        let p = gpu.last_primitive.as_ref().expect("last_primitive set");
        assert_eq!(p.topology, crate::primitive::HostTopology::TriangleList);
        assert!(!p.rejected);
    }

    /// P8: shader-blob FIFO evicts the oldest non-active blob when the
    /// cache crosses `SHADER_BLOB_CAP`. Active keys are protected.
    #[test]
    fn shader_blob_cap_evicts_oldest() {
        // Sentinel for the protected (active) blob; the bulk keys start at
        // 1_000 so they can never collide with it.
        const ACTIVE_KEY: u32 = u32::MAX;
        const FIRST_KEY: u32 = 1_000;

        let mut gpu = GpuSystem::new();
        gpu.active_vs_key = Some(ACTIVE_KEY);
        gpu.insert_shader_blob(
            ACTIVE_KEY,
            ShaderBlob {
                shader_type: 0,
                dwords: vec![0xAA; 4],
            },
        );

        // Insert `CAP + 10` unique keys; every insert past the cap fires the
        // eviction path.
        let end_key = FIRST_KEY + SHADER_BLOB_CAP as u32 + 10;
        for key in FIRST_KEY..end_key {
            let blob = ShaderBlob {
                shader_type: 0,
                dwords: vec![key; 2],
            };
            gpu.insert_shader_blob(key, blob);
        }

        assert!(gpu.shader_blobs.len() <= SHADER_BLOB_CAP);
        // Active key (u32::MAX) must still be present.
        assert!(gpu.shader_blobs.contains_key(&ACTIVE_KEY));
        // At least one of the ten earliest non-active keys must be gone.
        let any_evicted =
            (FIRST_KEY..FIRST_KEY + 10).any(|key| !gpu.shader_blobs.contains_key(&key));
        assert!(
            any_evicted,
            "expected at least one of the first 10 keys to be evicted, \
             got shader_blobs.len() = {}",
            gpu.shader_blobs.len()
        );
    }

    /// P3b M1: `IM_LOAD_IMMEDIATE` uploads a vertex and a pixel shader inline
    /// and must record them as `active_vs_key` / `active_ps_key` according to
    /// `shader_type`; a subsequent draw must carry those same keys. The keys
    /// are content-hashed, so their concrete values are derived — the test
    /// only asserts that both are set, that they differ (the microcode
    /// differs), and that the captured draw state echoes them.
    #[test]
    fn im_load_records_active_blob_and_draw_carries_it() {
        let mut gpu = GpuSystem::new();
        let mut mem = build_mem();
        gpu.initialize_ring_buffer(0x4000_0000, 10);

        // PM4_IM_LOAD_IMMEDIATE: 4 data dwords = shader_type + start_size
        // + 2 code. Header count field = data_count - 1 = 3. The VS and PS
        // uploads share this header.
        let hdr_im_load =
            (3u32 << 30) | ((4u32 - 1) << 16) | ((pm4::PM4_IM_LOAD_IMMEDIATE as u32) << 8);
        // DRAW_INDX_2: 1 data dword, count field = 0.
        let hdr_draw = (3u32 << 30) | (0u32 << 16) | ((pm4::PM4_DRAW_INDX_2 as u32) << 8);
        let vgt = (3u32 << 16) | (2 << 6) | 4;

        let stream = [
            // Vertex shader upload (5 dwords total).
            hdr_im_load,
            0, // shader_type=0 (vertex)
            2, // start_size: size=2
            0xAAAA_AAAA,
            0xBBBB_BBBB,
            // Pixel shader upload (5 dwords total).
            hdr_im_load,
            1, // shader_type=1 (pixel)
            2,
            0xCCCC_CCCC,
            0xDDDD_DDDD,
            // Draw (2 dwords total).
            hdr_draw,
            vgt,
        ];
        for (slot, &word) in stream.iter().enumerate() {
            mem.write_u32(0x4000_0000 + slot as u32 * 4, word);
        }

        // Total dwords consumed by the 3 packets: 5 + 5 + 2 = 12.
        gpu.extend_write_ptr(12);
        // Drain all three packets.
        for _ in 0..3 {
            let outcome = gpu.execute_one(&mut mem);
            assert!(matches!(outcome, ExecOutcome::Stepped { .. }));
        }

        let vs_key = gpu.active_vs_key.expect("vs key set by IM_LOAD_IMMEDIATE");
        let ps_key = gpu.active_ps_key.expect("ps key set by IM_LOAD_IMMEDIATE");
        assert_ne!(vs_key, ps_key, "VS and PS keys must be distinct");

        let ds = gpu.last_draw.expect("DRAW_INDX_2 captured");
        assert_eq!(ds.vs_blob_key, Some(vs_key));
        assert_eq!(ds.ps_blob_key, Some(ps_key));
    }

    /// Regression guard for shader-cache key collisions. Prior to
    /// content-hash keying, every vertex shader uploaded through
    /// `IM_LOAD_IMMEDIATE` was keyed by its shader_type dword (`0`), so a
    /// second upload replaced the first in `shader_blobs`. Keys are now an
    /// FNV-1a hash of the microcode, so two uploads with different code
    /// dwords must both remain in the cache.
    #[test]
    fn im_load_immediate_distinct_microcode_does_not_collide() {
        let mut gpu = GpuSystem::new();
        let mut mem = build_mem();
        gpu.initialize_ring_buffer(0x4000_0000, 10);

        // Same header for both packets: 4 data dwords, count field = 3.
        let hdr = (3u32 << 30) | ((4u32 - 1) << 16) | ((pm4::PM4_IM_LOAD_IMMEDIATE as u32) << 8);

        let ring_image = [
            // VS shader A.
            (0x4000_0000, hdr),
            (0x4000_0004, 0), // shader_type = vertex
            (0x4000_0008, 2), // size = 2
            (0x4000_000C, 0x1111_1111),
            (0x4000_0010, 0x2222_2222),
            // VS shader B — same stage, different microcode.
            (0x4000_0014, hdr),
            (0x4000_0018, 0), // shader_type = vertex
            (0x4000_001C, 2), // size = 2
            (0x4000_0020, 0x3333_3333),
            (0x4000_0024, 0x4444_4444),
        ];
        for (addr, value) in ring_image {
            mem.write_u32(addr, value);
        }

        // Two 5-dword packets.
        gpu.extend_write_ptr(10);
        for _ in 0..2 {
            let outcome = gpu.execute_one(&mut mem);
            assert!(matches!(outcome, ExecOutcome::Stepped { .. }));
        }

        let cached = gpu.shader_blobs.len();
        assert_eq!(
            cached, 2,
            "two distinct VS shaders must not collide on the same cache key"
        );
    }

    /// P4: an `EVENT_WRITE` whose initiator is `15` (TILE_FLUSH) has to be
    /// dispatched to the resolve handler. The evidence checked here is a
    /// populated `last_resolve` whose fields reflect the pre-seeded
    /// `RB_COPY_*` registers, plus `stats.resolves_total` going to 1.
    #[test]
    fn tile_flush_event_records_resolve() {
        let mut gpu = GpuSystem::new();
        let mut mem = build_mem();
        gpu.initialize_ring_buffer(0x4000_0000, 10);

        // Recognisable RB_COPY_* values for ResolveInfo to pick up.
        let dest_base = 0xDEAD_0000;
        // Height in the upper half, pitch in the lower half.
        let dest_pitch = (720u32 << 16) | 1280u32;
        // copy_dest_format=6 (k_8_8_8_8), copy_dest_endian=0.
        let dest_info = 6u32 << 7;
        // Bit 20: copy_command=1; bit 8: color_clear_enable.
        let copy_control = (1u32 << 20) | (1u32 << 8);

        gpu.register_file
            .write(draw_state::reg::RB_COPY_DEST_BASE, dest_base);
        gpu.register_file
            .write(draw_state::reg::RB_COPY_DEST_PITCH, dest_pitch);
        gpu.register_file
            .write(draw_state::reg::RB_COPY_DEST_INFO, dest_info);
        gpu.register_file
            .write(draw_state::reg::RB_COPY_CONTROL, copy_control);

        // PM4_EVENT_WRITE carries one data dword (the initiator), so the
        // header's count field is 0.
        let hdr = (3u32 << 30) | (0u32 << 16) | ((pm4::PM4_EVENT_WRITE as u32) << 8);
        for (addr, value) in [(0x4000_0000, hdr), (0x4000_0004, 15 /* TILE_FLUSH */)] {
            mem.write_u32(addr, value);
        }
        gpu.extend_write_ptr(2);

        let outcome = gpu.execute_one(&mut mem);
        assert!(matches!(outcome, ExecOutcome::Stepped { .. }));

        assert_eq!(gpu.stats.resolves_total, 1);
        let info = gpu.last_resolve.expect("TILE_FLUSH captured resolve");
        // `dest_base` is masked to the Xenon 29-bit physical range at decode
        // time: `0xDEAD_0000 & 0x1FFF_FFFF = 0x1EAD_0000`.
        assert_eq!(info.dest_base, 0x1EAD_0000);
        assert_eq!(info.dest_pitch_pixels, 1280);
        assert_eq!(info.dest_height_pixels, 720);
        assert_eq!(info.dest_format, 6);
        assert_eq!(info.copy_command, 1);
        assert!(info.color_clear_enable);
    }

    /// P4: executing a DRAW_INDX* packet while a color target is bound must
    /// leave an entry in `rt_cache`, so that later stages (HUD, resolve) can
    /// look the render target up. Also checks the matching
    /// `stats.unique_render_targets` counter.
    #[test]
    fn draw_indx_populates_rt_cache() {
        let mut gpu = GpuSystem::new();
        let mut mem = build_mem();
        gpu.initialize_ring_buffer(0x4000_0000, 10);

        // color0 enabled.
        let mode_control = 0x1;
        // format=2 (k_8_8_8_8 or similar), base=0x10.
        let color_info_0 = (2u32 << 16) | 0x10;
        // Non-zero scissor so the pitch32 calculation is meaningful.
        let scissor_br = (720u32 << 16) | 1280u32;

        gpu.register_file
            .write(draw_state::reg::RB_MODECONTROL, mode_control);
        gpu.register_file
            .write(draw_state::reg::RB_COLOR_INFO_0, color_info_0);
        gpu.register_file
            .write(draw_state::reg::PA_SC_WINDOW_SCISSOR_BR, scissor_br);

        // DRAW_INDX_2: header plus a single data dword.
        let hdr = (3u32 << 30) | (0u32 << 16) | ((pm4::PM4_DRAW_INDX_2 as u32) << 8);
        let vgt = (3u32 << 16) | (2 << 6) | 4;
        for (addr, value) in [(0x4000_0000, hdr), (0x4000_0004, vgt)] {
            mem.write_u32(addr, value);
        }
        gpu.extend_write_ptr(2);

        let outcome = gpu.execute_one(&mut mem);
        assert!(matches!(outcome, ExecOutcome::Stepped { .. }));

        assert_eq!(gpu.rt_cache.len(), 1);
        assert_eq!(gpu.stats.unique_render_targets, 1);
    }

    /// A `PM4_XE_SWAP` packet carrying the swap signature, a frontbuffer
    /// address and a width/height pair must be recorded in `last_swap` and
    /// counted in `stats.swaps_seen`. The first swap is expected to report
    /// `frame_index == 1`.
    #[test]
    fn xe_swap_records_notification() {
        let mut gpu = GpuSystem::new();
        let mut mem = build_mem();
        gpu.initialize_ring_buffer(0x4000_0000, 10);

        // 4 data dwords follow the header, so the count field is 4 - 1 = 3.
        let hdr = (3u32 << 30) | ((4u32 - 1) << 16) | ((pm4::PM4_XE_SWAP as u32) << 8);
        let packet = [
            (0x4000_0000, hdr),
            (0x4000_0004, pm4::SWAP_SIGNATURE),
            (0x4000_0008, 0xCAFE_0000), // frontbuffer address
            (0x4000_000C, 1280),        // width
            (0x4000_0010, 720),         // height
        ];
        for (addr, value) in packet {
            mem.write_u32(addr, value);
        }
        gpu.extend_write_ptr(5);

        let outcome = gpu.execute_one(&mut mem);
        assert!(matches!(outcome, ExecOutcome::Stepped { .. }));

        let swap = gpu.last_swap.unwrap();
        assert_eq!(swap.frame_index, 1);
        assert_eq!(swap.frontbuffer_phys, 0xCAFE_0000);
        assert_eq!(swap.width, 1280);
        assert_eq!(swap.height, 720);
        assert_eq!(gpu.stats.swaps_seen, 1);
    }
}