diff --git a/crates/xenia-gpu/Cargo.toml b/crates/xenia-gpu/Cargo.toml
index fe02e00..ca1a775 100644
--- a/crates/xenia-gpu/Cargo.toml
+++ b/crates/xenia-gpu/Cargo.toml
@@ -11,3 +11,11 @@ tracing = { workspace = true }
 thiserror = { workspace = true }
 anyhow = { workspace = true }
 byteorder = { workspace = true }
+metrics = { workspace = true }
+bytemuck = { workspace = true }
+crossbeam-channel = { workspace = true }
+
+[dev-dependencies]
+# Used to validate bundled WGSL placeholders compile cleanly. Matches the
+# wgpu-22 transitive dep so we don't pull in a second naga version.
+naga = { version = "22", features = ["wgsl-in"] }
diff --git a/crates/xenia-gpu/src/draw_state.rs b/crates/xenia-gpu/src/draw_state.rs
new file mode 100644
index 0000000..0b078a8
--- /dev/null
+++ b/crates/xenia-gpu/src/draw_state.rs
@@ -0,0 +1,1113 @@
+//! Extract draw state from the Xenos register file at `PM4_DRAW_INDX` time.
+//!
+//! This is the "what are we drawing?" snapshot: primitive type, vertex count,
+//! index buffer (if any), viewport, scissor, blend, depth state, and enough
+//! handles for a future translator / uber-shader to pull fetch constants +
+//! shader blobs. Ground truth: `xenia-canary/src/xenia/gpu/draw_util.h` and
+//! the PM4 handler at `pm4_command_processor_implement.h:1128-1151`.
+//!
+//! We only extract what the P3 uber-shader actually consumes; the rest is
+//! reserved for later phases.
+
+use crate::register_file::RegisterFile;
+
+/// Primitive type (Xenos `PrimitiveType` enum from `xenos.h`).
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum PrimitiveType {
+    None,
+    PointList,
+    LineList,
+    LineStrip,
+    TriangleList,
+    TriangleFan,
+    TriangleStrip,
+    RectangleList,
+    QuadList,
+    Unknown(u8),
+}
+
+impl PrimitiveType {
+    pub fn from_bits(b: u32) -> Self {
+        match b & 0x3F {
+            0 => PrimitiveType::None,
+            1 => PrimitiveType::PointList,
+            2 => PrimitiveType::LineList,
+            3 => PrimitiveType::LineStrip,
+            4 => PrimitiveType::TriangleList,
+            5 => PrimitiveType::TriangleFan,
+            6 => PrimitiveType::TriangleStrip,
+            8 => PrimitiveType::RectangleList,
+            13 => PrimitiveType::QuadList,
+            other => PrimitiveType::Unknown(other as u8),
+        }
+    }
+}
+
+/// How the draw was issued per `VGT_DRAW_INITIATOR.source_select`:
+/// 0=DMA, 1=Immediate (in-packet indices), 2=AutoIndex.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum IndexSource {
+    /// Index buffer fetched from `VGT_DMA_BASE` / `VGT_DMA_SIZE`.
+    Dma {
+        base_address: u32,
+        size_dwords: u32,
+        index_size: IndexSize,
+    },
+    /// Indices follow the `DRAW_INDX_2` packet header inline.
+    Immediate { index_size: IndexSize },
+    /// No index buffer; generate `0..vertex_count - 1` on the host.
+    AutoIndex,
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum IndexSize {
+    /// 16-bit indices.
+    Sixteen,
+    /// 32-bit indices.
+    ThirtyTwo,
+}
+
+/// Snapshot of one draw call's state, sampled from the register file.
+#[derive(Debug, Clone, Copy)]
+pub struct DrawState {
+    pub primitive: PrimitiveType,
+    pub vertex_count: u32,
+    pub index_source: IndexSource,
+    pub viewport: Viewport,
+    pub scissor: Scissor,
+    /// RB_COLOR_INFO for each of the 4 possible color render targets; `None`
+    /// where the target is not bound.
+    pub color_info: [Option<ColorTargetInfo>; 4],
+    pub depth_info: Option<DepthTargetInfo>,
+    pub rb_modecontrol: u32,
+    pub rb_colorcontrol: u32,
+    pub rb_depthcontrol: u32,
+    /// P4: per-color-target blend state. Index matches `color_info`.
+    pub rb_blendcontrol: [u32; 4],
+    /// P4: stencil state.
+    pub rb_stencilrefmask: u32,
+    pub rb_stencilrefmask_bf: u32,
+    /// P4: pixel offset applied at rasterization.
+    pub pa_sc_window_offset: u32,
+    /// P4: resolve destination registers (`RB_COPY_*`). These are set by
+    /// the guest just before triggering a TILE_FLUSH event and describe
+    /// where an EDRAM→texture copy should land.
+    pub rb_copy_control: u32,
+    pub rb_copy_dest_base: u32,
+    pub rb_copy_dest_pitch: u32,
+    pub rb_copy_dest_info: u32,
+    /// Key of the VS blob that was active at draw time (from
+    /// `GpuSystem::active_vs_key`). `None` = no VS loaded yet; the draw is
+    /// meaningless and will be rejected by the dispatcher.
+    pub vs_blob_key: Option<u32>,
+    /// Key of the PS blob that was active at draw time.
+    pub ps_blob_key: Option<u32>,
+}
+
+#[derive(Debug, Clone, Copy, Default)]
+pub struct Viewport {
+    pub scale_x: f32,
+    pub scale_y: f32,
+    pub scale_z: f32,
+    pub offset_x: f32,
+    pub offset_y: f32,
+    pub offset_z: f32,
+}
+
+#[derive(Debug, Clone, Copy, Default)]
+pub struct Scissor {
+    pub tl_x: u16,
+    pub tl_y: u16,
+    pub br_x: u16,
+    pub br_y: u16,
+}
+
+#[derive(Debug, Clone, Copy)]
+pub struct ColorTargetInfo {
+    /// EDRAM tile base for this color target (`RB_COLOR_INFO.base_tiles`).
+    pub base_tiles: u16,
+    /// Color format (`RB_COLOR_INFO.color_format`).
+    pub format: u8,
+}
+
+#[derive(Debug, Clone, Copy)]
+pub struct DepthTargetInfo {
+    /// EDRAM tile base for depth/stencil.
+    pub base_tiles: u16,
+    /// 0=D24S8, 1=D24FS8 (per `xenos.h:404-408`).
+    pub format: u8,
+}
+
+/// Resolve source: either one of four color render targets or the depth RT.
+/// Packed into `RB_COPY_CONTROL.copy_src_select` (bits [2:0]): 0..=3 pick
+/// color0..3, 4 picks depth. Canary `registers.h:853`.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum ResolveSource {
+    Color(u8),
+    Depth,
+}
+
+/// Resolve rectangle in pixel coordinates at the destination resolution,
+/// 8-pixel aligned per Canary's `kResolveAlignmentPixels = 8`. MSAA scaling
+/// is kept separate — `sample_count_log2_x/y` tell the resolve how many
+/// samples to step per destination pixel.
+#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
+pub struct ResolveCoordinates {
+    pub x0: u32,
+    pub y0: u32,
+    pub width: u32,
+    pub height: u32,
+    /// 1 iff 4x MSAA (samples laid out 2x wider than pixels).
+    pub sample_count_log2_x: u32,
+    /// 1 iff 2x+ MSAA (samples laid out 2x taller than pixels).
+    pub sample_count_log2_y: u32,
+}
+
+/// Decoded resolve state — describes how a `TILE_FLUSH` event should copy
+/// EDRAM bytes to a guest-memory tiled texture. Canary equivalent:
+/// `draw_util::ResolveInfo` at `draw_util.h:627`. Bit-field layout in
+/// `RB_COPY_CONTROL / RB_COPY_DEST_INFO / RB_COPY_DEST_PITCH` comes from
+/// `registers.h:853-897`.
+#[derive(Debug, Clone, Copy)]
+pub struct ResolveInfo {
+    /// Which source RT (0..=3=color, 4=depth). Raw register bits.
+    pub copy_src_select: u8,
+    /// Sample selector for MSAA sources. See `xenos::CopySampleSelect`.
+    pub copy_sample_select: u8,
+    /// Enable clear of the source render target after the copy.
+    pub color_clear_enable: bool,
+    pub depth_clear_enable: bool,
+    /// 0 = raw tile copy (same format), 1 = convert to `copy_dest_format`.
+    /// 2 = constantOne, 3 = null (no copy).
+    pub copy_command: u8,
+    /// Guest-memory destination address, already masked to the 29-bit
+    /// Xenon physical range (`& 0x1FFF_FFFF`).
+    pub dest_base: u32,
+    /// Destination pitch in pixels (0..=16383). Byte pitch = pitch * bpp
+    /// after the caller pitch-aligns to `kStoragePitchHeightAlignmentBlocks
+    /// = 32`.
+    pub dest_pitch_pixels: u32,
+    pub dest_height_pixels: u32,
+    /// Destination format (`xenos::ColorFormat`, 6 bits).
+    pub dest_format: u8,
+    /// Byte-swap mode applied before the write (`xenos::Endian128`, 0..=5).
+    pub dest_endian: u8,
+    /// Signed [-32, 31] exponent bias applied during conversion.
+    pub dest_exp_bias: i8,
+    /// Decoded resolve source (color0..3 or depth).
+    pub source: ResolveSource,
+    /// 8-pixel-aligned resolve rectangle.
+    pub coords: ResolveCoordinates,
+    /// Source format: `ColorRenderTargetFormat` when color,
+    /// `DepthRenderTargetFormat` when depth.
+    pub source_format: u8,
+    /// EDRAM tile origin of the source RT (from `RB_COLOR_INFO.color_base`
+    /// or `RB_DEPTH_INFO.depth_base`, 11-bit mod 2048).
+    pub source_base_tiles: u16,
+    /// `GetSurfacePitchTiles(surface_pitch, msaa, is_64bpp)` — how many
+    /// 80-sample-wide tiles make up one EDRAM row.
+    pub surface_pitch_tiles: u32,
+    /// MSAA mode from `RB_SURFACE_INFO`.
+    pub msaa: crate::render_target_cache::MsaaSamples,
+    /// True iff the source color format is 64bpp (doubles EDRAM pitch/base).
+    pub source_is_64bpp: bool,
+    /// `RB_COLOR_CLEAR` — constant written into EDRAM when
+    /// `color_clear_enable` is set.
+    pub color_clear_value: u32,
+    /// `RB_COLOR_CLEAR_LO` — second 32-bit lane for 64bpp clear.
+    pub color_clear_value_lo: u32,
+    /// `RB_DEPTH_CLEAR` — constant written into EDRAM depth tiles on
+    /// `depth_clear_enable`.
+    pub depth_clear_value: u32,
+    /// `RB_COPY_DEST_INFO.copy_dest_array` — 2D (false) vs 3D/stacked (true).
+    pub copy_dest_array: bool,
+}
+
+/// `GetSurfacePitchTiles(pitch_pixels, msaa, is_64bpp)` — ported from
+/// `xenos.h:465-476`. Returns the number of 80-sample-wide EDRAM tiles
+/// that make up one row of a surface with `pitch_pixels`-pixel pitch.
+///
+/// At 4x MSAA samples span twice the pixel width, so the sample pitch
+/// doubles. 64bpp formats pack two EDRAM tiles per color value, so the
+/// effective tile pitch doubles again.
+#[inline]
+pub fn surface_pitch_tiles(
+    pitch_pixels: u32,
+    msaa: crate::render_target_cache::MsaaSamples,
+    is_64bpp: bool,
+) -> u32 {
+    use crate::render_target_cache::MsaaSamples;
+    const EDRAM_TILE_WIDTH_SAMPLES: u32 = 80;
+    let pitch_samples = pitch_pixels << u32::from(msaa == MsaaSamples::X4);
+    let pitch_tiles = pitch_samples.div_ceil(EDRAM_TILE_WIDTH_SAMPLES);
+    pitch_tiles << u32::from(is_64bpp)
+}
+
+/// Canary `ColorRenderTargetFormat` is 64bpp iff its numeric value is one
+/// of {5, 7, 15} — i.e. `k_16_16_16_16`, `k_16_16_16_16_FLOAT`, or
+/// `k_32_32_FLOAT`. `xenos.h:297-317` + the enum's `IsColorRenderTarget
+/// Format64bpp` helper.
+#[inline]
+pub fn color_render_target_format_is_64bpp(fmt: u8) -> bool {
+    matches!(fmt, 5 | 7 | 15)
+}
+
+/// `kResolveAlignmentPixels` from Canary (`draw_util.cc:925` area).
+pub const RESOLVE_ALIGNMENT_PIXELS: u32 = 8;
+
+/// Clamp a raw resolve rectangle to the `PA_SC_WINDOW_SCISSOR_*` registers
+/// and align to the 8-pixel grid. Caller passes `i32` because the VF0
+/// derivation can produce negative bounding-box values; this helper clamps
+/// them to the non-negative window defined by the scissor.
+///
+/// Returns `(x0, y0, width, height)` in pixels, all non-negative, all
+/// 8-pixel-aligned, `width`/`height` already `>= 0`. Width/height of 0
+/// signals "empty resolve; skip".
+pub fn resolve_rect_apply_scissor_and_align_8(
+    rf: &RegisterFile,
+    x0_in: i32,
+    y0_in: i32,
+    x1_in: i32,
+    y1_in: i32,
+) -> (u32, u32, u32, u32) {
+    let tl = rf.read(reg::PA_SC_WINDOW_SCISSOR_TL);
+    let br = rf.read(reg::PA_SC_WINDOW_SCISSOR_BR);
+    let tl_x = (tl & 0x3FFF) as i32;
+    let tl_y = ((tl >> 16) & 0x3FFF) as i32;
+    let br_x = (br & 0x3FFF) as i32;
+    let br_y = ((br >> 16) & 0x3FFF) as i32;
+
+    // Clamp only when the scissor is a non-degenerate window; otherwise
+    // leave the input rect alone (Canary's `kResolveAlignmentPixels` will
+    // still 8-align it below).
+    let (mut x0, mut y0, mut x1, mut y1) = (x0_in, y0_in, x1_in, y1_in);
+    if br_x > tl_x && br_y > tl_y {
+        let clamp = |v: i32, lo: i32, hi: i32| v.max(lo).min(hi);
+        x0 = clamp(x0, tl_x, br_x);
+        y0 = clamp(y0, tl_y, br_y);
+        x1 = clamp(x1, tl_x, br_x);
+        y1 = clamp(y1, tl_y, br_y);
+    }
+    if x1 < x0 {
+        x1 = x0;
+    }
+    if y1 < y0 {
+        y1 = y0;
+    }
+
+    // 8-pixel align. Floor top-left; ceil bottom-right.
+    let align_mask = (RESOLVE_ALIGNMENT_PIXELS as i32) - 1;
+    x0 &= !align_mask;
+    y0 &= !align_mask;
+    x1 = (x1 + align_mask) & !align_mask;
+    y1 = (y1 + align_mask) & !align_mask;
+
+    let x0u = x0.max(0) as u32;
+    let y0u = y0.max(0) as u32;
+    let x1u = x1.max(0) as u32;
+    let y1u = y1.max(0) as u32;
+    (
+        x0u,
+        y0u,
+        x1u.saturating_sub(x0u),
+        y1u.saturating_sub(y0u),
+    )
+}
+
+/// Parse vertex fetch constant 0 (Canary `xe_gpu_vertex_fetch_t`,
+/// `xenos.h:1158-1172`) and derive the resolve bounding-box in pixel units.
+/// Returns `None` when the fetch isn't the 6-float vertex buffer the
+/// resolve shader expects (type != kVertex or size != 6).
+///
+/// This mirrors `draw_util.cc:950-1014` minus window-offset and half-pixel
+/// nudging — the pitfalls there are (a) handling endian via `GpuSwap` and
+/// (b) Fixed16p8 top-left rounding `(v + 127) >> 8`. Both are replicated.
+///
+/// The returned rect is in *pixel* coordinates, *pre-scissor-clamp* and
+/// *pre-alignment*. Caller feeds it through
+/// [`resolve_rect_apply_scissor_and_align_8`].
+pub fn vertex_fetch_0_rect(
+    rf: &RegisterFile,
+    mem: &dyn xenia_memory::access::MemoryAccess,
+) -> Option<(i32, i32, i32, i32)> {
+    const CONST_BASE_FETCH: u32 = 0x4800;
+    let dword_0 = rf.read(CONST_BASE_FETCH);
+    let dword_1 = rf.read(CONST_BASE_FETCH + 1);
+
+    // type:2 at bits [1:0]; kVertex = 3 per xenos.h:1147-1152.
+    let fetch_type = dword_0 & 0x3;
+    if fetch_type != 3 {
+        return None;
+    }
+    // size:24 at bits [25:2] of dword_1 — in dwords; expect 6 (3 × vec2).
+    let size = (dword_1 >> 2) & 0x00FF_FFFF;
+    if size != 6 {
+        return None;
+    }
+    // address:30 at bits [31:2] of dword_0 — in dwords.
+    let address_bytes = dword_0 & 0xFFFF_FFFC;
+    // endian:2 at bits [1:0] of dword_1 — xenos::Endian (kNone/k8in16/k8in32/k16in32).
+    let fetch_endian = (dword_1 & 0x3) as u8;
+
+    // Read 6 floats from guest memory. `mem.read_u32` stores BE bytes as a
+    // u32 value; to mirror Canary's "raw LE bytes → u32 → GpuSwap" we have
+    // to re-interpret the memory as LE (flipping what `read_u32` did).
+    let floats: [f32; 6] = std::array::from_fn(|i| {
+        let be_u32 = mem.read_u32(address_bytes.wrapping_add(i as u32 * 4));
+        // `be_u32` was composed from bytes `[b0,b1,b2,b3]` as
+        // `(b0<<24)|...|b3`. Canary reads those same bytes in host-LE,
+        // producing `(b3<<24)|...|b0`. That's `be_u32.swap_bytes()`.
+        let canary_le = be_u32.swap_bytes();
+        let swapped = gpu_swap_u32(canary_le, fetch_endian);
+        f32::from_bits(swapped)
+    });
+
+    // PA_SU_VTX_CNTL::pix_center: bit 0, 0 = kD3DZero (+0.5 half-pixel), 1 = kOpenGL (no offset).
+    // Register index 0x2083 per register_table.inc (PA_SU_VTX_CNTL).
+    const PA_SU_VTX_CNTL: u32 = 0x2083;
+    let half_pixel_offset = if rf.read(PA_SU_VTX_CNTL) & 1 == 0 {
+        0.5f32
+    } else {
+        0.0f32
+    };
+
+    // Convert each to Fixed16p8 (multiply by 256, round).
+    let fixed: [i32; 6] = std::array::from_fn(|i| {
+        ((floats[i] + half_pixel_offset) * 256.0).round() as i32
+    });
+
+    let x0 = fixed[0].min(fixed[2]).min(fixed[4]);
+    let y0 = fixed[1].min(fixed[3]).min(fixed[5]);
+    let x1 = fixed[0].max(fixed[2]).max(fixed[4]);
+    let y1 = fixed[1].max(fixed[3]).max(fixed[5]);
+
+    // Top-left rounding: `(v + 127) >> 8` for both corners.
+    let round = |v: i32| (v + 127) >> 8;
+    Some((round(x0), round(y0), round(x1), round(y1)))
+}
+
+/// Canary `GpuSwapInline` on a u32. Exposed here so the vertex-fetch path
+/// can apply the same byte-order transform Canary's `GpuSwap<float>` applies
+/// to vertex data. `xenos.h:1077-1114`.
+#[inline]
+fn gpu_swap_u32(value: u32, endian: u8) -> u32 {
+    match endian & 0x3 {
+        // kNone.
+        0 => value,
+        // k8in16: swap bytes within each 16-bit word.
+        1 => ((value & 0xFF00FF00) >> 8) | ((value & 0x00FF00FF) << 8),
+        // k8in32: full byte reversal.
+        2 => value.swap_bytes(),
+        // k16in32: swap 16-bit halves.
+        _ => value.rotate_left(16),
+    }
+}
+
+impl ResolveInfo {
+    /// Legacy entrypoint used when the caller already has a `DrawState`. It
+    /// fills only the narrow register bits that live in `DrawState` — the
+    /// wider coordinate / EDRAM fields require the full register file.
+    ///
+    /// Kept for tests that construct resolve decoders from captured draw
+    /// states. `from_register_file` is the canonical path.
+    pub fn from_draw_state(ds: &DrawState) -> Self {
+        use crate::render_target_cache::MsaaSamples;
+        let c = ds.rb_copy_control;
+        let p = ds.rb_copy_dest_pitch;
+        let i = ds.rb_copy_dest_info;
+        // Sign-extend the 6-bit exp_bias from `copy_dest_info[21:16]`.
+        let exp_raw = (i >> 16) & 0x3F;
+        let exp_sign = ((exp_raw & 0x20) != 0) as i8;
+        let exp_bias = (exp_raw as i8) - (exp_sign * 64);
+        let src_sel = (c & 0x7) as u8;
+        let source = if src_sel >= 4 {
+            ResolveSource::Depth
+        } else {
+            ResolveSource::Color(src_sel)
+        };
+        Self {
+            copy_src_select: src_sel,
+            copy_sample_select: ((c >> 4) & 0x7) as u8,
+            color_clear_enable: ((c >> 8) & 1) != 0,
+            depth_clear_enable: ((c >> 9) & 1) != 0,
+            copy_command: ((c >> 20) & 0x3) as u8,
+            dest_base: ds.rb_copy_dest_base & 0x1FFF_FFFF,
+            dest_pitch_pixels: p & 0x3FFF,
+            dest_height_pixels: (p >> 16) & 0x3FFF,
+            dest_format: ((i >> 7) & 0x3F) as u8,
+            dest_endian: (i & 0x7) as u8,
+            dest_exp_bias: exp_bias,
+            source,
+            coords: ResolveCoordinates::default(),
+            source_format: 0,
+            source_base_tiles: 0,
+            surface_pitch_tiles: 0,
+            msaa: MsaaSamples::X1,
+            source_is_64bpp: false,
+            color_clear_value: 0,
+            color_clear_value_lo: 0,
+            depth_clear_value: 0,
+            copy_dest_array: ((i >> 3) & 1) != 0,
+        }
+    }
+
+    /// Canonical resolve decoder — reads live register values and derives the
+    /// full rectangle / EDRAM layout. Mirrors canary `draw_util.cc:926-1318`
+    /// `GetResolveInfo` with the following simplifications (all scoped in
+    /// the landing plan and will be expanded as needs arise):
+    ///
+    /// * The rectangle is derived from the scissor window and
+    ///   `RB_COPY_DEST_PITCH` rather than fetched from vertex fetch 0.
+    ///   Sylpheed's splash uses a clear-resolve — there's no draw ahead
+    ///   of it — so vertex-fetch-derived geometry is not available.
+    /// * `copy_sample_select` is kept as-is; sample averaging for 2x/4x
+    ///   MSAA is not yet applied on the read side.
+    /// * `PA_SC_WINDOW_OFFSET` is not applied — not needed for Sylpheed
+    ///   and canary only applies it when `PA_SU_SC_MODE_CNTL.vtx_window
+    ///   _offset_enable` is set, which requires a live draw.
+    pub fn from_register_file(rf: &RegisterFile) -> Self {
+        use crate::render_target_cache::MsaaSamples;
+        let c = rf.read(reg::RB_COPY_CONTROL);
+        let i = rf.read(reg::RB_COPY_DEST_INFO);
+        let p = rf.read(reg::RB_COPY_DEST_PITCH);
+        let dest_base_raw = rf.read(reg::RB_COPY_DEST_BASE);
+
+        // Sign-extend 6-bit exp_bias from copy_dest_info[21:16].
+        let exp_raw = (i >> 16) & 0x3F;
+        let exp_sign = ((exp_raw & 0x20) != 0) as i8;
+        let exp_bias = (exp_raw as i8) - (exp_sign * 64);
+
+        let src_sel = (c & 0x7) as u8;
+        let source = if src_sel >= 4 {
+            ResolveSource::Depth
+        } else {
+            ResolveSource::Color(src_sel & 0x3)
+        };
+
+        let rb_surface_info = rf.read(reg::RB_SURFACE_INFO);
+        let surface_pitch_pixels = rb_surface_info & 0x3FFF;
+        let msaa = MsaaSamples::from_raw((rb_surface_info >> 16) & 0x3);
+
+        // Source format + base tiles depend on which RT we're reading.
+        let (source_format, source_base_tiles, source_is_64bpp) = match source {
+            ResolveSource::Color(idx) => {
+                let rb = match idx {
+                    0 => rf.read(reg::RB_COLOR_INFO_0),
+                    1 => rf.read(reg::RB_COLOR_INFO_1),
+                    2 => rf.read(reg::RB_COLOR_INFO_2),
+                    _ => rf.read(reg::RB_COLOR_INFO_3),
+                };
+                let fmt = ((rb >> 16) & 0xF) as u8;
+                let base = (rb & 0xFFF) as u16;
+                (fmt, base, color_render_target_format_is_64bpp(fmt))
+            }
+            ResolveSource::Depth => {
+                let rb = rf.read(reg::RB_DEPTH_INFO);
+                let fmt = ((rb >> 16) & 0x1) as u8;
+                let base = (rb & 0xFFF) as u16;
+                (fmt, base, false)
+            }
+        };
+
+        let pitch_tiles = surface_pitch_tiles(surface_pitch_pixels, msaa, source_is_64bpp);
+
+        // --- Rectangle derivation ---
+        // Default extent is (0, 0, dest_pitch, dest_height); subject to
+        // scissor clamp + 8-pixel alignment.
+        let dest_pitch = p & 0x3FFF;
+        let dest_height = (p >> 16) & 0x3FFF;
+        let coords_no_msaa = resolve_rect_apply_scissor_and_align_8(
+            rf,
+            0,
+            0,
+            dest_pitch as i32,
+            dest_height as i32,
+        );
+        let coords = ResolveCoordinates {
+            x0: coords_no_msaa.0,
+            y0: coords_no_msaa.1,
+            width: coords_no_msaa.2,
+            height: coords_no_msaa.3,
+            sample_count_log2_x: u32::from(msaa == MsaaSamples::X4),
+            sample_count_log2_y: u32::from(msaa != MsaaSamples::X1),
+        };
+
+        Self {
+            copy_src_select: src_sel,
+            copy_sample_select: ((c >> 4) & 0x7) as u8,
+            color_clear_enable: ((c >> 8) & 1) != 0,
+            depth_clear_enable: ((c >> 9) & 1) != 0,
+            copy_command: ((c >> 20) & 0x3) as u8,
+            dest_base: dest_base_raw & 0x1FFF_FFFF,
+            dest_pitch_pixels: dest_pitch,
+            dest_height_pixels: dest_height,
+            dest_format: ((i >> 7) & 0x3F) as u8,
+            dest_endian: (i & 0x7) as u8,
+            dest_exp_bias: exp_bias,
+            source,
+            coords,
+            source_format,
+            source_base_tiles,
+            surface_pitch_tiles: pitch_tiles,
+            msaa,
+            source_is_64bpp,
+            color_clear_value: rf.read(reg::RB_COLOR_CLEAR),
+            color_clear_value_lo: rf.read(reg::RB_COLOR_CLEAR_LO),
+            depth_clear_value: rf.read(reg::RB_DEPTH_CLEAR),
+            copy_dest_array: ((i >> 3) & 1) != 0,
+        }
+    }
+
+    /// Memory-aware variant: if vertex fetch 0 contains the D3D9-hack
+    /// "resolve rectangle" vertices (3 vec2 floats, Canary `draw_util.cc
+    /// :950-1014`), use its bounding box as the resolve extent. Falls back
+    /// to the scissor + `RB_COPY_DEST_PITCH/HEIGHT` rect when VF0 isn't a
+    /// 6-dword vertex buffer.
+    ///
+    /// Used from the live TILE_FLUSH path; tests can stick with
+    /// `from_register_file` when they don't want to program VF0.
+    pub fn from_register_file_and_memory(
+        rf: &RegisterFile,
+        mem: &dyn xenia_memory::access::MemoryAccess,
+    ) -> Self {
+        let mut info = Self::from_register_file(rf);
+        if let Some((x0, y0, x1, y1)) = vertex_fetch_0_rect(rf, mem) {
+            let (rx0, ry0, rw, rh) =
+                resolve_rect_apply_scissor_and_align_8(rf, x0, y0, x1, y1);
+            // Only override when the VF0 rect is non-empty — an empty VF0
+            // means the game hasn't set one up yet and we should keep the
+            // scissor+dest default.
+            if rw > 0 && rh > 0 {
+                info.coords.x0 = rx0;
+                info.coords.y0 = ry0;
+                info.coords.width = rw;
+                info.coords.height = rh;
+            }
+        }
+        info
+    }
+}
+
+/// Register indices from `xenia-canary/src/xenia/gpu/registers.h`. Only what
+/// the extractor reads is named here.
+pub mod reg {
+    pub const VGT_DRAW_INITIATOR: u32 = 0x2281;
+    pub const VGT_DMA_BASE: u32 = 0x2282;
+    pub const VGT_DMA_SIZE: u32 = 0x2283;
+    pub const PA_CL_VPORT_XSCALE: u32 = 0x210F;
+    pub const PA_CL_VPORT_XOFFSET: u32 = 0x2110;
+    pub const PA_CL_VPORT_YSCALE: u32 = 0x2111;
+    pub const PA_CL_VPORT_YOFFSET: u32 = 0x2112;
+    pub const PA_CL_VPORT_ZSCALE: u32 = 0x2113;
+    pub const PA_CL_VPORT_ZOFFSET: u32 = 0x2114;
+    pub const PA_SC_WINDOW_SCISSOR_TL: u32 = 0x200E;
+    pub const PA_SC_WINDOW_SCISSOR_BR: u32 = 0x200F;
+    pub const RB_MODECONTROL: u32 = 0x2208;
+    pub const RB_SURFACE_INFO: u32 = 0x2000;
+    pub const RB_COLOR_INFO_0: u32 = 0x2001;
+    pub const RB_COLOR_INFO_1: u32 = 0x2010;
+    pub const RB_COLOR_INFO_2: u32 = 0x2011;
+    pub const RB_COLOR_INFO_3: u32 = 0x2012;
+    pub const RB_DEPTH_INFO: u32 = 0x2002;
+    pub const RB_COLORCONTROL: u32 = 0x2202;
+    pub const RB_DEPTHCONTROL: u32 = 0x2200;
+    // P4 additions — per-RT blend + stencil + window offset + resolve dst.
+    pub const RB_BLENDCONTROL_0: u32 = 0x2201;
+    pub const RB_BLENDCONTROL_1: u32 = 0x2209;
+    pub const RB_BLENDCONTROL_2: u32 = 0x220A;
+    pub const RB_BLENDCONTROL_3: u32 = 0x220B;
+    pub const RB_STENCILREFMASK: u32 = 0x210D;
+    pub const RB_STENCILREFMASK_BF: u32 = 0x210C;
+    pub const PA_SC_WINDOW_OFFSET: u32 = 0x2080;
+    pub const RB_COPY_CONTROL: u32 = 0x2318;
+    pub const RB_COPY_DEST_BASE: u32 = 0x2319;
+    pub const RB_COPY_DEST_PITCH: u32 = 0x231A;
+    pub const RB_COPY_DEST_INFO: u32 = 0x231B;
+    pub const RB_DEPTH_CLEAR: u32 = 0x231D;
+    pub const RB_COLOR_CLEAR: u32 = 0x231E;
+    pub const RB_COLOR_CLEAR_LO: u32 = 0x231F;
+}
+
+/// Build a [`DrawState`] from a `VGT_DRAW_INITIATOR` value + the current
+/// register file. `extra_dma_base`/`extra_dma_size` can override the
+/// DMA fields if the caller has them from the PM4 packet payload (canary
+/// passes them inline with `DRAW_INDX`).
+pub fn extract(
+    register_file: &RegisterFile,
+    vgt_draw_initiator: u32,
+    dma_base: Option<u32>,
+    dma_size: Option<u32>,
+) -> DrawState {
+    // `VGT_DRAW_INITIATOR` bit layout (per canary):
+    //   [5:0]   prim_type
+    //   [7:6]   source_select (0=DMA, 1=immediate, 2=auto)
+    //   [8]     index_size (0=16-bit, 1=32-bit)
+    //   [31:16] num_indices
+    let prim_bits = vgt_draw_initiator & 0x3F;
+    let source_select = (vgt_draw_initiator >> 6) & 0x3;
+    let index_size_bit = (vgt_draw_initiator >> 8) & 0x1;
+    let num_indices = (vgt_draw_initiator >> 16) & 0xFFFF;
+    let index_size = if index_size_bit == 0 {
+        IndexSize::Sixteen
+    } else {
+        IndexSize::ThirtyTwo
+    };
+
+    let index_source = match source_select {
+        0 => IndexSource::Dma {
+            base_address: dma_base.unwrap_or_else(|| register_file.read(reg::VGT_DMA_BASE)),
+            size_dwords: dma_size.unwrap_or_else(|| register_file.read(reg::VGT_DMA_SIZE)),
+            index_size,
+        },
+        1 => IndexSource::Immediate { index_size },
+        _ => IndexSource::AutoIndex,
+    };
+
+    let f = |r: u32| f32::from_bits(register_file.read(r));
+    let viewport = Viewport {
+        scale_x: f(reg::PA_CL_VPORT_XSCALE),
+        scale_y: f(reg::PA_CL_VPORT_YSCALE),
+        scale_z: f(reg::PA_CL_VPORT_ZSCALE),
+        offset_x: f(reg::PA_CL_VPORT_XOFFSET),
+        offset_y: f(reg::PA_CL_VPORT_YOFFSET),
+        offset_z: f(reg::PA_CL_VPORT_ZOFFSET),
+    };
+
+    let tl = register_file.read(reg::PA_SC_WINDOW_SCISSOR_TL);
+    let br = register_file.read(reg::PA_SC_WINDOW_SCISSOR_BR);
+    let scissor = Scissor {
+        tl_x: (tl & 0x7FFF) as u16,
+        tl_y: ((tl >> 16) & 0x7FFF) as u16,
+        br_x: (br & 0x7FFF) as u16,
+        br_y: ((br >> 16) & 0x7FFF) as u16,
+    };
+
+    let rb_modecontrol = register_file.read(reg::RB_MODECONTROL);
+    let color_mask = rb_modecontrol & 0xF;
+    let ci = |reg: u32, present: bool| {
+        if !present {
+            return None;
+        }
+        let raw = register_file.read(reg);
+        Some(ColorTargetInfo {
+            base_tiles: (raw & 0xFFF) as u16,
+            format: ((raw >> 16) & 0xF) as u8,
+        })
+    };
+    let color_info = [
+        ci(reg::RB_COLOR_INFO_0, (color_mask & 0x1) != 0),
+        ci(reg::RB_COLOR_INFO_1, (color_mask & 0x2) != 0),
+        ci(reg::RB_COLOR_INFO_2, (color_mask & 0x4) != 0),
+        ci(reg::RB_COLOR_INFO_3, (color_mask & 0x8) != 0),
+    ];
+    let depth_raw = register_file.read(reg::RB_DEPTH_INFO);
+    // Depth-surface "present" = the RB_MODECONTROL depth-enable bit at bit 4.
+    let depth_present = (rb_modecontrol & 0x10) != 0;
+    let depth_info = if depth_present {
+        Some(DepthTargetInfo {
+            base_tiles: (depth_raw & 0xFFF) as u16,
+            format: ((depth_raw >> 16) & 0x1) as u8,
+        })
+    } else {
+        None
+    };
+
+    DrawState {
+        primitive: PrimitiveType::from_bits(prim_bits),
+        vertex_count: num_indices,
+        index_source,
+        viewport,
+        scissor,
+        color_info,
+        depth_info,
+        rb_modecontrol,
+        rb_colorcontrol: register_file.read(reg::RB_COLORCONTROL),
+        rb_depthcontrol: register_file.read(reg::RB_DEPTHCONTROL),
+        rb_blendcontrol: [
+            register_file.read(reg::RB_BLENDCONTROL_0),
+            register_file.read(reg::RB_BLENDCONTROL_1),
+            register_file.read(reg::RB_BLENDCONTROL_2),
+            register_file.read(reg::RB_BLENDCONTROL_3),
+        ],
+        rb_stencilrefmask: register_file.read(reg::RB_STENCILREFMASK),
+        rb_stencilrefmask_bf: register_file.read(reg::RB_STENCILREFMASK_BF),
+        pa_sc_window_offset: register_file.read(reg::PA_SC_WINDOW_OFFSET),
+        rb_copy_control: register_file.read(reg::RB_COPY_CONTROL),
+        rb_copy_dest_base: register_file.read(reg::RB_COPY_DEST_BASE),
+        rb_copy_dest_pitch: register_file.read(reg::RB_COPY_DEST_PITCH),
+        rb_copy_dest_info: register_file.read(reg::RB_COPY_DEST_INFO),
+        // P3b M1: the kernel-side caller is expected to populate these
+        // via `DrawState { ..extract(...), vs_blob_key, ps_blob_key }` so
+        // the pure-register extraction stays decoupled from `GpuSystem`
+        // state. Default to None so a bare `extract()` stays valid for
+        // unit tests.
+        vs_blob_key: None,
+        ps_blob_key: None,
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn rf() -> RegisterFile {
+        RegisterFile::new()
+    }
+
+    #[test]
+    fn extract_basic_triangle_list_no_rt() {
+        let rf = rf();
+        // prim_type=4 (TriangleList), source=2 (auto), num_indices=6
+        let vgt = (6u32 << 16) | (2 << 6) | 4;
+        let ds = extract(&rf, vgt, None, None);
+        assert_eq!(ds.primitive, PrimitiveType::TriangleList);
+        assert_eq!(ds.vertex_count, 6);
+        assert!(matches!(ds.index_source, IndexSource::AutoIndex));
+        assert!(ds.color_info.iter().all(|c| c.is_none()));
+        assert!(ds.depth_info.is_none());
+    }
+
+    #[test]
+    fn extract_dma_indices_uses_override() {
+        let rf = rf();
+        let vgt = (3u32 << 16) | (0 << 6) | 4; // prim=TriList, source=DMA
+        let ds = extract(&rf, vgt, Some(0xDEAD_0000), Some(6));
+        match ds.index_source {
+            IndexSource::Dma {
+                base_address,
+                size_dwords,
+                index_size,
+            } => {
+                assert_eq!(base_address, 0xDEAD_0000);
+                assert_eq!(size_dwords, 6);
+                assert_eq!(index_size, IndexSize::Sixteen);
+            }
+            other => panic!("expected Dma, got {other:?}"),
+        }
+    }
+
+    #[test]
+    fn color_and_depth_enabled_bits_are_honored() {
+        let mut rf = rf();
+        // rb_modecontrol: color0 + depth enabled (bit0 + bit4)
+        rf.write(reg::RB_MODECONTROL, 0x11);
+        rf.write(reg::RB_COLOR_INFO_0, (2 << 16) | 0x64); // format=2, tile=0x64
+        rf.write(reg::RB_DEPTH_INFO, (1 << 16) | 0x32);
+        let ds = extract(&rf, 4, None, None);
+        let c = ds.color_info[0].unwrap();
+        assert_eq!(c.format, 2);
+        assert_eq!(c.base_tiles, 0x64);
+        let d = ds.depth_info.unwrap();
+        assert_eq!(d.format, 1);
+        assert_eq!(d.base_tiles, 0x32);
+    }
+
+    /// `RB_COPY_DEST_BASE` is a raw 32-bit register, but a Xenon physical
+    /// address is 29-bit (`& 0x1FFF_FFFF`). `ResolveInfo::from_register_file`
+    /// must mask before writes to prevent out-of-range memory accesses.
+    #[test]
+    fn resolve_info_masks_dest_base_to_physical() {
+        let mut rf = rf();
+        rf.write(reg::RB_COPY_DEST_BASE, 0xDEAD_BEEF);
+        let info = ResolveInfo::from_register_file(&rf);
+        assert_eq!(info.dest_base, 0x1EAD_BEEF);
+    }
+
+    /// Scissor ∩ (0, 0, dest_pitch, dest_height), then 8-pixel-aligned per
+    /// Canary `kResolveAlignmentPixels`. Verify that the scissor actually
+    /// tightens the rect (not just degenerates it).
+    #[test]
+    fn resolve_info_derives_8px_aligned_rect_from_scissor_and_dest_pitch() {
+        let mut rf = rf();
+        // Dest pitch/height 1280×720; scissor (5, 5) -> (1000, 717).
+        rf.write(reg::RB_COPY_DEST_PITCH, (720u32 << 16) | 1280u32);
+        rf.write(reg::PA_SC_WINDOW_SCISSOR_TL, (5u32 << 16) | 5u32);
+        rf.write(reg::PA_SC_WINDOW_SCISSOR_BR, (717u32 << 16) | 1000u32);
+        let info = ResolveInfo::from_register_file(&rf);
+        // x0 floors to 0 (was 5 -> &!7 = 0), y0 same.
+        // x1 = min(1280, 1000) = 1000; ceil-to-8 = 1000. y1 = min(720, 717) = 717, ceil = 720.
+        assert_eq!(info.coords.x0, 0);
+        assert_eq!(info.coords.y0, 0);
+        assert_eq!(info.coords.width, 1000);
+        assert_eq!(info.coords.height, 720);
+    }
+
+    /// Non-degenerate scissor outside `dest_pitch/height` clamps to the
+    /// destination extent.
+    #[test]
+    fn resolve_info_scissor_cannot_widen_past_dest() {
+        let mut rf = rf();
+        rf.write(reg::RB_COPY_DEST_PITCH, (16u32 << 16) | 16u32);
+        rf.write(reg::PA_SC_WINDOW_SCISSOR_BR, (1000u32 << 16) | 1000u32);
+        let info = ResolveInfo::from_register_file(&rf);
+        assert_eq!(info.coords.width, 16);
+        assert_eq!(info.coords.height, 16);
+    }
+
+    /// Source decoding: `copy_src_select >= 4` → depth; otherwise Color(idx).
+    #[test]
+    fn resolve_info_decodes_source_select() {
+        let mut rf = rf();
+        rf.write(reg::RB_COPY_CONTROL, 2); // src_select = 2 (color2)
+        let info = ResolveInfo::from_register_file(&rf);
+        assert_eq!(info.source, ResolveSource::Color(2));
+        assert_eq!(info.copy_src_select, 2);
+
+        rf.write(reg::RB_COPY_CONTROL, 4); // src_select = 4 -> depth
+        let info = ResolveInfo::from_register_file(&rf);
+        assert_eq!(info.source, ResolveSource::Depth);
+    }
+
+    /// `copy_dest_info` fields: endian (bits 2:0), format (bits 12:7),
+    /// exp_bias (bits 21:16, signed 6-bit), array (bit 3).
+    #[test]
+    fn resolve_info_decodes_copy_dest_info_fields() {
+        let mut rf = rf();
+        // endian=2 (k8in32), format=6 (k_8_8_8_8), exp_bias=-1 (0x3F), array=1
+        let val = 2u32 | (1u32 << 3) | (6u32 << 7) | (0x3Fu32 << 16);
+        rf.write(reg::RB_COPY_DEST_INFO, val);
+        let info = ResolveInfo::from_register_file(&rf);
+        assert_eq!(info.dest_endian, 2);
+        assert_eq!(info.dest_format, 6);
+        assert_eq!(info.dest_exp_bias, -1);
+        assert!(info.copy_dest_array);
+    }
+
+    /// Positive and negative exp_bias round-trip through the 6-bit
+    /// sign-extension.
+    #[test]
+    fn resolve_info_exp_bias_sign_extends() {
+        let mut rf = rf();
+        rf.write(reg::RB_COPY_DEST_INFO, 1u32 << 16); // exp_bias = +1
+        assert_eq!(ResolveInfo::from_register_file(&rf).dest_exp_bias, 1);
+        rf.write(reg::RB_COPY_DEST_INFO, 0x20u32 << 16); // exp_bias = -32
+        assert_eq!(ResolveInfo::from_register_file(&rf).dest_exp_bias, -32);
+        rf.write(reg::RB_COPY_DEST_INFO, 0x1Fu32 << 16); // exp_bias = +31
+        assert_eq!(ResolveInfo::from_register_file(&rf).dest_exp_bias, 31);
+    }
+
+    /// `RB_SURFACE_INFO`: surface_pitch (bits 13:0) and msaa_samples (bits 17:16)
+    /// feed `surface_pitch_tiles`. 1280 px divides by 80 exactly → 16 tiles
+    /// at 1x MSAA / 32bpp; 4x MSAA doubles the sample pitch.
+    #[test]
+    fn resolve_info_computes_surface_pitch_tiles() {
+        let mut rf = rf();
+        rf.write(reg::RB_COPY_CONTROL, 0); // color0
+        rf.write(reg::RB_COLOR_INFO_0, 0u32 << 16); // k_8_8_8_8 -> 32bpp
+        rf.write(reg::RB_SURFACE_INFO, 1280); // msaa=1x, pitch=1280
+        let info = ResolveInfo::from_register_file(&rf);
+        assert_eq!(info.surface_pitch_tiles, 16);
+        assert!(!info.source_is_64bpp);
+
+        // 4x MSAA widens the sample pitch by 2x.
+        rf.write(reg::RB_SURFACE_INFO, 1280 | (2u32 << 16));
+        let info = ResolveInfo::from_register_file(&rf);
+        assert_eq!(info.surface_pitch_tiles, 32);
+
+        // Non-aligned pitch rounds up.
+        rf.write(reg::RB_SURFACE_INFO, 1281);
+        let info = ResolveInfo::from_register_file(&rf);
+        assert_eq!(info.surface_pitch_tiles, 17);
+    }
+
+    /// `color_render_target_format_is_64bpp` matches the 64bpp enum values
+    /// in `xenos::ColorRenderTargetFormat`: k_16_16_16_16 (5),
+    /// k_16_16_16_16_FLOAT (7), k_32_32_FLOAT (15).
+    #[test]
+    fn color_format_64bpp_table_is_correct() {
+        assert!(!color_render_target_format_is_64bpp(0));
+        assert!(!color_render_target_format_is_64bpp(4));
+        assert!(color_render_target_format_is_64bpp(5));
+        assert!(!color_render_target_format_is_64bpp(6));
+        assert!(color_render_target_format_is_64bpp(7));
+        assert!(!color_render_target_format_is_64bpp(14));
+        assert!(color_render_target_format_is_64bpp(15));
+    }
+
+    /// `surface_pitch_tiles` helper: exact arithmetic including the 64bpp
+    /// doubling. `xenos.h:465-476`.
+    #[test]
+    fn surface_pitch_tiles_matches_canary_helper() {
+        use crate::render_target_cache::MsaaSamples;
+        // 80 px, 1x, 32bpp -> 1 tile exactly.
+        assert_eq!(surface_pitch_tiles(80, MsaaSamples::X1, false), 1);
+        // 81 px, 1x, 32bpp -> 2 tiles (round up).
+        assert_eq!(surface_pitch_tiles(81, MsaaSamples::X1, false), 2);
+        // 80 px, 1x, 64bpp -> 2 tiles (64bpp doubles).
+        assert_eq!(surface_pitch_tiles(80, MsaaSamples::X1, true), 2);
+        // 80 px, 2x, 32bpp -> 1 tile (2x MSAA doesn't widen X).
+        assert_eq!(surface_pitch_tiles(80, MsaaSamples::X2, false), 1);
+        // 80 px, 4x, 32bpp -> 2 tiles (4x MSAA widens X 2x).
+        assert_eq!(surface_pitch_tiles(80, MsaaSamples::X4, false), 2);
+        // 80 px, 4x, 64bpp -> 4 tiles.
+        assert_eq!(surface_pitch_tiles(80, MsaaSamples::X4, true), 4);
+    }
+
+    /// The color-source branch reads from `RB_COLOR_INFO_<idx>` based on
+    /// `copy_src_select`. Verify that index-3 color targets are addressed.
+    #[test]
+    fn resolve_info_color_source_selects_correct_color_info() {
+        let mut rf = rf();
+        rf.write(reg::RB_COPY_CONTROL, 3); // color3
+        rf.write(reg::RB_COLOR_INFO_3, (5u32 << 16) | 0x123); // k_16_16_16_16, base=0x123
+        let info = ResolveInfo::from_register_file(&rf);
+        assert_eq!(info.source, ResolveSource::Color(3));
+        assert_eq!(info.source_format, 5);
+        assert_eq!(info.source_base_tiles, 0x123);
+        assert!(info.source_is_64bpp);
+    }
+
+    /// Depth-source branch reads from `RB_DEPTH_INFO` and parses its
+    /// 1-bit format.
+    #[test]
+    fn resolve_info_depth_source_reads_depth_info() {
+        let mut rf = rf();
+        rf.write(reg::RB_COPY_CONTROL, 4); // depth
+        rf.write(reg::RB_DEPTH_INFO, (1u32 << 16) | 0x55); // kD24FS8, base=0x55
+        let info = ResolveInfo::from_register_file(&rf);
+        assert_eq!(info.source, ResolveSource::Depth);
+        assert_eq!(info.source_format, 1);
+        assert_eq!(info.source_base_tiles, 0x55);
+        assert!(!info.source_is_64bpp); // depth always 32bpp
+    }
+
+    // ---- Vertex fetch 0 rectangle tests -------------------------------
+
+    /// Helper: seed a triangle covering the rectangle `(x0, y0) → (x1, y1)`
+    /// into guest memory at `vb_addr` and program VF0 to read 6 dwords
+    /// from it with endian = k8in32 (the standard D3D-vertex-buffer case).
+    fn seed_vertex_fetch_0(
+        rf: &mut RegisterFile,
+        mem: &xenia_memory::GuestMemory,
+        vb_addr: u32,
+        x0: f32,
+        y0: f32,
+        x1: f32,
+        y1: f32,
+    ) {
+        use xenia_memory::MemoryAccess;
+        // Three (x, y) float pairs covering the rect — exactly the D3D9
+        // resolve triangle layout Canary expects.
+        //   (x0, y0), (x1, y0), (x0, y1)
+        let floats = [x0, y0, x1, y0, x0, y1];
+        for (i, f) in floats.iter().enumerate() {
+            // Write float as BE (PPC `stfs` semantics). `mem.write_u32`
+            // already stores BE bytes; pass the raw u32 bit pattern.
+            mem.write_u32(
+                vb_addr + i as u32 * 4,
+                f.to_bits(),
+            );
+        }
+
+        // VF0 dword 0: address (bits 31:2, in dwords) + type (bits 1:0 = 3).
+        let addr_dwords = vb_addr / 4;
+        let dword_0 = (addr_dwords << 2) | 3;
+        // VF0 dword 1: size (bits 25:2 = 6) + endian (bits 1:0 = 2 = k8in32).
+        let dword_1 = (6u32 << 2) | 2;
+        rf.write(0x4800, dword_0);
+        rf.write(0x4801, dword_1);
+    }
+
+    fn fresh_mem_for_vf0() -> xenia_memory::GuestMemory {
+        use xenia_memory::page_table::MemoryProtect;
+        let mut mem = xenia_memory::GuestMemory::new().expect("guest memory");
+        mem.alloc(
+            0x5000_0000,
+            0x1_0000,
+            MemoryProtect::READ | MemoryProtect::WRITE,
+        )
+        .expect("alloc");
+        mem
+    }
+
+    #[test]
+    fn vf0_rect_returns_none_when_no_vertex_buffer() {
+        let rf = rf();
+        let mem = fresh_mem_for_vf0();
+        assert!(vertex_fetch_0_rect(&rf, &mem).is_none());
+    }
+
+    #[test]
+    fn vf0_rect_returns_none_for_wrong_size() {
+        let mut rf = rf();
+        let mem = fresh_mem_for_vf0();
+        // type=3 (kVertex), size=4 (wrong — should be 6), endian=2.
+        rf.write(0x4800, (0x5000_0000u32) | 3);
+        rf.write(0x4801, (4u32 << 2) | 2);
+        assert!(vertex_fetch_0_rect(&rf, &mem).is_none());
+    }
+
+    #[test]
+    fn vf0_rect_derives_rectangle_from_three_vertices() {
+        let mut rf = rf();
+        let mut mem = fresh_mem_for_vf0();
+        // D3D9 pixel center: +0.5 half-pixel offset applied before Fixed16p8.
+        // Leave PA_SU_VTX_CNTL at 0 (kD3DZero).
+        // Triangle at (0, 0) → (100, 50) → vertex 2 = (0, 50).
+        seed_vertex_fetch_0(&mut rf, &mut mem, 0x5000_0000, 0.0, 0.0, 100.0, 50.0);
+
+        let (x0, y0, x1, y1) = vertex_fetch_0_rect(&rf, &mem).expect("VF0 present");
+        // (0 + 0.5) * 256 = 128. (128 + 127) >> 8 = 0. So x0/y0 = 0.
+        // (100 + 0.5) * 256 = 25728. (25728 + 127) >> 8 = 100.
+        // (50 + 0.5) * 256 = 12928. (12928 + 127) >> 8 = 50.
+        assert_eq!(x0, 0);
+        assert_eq!(y0, 0);
+        assert_eq!(x1, 100);
+        assert_eq!(y1, 50);
+    }
+
+    #[test]
+    fn from_register_file_and_memory_prefers_vf0_rect() {
+        let mut rf = rf();
+        let mut mem = fresh_mem_for_vf0();
+        // Without VF0: dest_pitch/height defaults produce (0, 0, 1280, 720).
+        rf.write(reg::RB_COPY_DEST_PITCH, (720u32 << 16) | 1280u32);
+        // With VF0 pointing at a 256×128 triangle, override to that.
+        seed_vertex_fetch_0(&mut rf, &mut mem, 0x5000_0000, 0.0, 0.0, 256.0, 128.0);
+
+        let info = ResolveInfo::from_register_file_and_memory(&rf, &mem);
+        assert_eq!(info.coords.x0, 0);
+        assert_eq!(info.coords.y0, 0);
+        assert_eq!(info.coords.width, 256);
+        assert_eq!(info.coords.height, 128);
+    }
+
+    /// If VF0 is absent, fall back to the scissor+dest default.
+    #[test]
+    fn from_register_file_and_memory_falls_back_without_vf0() {
+        let mut rf = rf();
+        let mem = fresh_mem_for_vf0();
+        rf.write(reg::RB_COPY_DEST_PITCH, (720u32 << 16) | 1280u32);
+        let info = ResolveInfo::from_register_file_and_memory(&rf, &mem);
+        assert_eq!(info.coords.width, 1280);
+        assert_eq!(info.coords.height, 720);
+    }
+
+    /// `resolve_rect_apply_scissor_and_align_8` with no scissor just
+    /// 8-aligns.
+    #[test]
+    fn scissor_helper_8_aligns_with_no_scissor() {
+        let rf = rf();
+        let (x0, y0, w, h) = resolve_rect_apply_scissor_and_align_8(&rf, 5, 5, 1001, 17);
+        assert_eq!(x0, 0);
+        assert_eq!(y0, 0);
+        // 1001 ceil-to-8 = 1008; 17 ceil-to-8 = 24.
+        assert_eq!(w, 1008);
+        assert_eq!(h, 24);
+    }
+
+    /// Negative bounding-box (VF0 can produce these) clamps to the scissor
+    /// top-left without going below zero.
+    #[test]
+    fn scissor_helper_clamps_negative_to_zero() {
+        let mut rf = rf();
+        // Small scissor at (0,0)..(128, 64).
+        rf.write(reg::PA_SC_WINDOW_SCISSOR_BR, (64u32 << 16) | 128u32);
+        let (x0, y0, w, h) = resolve_rect_apply_scissor_and_align_8(&rf, -50, -50, 80, 32);
+        assert_eq!(x0, 0);
+        assert_eq!(y0, 0);
+        // x1 clamped from 80 -> 80, ceil8 -> 80. y1 32 -> 32.
+        assert_eq!(w, 80);
+        assert_eq!(h, 32);
+    }
+}
diff --git a/crates/xenia-gpu/src/edram.rs b/crates/xenia-gpu/src/edram.rs
new file mode 100644
index 0000000..720569a
--- /dev/null
+++ b/crates/xenia-gpu/src/edram.rs
@@ -0,0 +1,506 @@
+//! CPU-side shadow of the Xenos GPU's 10 MiB EDRAM.
+//!
+//! The real console has 10 MiB of embedded DRAM organised as 2048 tiles,
+//! each 80 × 16 samples wide at 32 bits per sample (`xenos.h:223-285`,
+//! `kEdramTileCount = 2048`). 64-bpp formats pack two adjacent EDRAM tiles
+//! per color value.
+//!
+//! xenia-rs does not currently render through a real EDRAM (host draws go
+//! straight to wgpu attachments), but the resolve path still needs a
+//! concrete byte source. We keep a linear 10 MiB `Vec<u8>` here so:
+//!
+//! * clear-resolves can paint `RB_COLOR_CLEAR` / `RB_DEPTH_CLEAR` into the
+//!   source tiles, which the resolve loop then copies into guest memory
+//!   (this is the Sylpheed-first-pixels path);
+//! * future host→EDRAM readback code has a place to deposit pixels without
+//!   touching the resolve API.
+//!
+//! Byte layout inside one tile: row-major, `80 * 16 * bpp` bytes. At 32bpp,
+//! offset `= y * 80 * 4 + x * 4` from the tile base. Samples are stored in
+//! native-u32 byte order; any Xenon big-endian vs little-endian shuffling
+//! happens at the resolve write boundary, not inside EDRAM.
+//!
+//! Indexing wraps mod 2048 (`XE_GPU_REGISTER` `RB_COLOR_INFO.color_base` is
+//! 11-bit). Canary relies on this wraparound for tall surfaces that
+//! exceed the 10 MiB region.
+
+/// Number of tiles in EDRAM. `xenos::kEdramTileCount`.
+pub const EDRAM_TILE_COUNT: u32 = 2048;
+
+/// Samples per tile along X. `xenos::kEdramTileWidthSamples`.
+pub const EDRAM_TILE_WIDTH_SAMPLES: u32 = 80;
+
+/// Samples per tile along Y. `xenos::kEdramTileHeightSamples`.
+pub const EDRAM_TILE_HEIGHT_SAMPLES: u32 = 16;
+
+/// Bytes per tile at 32bpp: 80 × 16 × 4 = 5120.
+pub const EDRAM_TILE_BYTES_32BPP: u32 =
+    EDRAM_TILE_WIDTH_SAMPLES * EDRAM_TILE_HEIGHT_SAMPLES * 4;
+
+/// Bytes per tile at 64bpp: 80 × 16 × 8 = 10_240 (two adjacent 32bpp tiles).
+pub const EDRAM_TILE_BYTES_64BPP: u32 = EDRAM_TILE_BYTES_32BPP * 2;
+
+/// Total EDRAM size in bytes: 2048 × 5120 = 10_485_760 (exactly 10 MiB).
+pub const EDRAM_SIZE_BYTES: usize = (EDRAM_TILE_COUNT * EDRAM_TILE_BYTES_32BPP) as usize;
+
+/// 10 MiB shadow of the console's EDRAM. Owned by `GpuSystem` and lives for
+/// the lifetime of the GPU; no per-frame allocation.
+pub struct ShadowEdram {
+    bytes: Vec<u8>,
+}
+
+impl Default for ShadowEdram {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl ShadowEdram {
+    pub fn new() -> Self {
+        Self {
+            bytes: vec![0u8; EDRAM_SIZE_BYTES],
+        }
+    }
+
+    /// Raw byte offset of a tile within the shadow buffer, wrapped mod 2048.
+    #[inline]
+    fn tile_byte_offset(tile_index: u32) -> usize {
+        ((tile_index % EDRAM_TILE_COUNT) * EDRAM_TILE_BYTES_32BPP) as usize
+    }
+
+    pub fn as_bytes(&self) -> &[u8] {
+        &self.bytes
+    }
+
+    pub fn tile(&self, tile_index: u32) -> &[u8] {
+        let off = Self::tile_byte_offset(tile_index);
+        &self.bytes[off..off + EDRAM_TILE_BYTES_32BPP as usize]
+    }
+
+    pub fn tile_mut(&mut self, tile_index: u32) -> &mut [u8] {
+        let off = Self::tile_byte_offset(tile_index);
+        &mut self.bytes[off..off + EDRAM_TILE_BYTES_32BPP as usize]
+    }
+
+    /// Sample-space byte offset within the shadow buffer for one 32bpp
+    /// sample at `(x_samples, y_samples)` in a surface whose EDRAM origin
+    /// is `base_tiles` and whose row pitch is `pitch_tiles` 32bpp tiles.
+    ///
+    /// Tile layout: a surface of pitch `P` tiles is laid out as a row of
+    /// `P` tiles followed by the next 16-sample-tall row, etc. Sample
+    /// `(x, y)` lives in tile `(y/16)*P + (x/80)`, at row `y % 16` and
+    /// column `x % 80` within that tile.
+    #[inline]
+    fn sample_offset_32bpp(base_tiles: u16, pitch_tiles: u32, x: u32, y: u32) -> Option<usize> {
+        if pitch_tiles == 0 {
+            return None;
+        }
+        let tile_row = y / EDRAM_TILE_HEIGHT_SAMPLES;
+        let tile_col = x / EDRAM_TILE_WIDTH_SAMPLES;
+        let within_y = y % EDRAM_TILE_HEIGHT_SAMPLES;
+        let within_x = x % EDRAM_TILE_WIDTH_SAMPLES;
+        let tile_index =
+            (base_tiles as u32).wrapping_add(tile_row * pitch_tiles + tile_col);
+        let off = Self::tile_byte_offset(tile_index)
+            + (within_y * EDRAM_TILE_WIDTH_SAMPLES * 4 + within_x * 4) as usize;
+        Some(off)
+    }
+
+    /// Fill a `(w × h)`-sample rectangle at `(x, y)` with a constant 32bpp
+    /// pattern. Coordinates are in *sample space* (already scaled through
+    /// `sample_count_log2_x/y` for MSAA). Wraps mod 2048 tiles via
+    /// `tile_byte_offset`.
+    ///
+    /// The pattern is written as host-native little-endian bytes — the
+    /// endian swap in [`crate::resolve::apply_endian_128`] converts to the
+    /// byte order expected by the destination.
+    #[allow(clippy::too_many_arguments)]
+    pub fn fill_rect_32bpp(
+        &mut self,
+        base_tiles: u16,
+        pitch_tiles: u32,
+        x: u32,
+        y: u32,
+        w: u32,
+        h: u32,
+        pattern: u32,
+    ) {
+        if w == 0 || h == 0 {
+            return;
+        }
+        let le = pattern.to_le_bytes();
+        for dy in 0..h {
+            for dx in 0..w {
+                if let Some(off) = Self::sample_offset_32bpp(
+                    base_tiles,
+                    pitch_tiles,
+                    x + dx,
+                    y + dy,
+                ) && off + 4 <= self.bytes.len()
+                {
+                    self.bytes[off..off + 4].copy_from_slice(&le);
+                }
+            }
+        }
+    }
+
+    /// Read one 32bpp sample at `(x, y)` in sample coordinates. Returns 0
+    /// if the surface pitch is zero (degenerate; caller should skip the
+    /// resolve).
+    pub fn read_sample_32bpp(
+        &self,
+        base_tiles: u16,
+        pitch_tiles: u32,
+        x: u32,
+        y: u32,
+    ) -> u32 {
+        match Self::sample_offset_32bpp(base_tiles, pitch_tiles, x, y) {
+            Some(off) if off + 4 <= self.bytes.len() => u32::from_le_bytes([
+                self.bytes[off],
+                self.bytes[off + 1],
+                self.bytes[off + 2],
+                self.bytes[off + 3],
+            ]),
+            _ => 0,
+        }
+    }
+
+    /// Write one 32bpp sample at `(x, y)` in sample coordinates. Mirror of
+    /// [`Self::read_sample_32bpp`]. Used by the wgpu→ShadowEdram readback
+    /// retile path and unit tests.
+    pub fn write_sample_32bpp(
+        &mut self,
+        base_tiles: u16,
+        pitch_tiles: u32,
+        x: u32,
+        y: u32,
+        sample: u32,
+    ) {
+        if let Some(off) = Self::sample_offset_32bpp(base_tiles, pitch_tiles, x, y)
+            && off + 4 <= self.bytes.len()
+        {
+            self.bytes[off..off + 4].copy_from_slice(&sample.to_le_bytes());
+        }
+    }
+
+    /// Bulk write a `(w × h)`-sample rectangle at `(x, y)` from a row-major
+    /// linear `samples` buffer. The buffer length must be at least `w * h`;
+    /// extra entries are ignored. Order: `samples[dy * w + dx]` lands at
+    /// (x + dx, y + dy). This is the format the wgpu→ShadowEdram readback
+    /// path uses after stripping wgpu's 256-byte row alignment.
+    #[allow(clippy::too_many_arguments)]
+    pub fn write_rect_32bpp(
+        &mut self,
+        base_tiles: u16,
+        pitch_tiles: u32,
+        x: u32,
+        y: u32,
+        w: u32,
+        h: u32,
+        samples: &[u32],
+    ) {
+        if w == 0 || h == 0 {
+            return;
+        }
+        let needed = (w as usize).saturating_mul(h as usize);
+        debug_assert!(samples.len() >= needed, "write_rect_32bpp: samples too short");
+        for dy in 0..h {
+            let row_base = (dy as usize) * (w as usize);
+            for dx in 0..w {
+                let idx = row_base + dx as usize;
+                if idx >= samples.len() {
+                    return;
+                }
+                self.write_sample_32bpp(base_tiles, pitch_tiles, x + dx, y + dy, samples[idx]);
+            }
+        }
+    }
+
+    // --- 64bpp helpers ----------------------------------------------------
+    //
+    // 64bpp formats (`k_16_16_16_16`, `k_16_16_16_16_FLOAT`, `k_32_32_FLOAT`)
+    // occupy two adjacent EDRAM tiles per logical tile, doubling the row
+    // pitch in tiles. Per Canary `xenos.h:321-325 IsColorRenderTargetFormat64bpp`
+    // and `draw_util.cc:1260-1262` (`pitch_tiles = surface_pitch_tiles << is_64bpp`).
+    //
+    // Convention: callers pass the *32bpp-equivalent* `base_tiles` and
+    // `pitch_tiles_32bpp` (i.e. the `RB_COLOR_INFO.color_base` and
+    // `surface_pitch_tiles` decoded from registers). The 64bpp helpers
+    // multiply both by 2 internally so the lo/hi pair lands in adjacent
+    // tiles. `lo` is the lower-addressed 32bpp word; `hi` is the upper.
+
+    /// Read one 64bpp sample as `(lo, hi)` u32 pair. Doubled-tile addressing
+    /// per Canary's `is_64bpp` convention.
+    pub fn read_sample_64bpp(
+        &self,
+        base_tiles: u16,
+        pitch_tiles_32bpp: u32,
+        x: u32,
+        y: u32,
+    ) -> (u32, u32) {
+        let pitch64 = pitch_tiles_32bpp.saturating_mul(2);
+        let base64 = (base_tiles as u32).saturating_mul(2) as u16;
+        let lo = self.read_sample_32bpp(base64, pitch64, x.saturating_mul(2), y);
+        let hi = self.read_sample_32bpp(base64, pitch64, x.saturating_mul(2) + 1, y);
+        (lo, hi)
+    }
+
+    /// Write one 64bpp sample as `(lo, hi)` u32 pair.
+    pub fn write_sample_64bpp(
+        &mut self,
+        base_tiles: u16,
+        pitch_tiles_32bpp: u32,
+        x: u32,
+        y: u32,
+        lo: u32,
+        hi: u32,
+    ) {
+        let pitch64 = pitch_tiles_32bpp.saturating_mul(2);
+        let base64 = (base_tiles as u32).saturating_mul(2) as u16;
+        self.write_sample_32bpp(base64, pitch64, x.saturating_mul(2), y, lo);
+        self.write_sample_32bpp(base64, pitch64, x.saturating_mul(2) + 1, y, hi);
+    }
+
+    /// Bulk write a 64bpp rectangle from a row-major `(lo, hi)` linear
+    /// buffer.
+    #[allow(clippy::too_many_arguments)]
+    pub fn write_rect_64bpp(
+        &mut self,
+        base_tiles: u16,
+        pitch_tiles_32bpp: u32,
+        x: u32,
+        y: u32,
+        w: u32,
+        h: u32,
+        samples: &[(u32, u32)],
+    ) {
+        if w == 0 || h == 0 {
+            return;
+        }
+        for dy in 0..h {
+            let row_base = (dy as usize) * (w as usize);
+            for dx in 0..w {
+                let idx = row_base + dx as usize;
+                if idx >= samples.len() {
+                    return;
+                }
+                let (lo, hi) = samples[idx];
+                self.write_sample_64bpp(base_tiles, pitch_tiles_32bpp, x + dx, y + dy, lo, hi);
+            }
+        }
+    }
+
+    /// Fill a `(w × h)`-sample rectangle with a constant 64bpp pattern.
+    /// `lo` lands at the low-addressed 32bpp word, `hi` at the high one
+    /// — i.e. for clears, callers pass `(lo = RB_COLOR_CLEAR_LO,
+    /// hi = RB_COLOR_CLEAR)` per Canary `draw_util.cc:1302-1303`.
+    #[allow(clippy::too_many_arguments)]
+    pub fn fill_rect_64bpp(
+        &mut self,
+        base_tiles: u16,
+        pitch_tiles_32bpp: u32,
+        x: u32,
+        y: u32,
+        w: u32,
+        h: u32,
+        lo: u32,
+        hi: u32,
+    ) {
+        if w == 0 || h == 0 {
+            return;
+        }
+        for dy in 0..h {
+            for dx in 0..w {
+                self.write_sample_64bpp(
+                    base_tiles,
+                    pitch_tiles_32bpp,
+                    x + dx,
+                    y + dy,
+                    lo,
+                    hi,
+                );
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn shadow_edram_is_exactly_10_mib() {
+        assert_eq!(EDRAM_SIZE_BYTES, 10 * 1024 * 1024);
+        let e = ShadowEdram::new();
+        assert_eq!(e.as_bytes().len(), 10 * 1024 * 1024);
+    }
+
+    #[test]
+    fn fill_rect_writes_the_whole_first_tile() {
+        let mut e = ShadowEdram::new();
+        e.fill_rect_32bpp(0, 1, 0, 0, 80, 16, 0x11223344);
+        // Every 4-byte sample in tile 0 should be 0x11223344 (LE).
+        let expected = 0x11223344u32.to_le_bytes();
+        let tile = e.tile(0);
+        for chunk in tile.chunks_exact(4) {
+            assert_eq!(chunk, expected);
+        }
+    }
+
+    #[test]
+    fn fill_rect_respects_pitch_and_base() {
+        let mut e = ShadowEdram::new();
+        // Surface: pitch=2 tiles, base=5. A 160x16 fill should land in
+        // tiles 5 and 6 — and leave tile 4 / tile 7 / tile 0 untouched.
+        e.fill_rect_32bpp(5, 2, 0, 0, 160, 16, 0xAABBCCDD);
+        let expected = 0xAABBCCDDu32.to_le_bytes();
+        for chunk in e.tile(5).chunks_exact(4) {
+            assert_eq!(chunk, expected);
+        }
+        for chunk in e.tile(6).chunks_exact(4) {
+            assert_eq!(chunk, expected);
+        }
+        assert!(e.tile(4).iter().all(|&b| b == 0));
+        assert!(e.tile(7).iter().all(|&b| b == 0));
+        assert!(e.tile(0).iter().all(|&b| b == 0));
+    }
+
+    #[test]
+    fn fill_rect_wraps_mod_2048() {
+        let mut e = ShadowEdram::new();
+        // base=2047, pitch=2: first tile is 2047, second wraps to 0.
+        e.fill_rect_32bpp(2047, 2, 0, 0, 160, 16, 0xDEAD_BEEF);
+        let expected = 0xDEAD_BEEFu32.to_le_bytes();
+        for chunk in e.tile(2047).chunks_exact(4) {
+            assert_eq!(chunk, expected);
+        }
+        for chunk in e.tile(0).chunks_exact(4) {
+            assert_eq!(chunk, expected);
+        }
+    }
+
+    #[test]
+    fn read_sample_roundtrips_fill_rect() {
+        let mut e = ShadowEdram::new();
+        e.fill_rect_32bpp(3, 1, 0, 0, 80, 16, 0xCAFE_F00D);
+        // Sample any interior point.
+        assert_eq!(e.read_sample_32bpp(3, 1, 0, 0), 0xCAFE_F00D);
+        assert_eq!(e.read_sample_32bpp(3, 1, 79, 15), 0xCAFE_F00D);
+        // Untouched neighbouring tile.
+        assert_eq!(e.read_sample_32bpp(4, 1, 0, 0), 0);
+    }
+
+    #[test]
+    fn zero_pitch_is_a_noop_read() {
+        let e = ShadowEdram::new();
+        assert_eq!(e.read_sample_32bpp(0, 0, 10, 10), 0);
+    }
+
+    /// `write_sample_32bpp` round-trips through `read_sample_32bpp`.
+    #[test]
+    fn write_sample_32bpp_round_trips() {
+        let mut e = ShadowEdram::new();
+        for x in 0..80u32 {
+            for y in 0..16u32 {
+                e.write_sample_32bpp(0, 1, x, y, 0xABCD_0000 | (y << 8) | x);
+            }
+        }
+        for x in 0..80u32 {
+            for y in 0..16u32 {
+                assert_eq!(
+                    e.read_sample_32bpp(0, 1, x, y),
+                    0xABCD_0000 | (y << 8) | x,
+                    "round-trip mismatch at ({x},{y})"
+                );
+            }
+        }
+    }
+
+    /// `write_rect_32bpp` writes row-major samples into the right
+    /// sample-offsets, including across tile boundaries.
+    #[test]
+    fn write_rect_32bpp_crosses_tile_boundary() {
+        let mut e = ShadowEdram::new();
+        // Surface pitch = 2 tiles → x in [0, 160), y in [0, 16). A 100x4
+        // rect at (40, 4) crosses x=80 (tile boundary).
+        let w = 100u32;
+        let h = 4u32;
+        let mut samples = Vec::with_capacity((w * h) as usize);
+        for dy in 0..h {
+            for dx in 0..w {
+                samples.push(0x10000 | (dy << 8) | dx);
+            }
+        }
+        e.write_rect_32bpp(0, 2, 40, 4, w, h, &samples);
+        // Spot-check: (40, 4) lands in tile 0; (140, 4) in tile 1.
+        assert_eq!(e.read_sample_32bpp(0, 2, 40, 4), 0x1_0000);
+        assert_eq!(
+            e.read_sample_32bpp(0, 2, 139, 7),
+            0x10000 | (3 << 8) | 99
+        );
+    }
+
+    /// `read_sample_64bpp` round-trips through `write_sample_64bpp` —
+    /// doubled-pitch addressing keeps lo/hi adjacent in EDRAM bytes.
+    #[test]
+    fn write_read_sample_64bpp_roundtrips() {
+        let mut e = ShadowEdram::new();
+        // Use 32bpp pitch=1, base=0 → 64bpp pitch=2, base=0. A single-tile
+        // 64bpp surface fits 80x16 logical 64bpp samples? No — 80x16 32bpp
+        // samples per tile, 80 logical 64bpp samples per *pair* of tiles,
+        // and our 80×16 region needs 2 tiles. Stick to 16x4 logical 64bpp.
+        for x in 0..16u32 {
+            for y in 0..4u32 {
+                e.write_sample_64bpp(0, 1, x, y, 0xAAAA_0000 | x, 0xBBBB_0000 | y);
+            }
+        }
+        for x in 0..16u32 {
+            for y in 0..4u32 {
+                let (lo, hi) = e.read_sample_64bpp(0, 1, x, y);
+                assert_eq!(lo, 0xAAAA_0000 | x);
+                assert_eq!(hi, 0xBBBB_0000 | y);
+            }
+        }
+    }
+
+    /// `fill_rect_64bpp` writes both the lo and hi clear words across
+    /// a 64bpp surface — matches the `RB_COLOR_CLEAR_LO`/`RB_COLOR_CLEAR`
+    /// convention.
+    #[test]
+    fn fill_rect_64bpp_writes_both_words() {
+        let mut e = ShadowEdram::new();
+        // 16x4 logical 64bpp samples; pitch=1 32bpp tile → 2 64bpp tiles.
+        e.fill_rect_64bpp(0, 1, 0, 0, 16, 4, 0xCAFE_F00D, 0xDEAD_BEEF);
+        for x in 0..16u32 {
+            for y in 0..4u32 {
+                let (lo, hi) = e.read_sample_64bpp(0, 1, x, y);
+                assert_eq!(lo, 0xCAFE_F00D);
+                assert_eq!(hi, 0xDEAD_BEEF);
+            }
+        }
+    }
+
+    /// 64bpp helpers must respect the doubled tile pitch — adjacent logical
+    /// 64bpp samples must land at adjacent 32bpp samples in EDRAM.
+    #[test]
+    fn sixty_four_bpp_uses_doubled_pitch() {
+        let mut e = ShadowEdram::new();
+        e.write_sample_64bpp(0, 1, 5, 0, 0x1111_1111, 0x2222_2222);
+        // The lo word must sit at 32bpp x=10 (5 << 1), hi at x=11.
+        // Doubled pitch -> base=0, pitch=2 32bpp.
+        assert_eq!(e.read_sample_32bpp(0, 2, 10, 0), 0x1111_1111);
+        assert_eq!(e.read_sample_32bpp(0, 2, 11, 0), 0x2222_2222);
+    }
+
+    /// `write_rect_*` with empty dimensions is a no-op.
+    #[test]
+    fn write_rect_empty_is_noop() {
+        let mut e = ShadowEdram::new();
+        e.write_rect_32bpp(0, 1, 0, 0, 0, 5, &[1, 2, 3]);
+        e.write_rect_32bpp(0, 1, 0, 0, 5, 0, &[1, 2, 3]);
+        e.fill_rect_64bpp(0, 1, 0, 0, 0, 5, 1, 2);
+        e.fill_rect_64bpp(0, 1, 0, 0, 5, 0, 1, 2);
+        // Nothing should have been written.
+        assert!(e.as_bytes().iter().all(|&b| b == 0));
+    }
+}
diff --git a/crates/xenia-gpu/src/gpu_system.rs b/crates/xenia-gpu/src/gpu_system.rs
new file mode 100644
index 0000000..079759f
--- /dev/null
+++ b/crates/xenia-gpu/src/gpu_system.rs
@@ -0,0 +1,1753 @@
+//! Xenos GPU system: register file + primary ring buffer + PM4 executor.
+//!
+//! Design notes mirror the approved plan's P2 slice:
+//!
+//! - Runs on the same host thread as the CPU interpreter. Sequential access
+//!   to `GuestMemory` — no locks, no sharing.
+//! - One packet per [`GpuSystem::execute_one`] call. The scheduler calls this
+//!   once per round when `is_ready` returns true. When the packet is a
+//!   `WAIT_REG_MEM` whose condition isn't yet satisfied, we transition to
+//!   [`GpuState::Blocked`] and the scheduler will re-poll us.
+//! - Non-draw opcodes execute for real (register/memory writes, event
+//!   writebacks). Draws (`DRAW_INDX*`, `XE_SWAP`) are classified but not
+//!   rendered yet; they surface state (via spans + the swap hook) for later
+//!   phases to consume.
+//!
+//! Opcode reference: `xenia-canary/src/xenia/gpu/pm4_command_processor_implement.h`.
+
+use std::collections::HashMap;
+use std::sync::Arc;
+use std::sync::atomic::{AtomicBool, AtomicU32, Ordering};
+
+use xenia_memory::MemoryAccess;
+
+use crate::draw_state::{self, DrawState};
+use crate::pm4::{self, PacketKind};
+use crate::primitive::{self, ProcessedPrimitive};
+use crate::register_file::RegisterFile;
+use crate::ring_view::RingBufferView;
+
+/// Cached Xenos microcode blob, produced by `PM4_IM_LOAD*` packets.
+#[derive(Debug, Clone)]
+pub struct ShaderBlob {
+    pub shader_type: u8, // 0 = vertex, 1 = pixel
+    pub dwords: Vec<u32>,
+}
+
+/// P8 — upper bound on the shader-blob cache (`GpuSystem.shader_blobs`).
+/// Canary uses a similar FIFO ceiling; our number is deliberately generous
+/// because blobs are small (a few KiB each at most) and misses force a
+/// re-upload from the guest the next time `IM_LOAD*` fires. 256 is enough
+/// for every shipping game's peak working set, per canary's traces.
+pub const SHADER_BLOB_CAP: usize = 256;
+
+/// PM4 `WAIT_REG_MEM` condition. Bits 0..=2 of the wait_info dword encode the
+/// comparison (per `xenia-canary/src/xenia/gpu/pm4_command_processor_implement.h:685-696`).
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum WaitCmp {
+    /// value < ref
+    Less,
+    /// value <= ref
+    LessEq,
+    /// value == ref
+    Equal,
+    /// value != ref
+    NotEqual,
+    /// value >= ref
+    GreaterEq,
+    /// value > ref
+    Greater,
+    /// Always — caller wants to sleep regardless.
+    Always,
+}
+
+impl WaitCmp {
+    /// Interpret the lower 3 bits of `wait_info` per canary's `MatchValueAndRef`.
+    pub fn from_wait_info(wait_info: u32) -> Self {
+        match wait_info & 0x7 {
+            0 => WaitCmp::Less,
+            1 => WaitCmp::LessEq,
+            2 => WaitCmp::Equal,
+            3 => WaitCmp::NotEqual,
+            4 => WaitCmp::GreaterEq,
+            5 => WaitCmp::Greater,
+            _ => WaitCmp::Always,
+        }
+    }
+
+    pub fn evaluate(self, value: u32, reference: u32) -> bool {
+        match self {
+            WaitCmp::Less => value < reference,
+            WaitCmp::LessEq => value <= reference,
+            WaitCmp::Equal => value == reference,
+            WaitCmp::NotEqual => value != reference,
+            WaitCmp::GreaterEq => value >= reference,
+            WaitCmp::Greater => value > reference,
+            WaitCmp::Always => true,
+        }
+    }
+}
+
+/// Reason the GPU is currently parked. Mirrors the CPU-side scheduler
+/// `BlockReason` shape. Currently only `WaitRegMem` — more to come in later
+/// phases (interrupts, timestamp waits).
+#[derive(Debug, Clone)]
+pub enum GpuBlock {
+    WaitRegMem {
+        poll_addr: u32,
+        is_memory: bool,
+        reference: u32,
+        mask: u32,
+        cmp: WaitCmp,
+    },
+}
+
+impl GpuBlock {
+    /// Probe the wait condition. Returns `true` if the condition holds and
+    /// the GPU should be unparked.
+    pub fn is_satisfied(&self, mem: &dyn MemoryAccess, reg_file: &RegisterFile) -> bool {
+        match self {
+            GpuBlock::WaitRegMem {
+                poll_addr,
+                is_memory,
+                reference,
+                mask,
+                cmp,
+            } => {
+                let value = if *is_memory {
+                    mem.read_u32(*poll_addr)
+                } else {
+                    reg_file.read(*poll_addr)
+                };
+                cmp.evaluate(value & *mask, *reference)
+            }
+        }
+    }
+}
+
+/// Public notification the CP emits when the guest presents a frame. The
+/// kernel `vd_swap` path gates on this to push a `SwapInfo` to the host UI.
+#[derive(Debug, Clone, Copy, Default)]
+pub struct SwapNotification {
+    pub frame_index: u64,
+    pub frontbuffer_phys: u32,
+    pub width: u32,
+    pub height: u32,
+}
+
+/// GPU interrupt (fired by `PM4_INTERRUPT`). The kernel dispatches this to
+/// the guest callback registered by `VdSetGraphicsInterruptCallback`.
+#[derive(Debug, Clone, Copy)]
+pub struct PendingInterrupt {
+    pub source: InterruptSource,
+    pub cpu_mask: u32,
+}
+
+#[derive(Debug, Clone, Copy)]
+pub enum InterruptSource {
+    CommandProcessor,
+    Swap,
+}
+
+/// Per-run counters for observability.
+#[derive(Debug, Clone, Default)]
+pub struct GpuStats {
+    pub packets_executed: u64,
+    pub draws_seen: u64,
+    pub swaps_seen: u64,
+    pub interrupts_emitted: u64,
+    pub wait_reg_mem_blocks: u64,
+    pub indirect_buffer_jumps: u64,
+    /// P4: count of EDRAM→memory resolves triggered by `TILE_FLUSH` events
+    /// (event code 15). Non-zero means the game is committing rendered
+    /// pixels to the frontbuffer / a texture.
+    pub resolves_total: u64,
+    /// Resolves whose byte copy path ran and wrote at least one sample to
+    /// guest memory. Delta against `resolves_total` indicates how many
+    /// resolves were skipped for an unsupported format / MSAA mode / 3D
+    /// destination.
+    pub resolves_copied_total: u64,
+    /// Resolves that were skipped by [`crate::resolve::copy_to_memory`] due
+    /// to an unsupported format path. Logged at `warn` so the reason is
+    /// visible.
+    pub resolves_skipped_total: u64,
+    /// Total number of 32bpp samples written into guest memory across all
+    /// successful resolves. Useful for sanity-checking that a big splash
+    /// frame actually made it out (e.g. 1280×720 = 921_600 samples).
+    pub resolve_samples_written: u64,
+    /// P4: unique render-target keys seen (as managed by the internal
+    /// `RenderTargetCache`). Useful HUD metric for multi-target workloads.
+    pub unique_render_targets: u64,
+}
+
+/// Result of one packet step.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum ExecOutcome {
+    /// Consumed one packet; GPU remains Ready.
+    Stepped { dwords_consumed: u32 },
+    /// Nothing to do right now.
+    Idle,
+    /// Parked on a sync primitive; `GpuSystem::pending_block` has details.
+    Blocked,
+}
+
+/// Shader fetch constants: the game uploads them via `PM4_SET_CONSTANT` type=1
+/// into a 256-dword region. Games then reference them by index when binding
+/// textures / vertex buffers.
+pub const CONST_BASE_ALU: u32 = 0x4000;
+pub const CONST_BASE_FETCH: u32 = 0x4800;
+pub const CONST_BASE_BOOL: u32 = 0x4900;
+pub const CONST_BASE_LOOP: u32 = 0x4908;
+pub const CONST_BASE_REGISTERS: u32 = 0x2000;
+
+/// Atomic mailbox for the handful of GPU registers that CROSS the MMIO
+/// boundary. Guests write into the `0x7FC80000` register aperture; those
+/// writes run through [`crate::mmio_region`] and land in these atomics.
+/// Inside `execute_one` / the scheduler's per-round GPU hook we sample them
+/// to sync `ring.write_offset_dwords`, reflect progress back to the guest,
+/// etc.
+///
+/// Only these three registers need atomic cross-thread access. Everything
+/// else lives in [`GpuSystem::register_file`] which is CPU-thread-local.
+#[derive(Debug, Clone)]
+pub struct GpuMmio {
+    /// `CP_RB_WPTR` — guest writes dword offset of the write pointer.
+    pub cp_rb_wptr: Arc<AtomicU32>,
+    /// `CP_RB_RPTR` — we mirror our internal `ring.read_offset_dwords` here
+    /// so guests polling the register see progress.
+    pub cp_rb_rptr: Arc<AtomicU32>,
+    /// `CP_INT_STATUS` — bit set when an interrupt is pending.
+    pub cp_int_status: Arc<AtomicU32>,
+    /// `CP_INT_ACK` — guest clears the bit after handling.
+    pub cp_int_ack: Arc<AtomicU32>,
+    /// `D1MODE_VBLANK_VLINE_STATUS` (register `0x1951`, MMIO offset `0x6544`).
+    /// Bit 0 = `VBLANK_INT_OCCURRED` — set by the GPU when vblank fires,
+    /// cleared by the guest via write-1-to-clear. Sylpheed's vsync callback
+    /// gates *all* its work on reading bit 0 as set: `lwz; rlwinm. r,r,0,31,31;
+    /// bc 12,2,skip`. Without this bit toggling across vsyncs the callback
+    /// always skips, so the PKEVENT that feeds the render dispatcher
+    /// (user_data + 0x3B28) never gets signaled and the worker loops
+    /// forever.
+    pub d1mode_vblank_vline_status: Arc<AtomicU32>,
+    /// M1.7 parker — set by producers (guest WPTR writes, shutdown) so
+    /// the GPU worker thread does not park when work is pending. The
+    /// worker swaps to `false` on entering its park decision and
+    /// re-checks predicates; if a producer raced between the swap and
+    /// the actual `park_timeout`, the producer's `unpark()` returns the
+    /// park immediately via std's token semantics. Inline mode never
+    /// reads this; the cost is one extra atomic store per WPTR write.
+    pub wake_pending: Arc<AtomicBool>,
+    /// Handle to the GPU worker thread, populated by `GpuWorker::run` on
+    /// startup. The MMIO write callback for `CP_RB_WPTR` `unpark()`s it
+    /// after every guest WPTR write so the worker proceeds without
+    /// waiting for its `park_timeout`. `None` in inline mode (no worker
+    /// to wake), in which case the unpark site is a one-mutex-lock
+    /// no-op.
+    pub worker_thread: Arc<std::sync::Mutex<Option<std::thread::Thread>>>,
+}
+
+impl GpuMmio {
+    pub fn new() -> Self {
+        Self {
+            cp_rb_wptr: Arc::new(AtomicU32::new(0)),
+            cp_rb_rptr: Arc::new(AtomicU32::new(0)),
+            cp_int_status: Arc::new(AtomicU32::new(0)),
+            cp_int_ack: Arc::new(AtomicU32::new(0)),
+            d1mode_vblank_vline_status: Arc::new(AtomicU32::new(0)),
+            wake_pending: Arc::new(AtomicBool::new(false)),
+            worker_thread: Arc::new(std::sync::Mutex::new(None)),
+        }
+    }
+}
+
+impl Default for GpuMmio {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+/// Live GPU system. One instance per `KernelState`.
+pub struct GpuSystem {
+    pub register_file: RegisterFile,
+    pub ring: RingBufferView,
+    /// Stack of saved rings for `PM4_INDIRECT_BUFFER` nesting. The active
+    /// ring is always `ring`; when an IB packet arrives, we push `ring` onto
+    /// this stack and replace `ring` with the IB view. On IB completion
+    /// (read pointer catches up to size), we pop.
+    ib_stack: Vec<RingBufferView>,
+    /// Cached shader blobs keyed by the raw CP register address that loaded them.
+    pub shader_blobs: HashMap<u32, ShaderBlob>,
+    /// P8 — FIFO of blob keys for bounded eviction. On `IM_LOAD*` the
+    /// new key is pushed to the back; if the blob count exceeds
+    /// [`SHADER_BLOB_CAP`], the front is popped and removed from
+    /// `shader_blobs`. Prevents long-running guests from growing the
+    /// cache without bound. The two *active* keys (`active_vs_key` +
+    /// `active_ps_key`) are never evicted — safeguard in `evict_oldest`.
+    pub shader_blob_order: std::collections::VecDeque<u32>,
+    /// Monotonic frame counter (bumped on `PM4_XE_SWAP`).
+    pub swap_counter: u64,
+    /// Most recent swap notification; the kernel polls this after `execute_one`
+    /// to decide whether to push a UI swap event.
+    pub last_swap: Option<SwapNotification>,
+    /// Queue of interrupts not yet delivered to the guest. Private so that
+    /// callers go through [`Self::take_pending_interrupts`] — M1 step 6
+    /// then redirects this drain into a `crossbeam_channel::Sender` without
+    /// re-touching every call site.
+    pending_interrupts: Vec<PendingInterrupt>,
+    /// Current stall reason, if any.
+    pub pending_block: Option<GpuBlock>,
+    pub stats: GpuStats,
+    /// For the 64-bit bin mask/select we split hi/lo writes.
+    pub bin_mask: u64,
+    pub bin_select: u64,
+    /// Shared-atomic mailbox for the `0x7FC80000` MMIO aperture. Cloned into
+    /// the [`crate::mmio_region`] callbacks; clone-ownership keeps the bus
+    /// side and the executor side in sync without locks.
+    pub mmio: GpuMmio,
+    /// Most recent draw state extracted at a `PM4_DRAW_INDX*` packet. The
+    /// uber-shader pipeline in P3+ reads this to build its wgpu draw call.
+    pub last_draw: Option<DrawState>,
+    /// Most recent processed primitive — index rewrite + host topology
+    /// decision. Separate from `last_draw` because its `rewritten_indices`
+    /// may be large and callers may want to drop it after consumption.
+    pub last_primitive: Option<ProcessedPrimitive>,
+    /// Key in `shader_blobs` of the currently-active vertex shader. Set by
+    /// `PM4_IM_LOAD[_IMMEDIATE]` for `ShaderType::kVertex` and read at
+    /// `PM4_DRAW_INDX*` time so the host side can upload the matching
+    /// microcode bytes before dispatching.
+    pub active_vs_key: Option<u32>,
+    /// Key in `shader_blobs` of the currently-active pixel shader. Set by
+    /// `PM4_IM_LOAD[_IMMEDIATE]` for `ShaderType::kPixel`.
+    pub active_ps_key: Option<u32>,
+    /// P4: EDRAM tile-ownership bookkeeping + per-key RT descriptors. Updated
+    /// at each `PM4_DRAW_INDX*` (`bind` + `claim_tiles`) and queried by
+    /// `TILE_FLUSH` event handling to decide resolve sources.
+    pub rt_cache: crate::render_target_cache::RenderTargetCache,
+    /// P4: most recent `ResolveInfo` observed when `TILE_FLUSH` fired. The UI
+    /// bridge surfaces this in the HUD so users can tell when a game is
+    /// resolving to the frontbuffer versus an off-screen target.
+    pub last_resolve: Option<crate::draw_state::ResolveInfo>,
+    /// P5: CPU-side decoded-texture cache (shared across draws within a
+    /// frame; trimmed implicitly by insertion). `ensure_cached` hits this
+    /// on every texture-fetch resolution; the UI thread sees the decoded
+    /// bytes via `UiBridge::publish_texture`.
+    pub texture_cache: crate::texture_cache::TextureCache,
+    /// 10 MiB shadow of the Xenos EDRAM. Written by clear-resolves and
+    /// (future) host-render-target readback; read by the resolve byte-copy
+    /// path that writes tiled pixels into guest memory. Allocated once at
+    /// `GpuSystem::new` and lives for the whole GPU lifetime — no
+    /// per-frame churn.
+    pub edram: crate::edram::ShadowEdram,
+}
+
+impl GpuSystem {
+    pub fn new() -> Self {
+        Self {
+            register_file: RegisterFile::new(),
+            ring: RingBufferView::new(),
+            ib_stack: Vec::new(),
+            shader_blobs: HashMap::new(),
+            shader_blob_order: std::collections::VecDeque::with_capacity(SHADER_BLOB_CAP + 1),
+            swap_counter: 0,
+            last_swap: None,
+            pending_interrupts: Vec::new(),
+            pending_block: None,
+            stats: GpuStats::default(),
+            bin_mask: 0,
+            bin_select: 0,
+            mmio: GpuMmio::new(),
+            last_draw: None,
+            last_primitive: None,
+            active_vs_key: None,
+            active_ps_key: None,
+            rt_cache: crate::render_target_cache::RenderTargetCache::new(),
+            last_resolve: None,
+            texture_cache: crate::texture_cache::TextureCache::new(),
+            edram: crate::edram::ShadowEdram::new(),
+        }
+    }
+
+    /// P8 — insert a shader blob + bump the FIFO so long-running games
+    /// don't grow `shader_blobs` without bound. Caps at [`SHADER_BLOB_CAP`].
+    /// Never evicts the currently-active VS/PS blobs (if they ended up at
+    /// the front of the queue, we skip past them).
+    fn insert_shader_blob(&mut self, key: u32, blob: ShaderBlob) {
+        let already_present = self.shader_blobs.contains_key(&key);
+        self.shader_blobs.insert(key, blob);
+        if !already_present {
+            self.shader_blob_order.push_back(key);
+            metrics::counter!("gpu.shader.blob_seen").increment(1);
+        }
+        while self.shader_blobs.len() > SHADER_BLOB_CAP {
+            // Pop the oldest key that isn't one of the active ones.
+            let mut evicted = None;
+            for _ in 0..self.shader_blob_order.len() {
+                if let Some(candidate) = self.shader_blob_order.pop_front() {
+                    if Some(candidate) == self.active_vs_key
+                        || Some(candidate) == self.active_ps_key
+                    {
+                        self.shader_blob_order.push_back(candidate);
+                        continue;
+                    }
+                    self.shader_blobs.remove(&candidate);
+                    evicted = Some(candidate);
+                    break;
+                }
+            }
+            if evicted.is_some() {
+                metrics::counter!("gpu.shader.blob_evicted").increment(1);
+            } else {
+                // All remaining blobs are active — can't evict, stop.
+                break;
+            }
+        }
+    }
+
+    /// P4: event dispatch helper called by every `PM4_EVENT_WRITE*` variant.
+    /// `event_code` is the low 6 bits of the initiator word (see canary's
+    /// `xenos::Event` enum — code 15 is `TILE_FLUSH`, the resolve trigger).
+    ///
+    /// Handle a `PM4_EVENT_WRITE*` initiator. For `TILE_FLUSH` (event 15)
+    /// we decode the live `RB_*` register state into a [`ResolveInfo`],
+    /// paint any clear values into the shadow EDRAM, and then copy bytes
+    /// from the source render target into guest memory at
+    /// `RB_COPY_DEST_BASE`. That last step is what `VdSwap` needs to see
+    /// non-empty front-buffer pixels — see `memory/project_xenia_rs_edram
+    /// _resolve_gap.md` for the history of this path.
+    fn handle_event_initiator(&mut self, event_code: u32, mem: &dyn MemoryAccess) {
+        const EVENT_TILE_FLUSH: u32 = 15;
+        if event_code != EVENT_TILE_FLUSH {
+            return;
+        }
+        let info = draw_state::ResolveInfo::from_register_file_and_memory(
+            &self.register_file,
+            mem,
+        );
+        self.stats.resolves_total += 1;
+        metrics::counter!(
+            "gpu.resolve",
+            "src" => format!("{}", info.copy_src_select),
+            "fmt" => format!("{}", info.dest_format),
+            "cmd" => format!("{}", info.copy_command),
+        )
+        .increment(1);
+        tracing::info!(
+            src = info.copy_src_select,
+            dst_base = format_args!("{:#010x}", info.dest_base),
+            w = info.coords.width,
+            h = info.coords.height,
+            pitch = info.dest_pitch_pixels,
+            fmt = info.dest_format,
+            endian = info.dest_endian,
+            clear_color = info.color_clear_enable,
+            clear_depth = info.depth_clear_enable,
+            "gpu: TILE_FLUSH resolve"
+        );
+
+        // Paint clear values into the shadow EDRAM at the source tile
+        // range *before* the copy. Games often issue a clear-then-resolve
+        // as a single TILE_FLUSH: the EDRAM is filled with `RB_COLOR_CLEAR`
+        // by the clear part, and that's what the copy part reads.
+        //
+        // Sample coordinates are pixel coordinates scaled up by
+        // `sample_count_log2_x/y` — for 1x MSAA (Sylpheed) this is the
+        // identity.
+        if info.color_clear_enable
+            && let draw_state::ResolveSource::Color(_) = info.source
+            && info.surface_pitch_tiles > 0
+        {
+            let sx = info.coords.x0 << info.coords.sample_count_log2_x;
+            let sy = info.coords.y0 << info.coords.sample_count_log2_y;
+            let sw = info.coords.width << info.coords.sample_count_log2_x;
+            let sh = info.coords.height << info.coords.sample_count_log2_y;
+            // 64bpp clears use both `RB_COLOR_CLEAR_LO` (low 32 bits) and
+            // `RB_COLOR_CLEAR` (high 32 bits) per Canary `draw_util.cc:1302-1303`.
+            // 32bpp clears ignore the lo word entirely.
+            if info.source_is_64bpp {
+                self.edram.fill_rect_64bpp(
+                    info.source_base_tiles,
+                    info.surface_pitch_tiles,
+                    sx,
+                    sy,
+                    sw,
+                    sh,
+                    info.color_clear_value_lo,
+                    info.color_clear_value,
+                );
+            } else {
+                self.edram.fill_rect_32bpp(
+                    info.source_base_tiles,
+                    info.surface_pitch_tiles,
+                    sx,
+                    sy,
+                    sw,
+                    sh,
+                    info.color_clear_value,
+                );
+            }
+        }
+        if info.depth_clear_enable && info.surface_pitch_tiles > 0 {
+            let sx = info.coords.x0 << info.coords.sample_count_log2_x;
+            let sy = info.coords.y0 << info.coords.sample_count_log2_y;
+            let sw = info.coords.width << info.coords.sample_count_log2_x;
+            let sh = info.coords.height << info.coords.sample_count_log2_y;
+            // Depth tiles live at RB_DEPTH_INFO.depth_base regardless of
+            // which source this resolve selects.
+            let rb_depth_info = self.register_file.read(draw_state::reg::RB_DEPTH_INFO);
+            let depth_base = (rb_depth_info & 0xFFF) as u16;
+            self.edram.fill_rect_32bpp(
+                depth_base,
+                info.surface_pitch_tiles,
+                sx,
+                sy,
+                sw,
+                sh,
+                info.depth_clear_value,
+            );
+        }
+
+        // Byte copy into guest memory.
+        let stats = crate::resolve::copy_to_memory(&info, &self.edram, mem);
+        if stats.supported && stats.samples_written > 0 {
+            self.stats.resolves_copied_total += 1;
+            self.stats.resolve_samples_written += stats.samples_written as u64;
+        } else if !stats.supported {
+            self.stats.resolves_skipped_total += 1;
+        }
+
+        self.last_resolve = Some(info);
+    }
+
+    /// Sync state with the MMIO atomic mailbox. Call once at the top of the
+    /// per-round GPU hook: the guest may have advanced `CP_RB_WPTR` since
+    /// we last ran, and we in turn reflect our read-pointer back to the
+    /// mirror register so the guest sees progress.
+    pub fn sync_with_mmio(&mut self) {
+        let wptr_dwords = self.mmio.cp_rb_wptr.load(Ordering::Relaxed);
+        if wptr_dwords != self.ring.write_offset_dwords && self.ring.size_dwords != 0 {
+            self.ring.write_offset_dwords = wptr_dwords % self.ring.size_dwords;
+        }
+        // Mirror our read pointer.
+        self.mmio
+            .cp_rb_rptr
+            .store(self.ring.read_offset_dwords, Ordering::Relaxed);
+    }
+
+    /// True iff `execute_one` is expected to make progress without blocking.
+    pub fn is_ready(&self, mem: &dyn MemoryAccess) -> bool {
+        if let Some(block) = &self.pending_block {
+            return block.is_satisfied(mem, &self.register_file);
+        }
+        self.ring.has_pending()
+    }
+
+    /// Execute exactly one PM4 packet. Returns [`ExecOutcome::Idle`] when
+    /// there's nothing to do, [`ExecOutcome::Blocked`] if a sync primitive
+    /// stalls the GPU, otherwise [`ExecOutcome::Stepped`] with the number of
+    /// dwords consumed (counting the header).
+    pub fn execute_one(&mut self, mem: &dyn MemoryAccess) -> ExecOutcome {
+        // 0) If currently parked, probe the condition and either wake up or stay blocked.
+        if let Some(block) = self.pending_block.clone() {
+            if block.is_satisfied(mem, &self.register_file) {
+                tracing::debug!(?block, "gpu: wait satisfied — resuming");
+                self.pending_block = None;
+            } else {
+                return ExecOutcome::Blocked;
+            }
+        }
+        if !self.ring.has_pending() {
+            // End of current ring. If we were inside an indirect buffer, pop
+            // and resume the caller.
+            if let Some(caller) = self.ib_stack.pop() {
+                self.ring = caller;
+                if self.ring.has_pending() {
+                    return self.execute_one(mem);
+                }
+            }
+            return ExecOutcome::Idle;
+        }
+        let header_addr = self.ring.addr_at_offset(0).unwrap();
+        let header_word = mem.read_u32(header_addr);
+        let packet = pm4::decode(header_word);
+        tracing::trace!(
+            header = format_args!("{header_word:#010x}"),
+            addr = format_args!("{header_addr:#010x}"),
+            ?packet.kind,
+            "gpu: packet"
+        );
+        let consumed = match packet.kind {
+            PacketKind::Type0 { base_index, count, write_one } => {
+                self.handle_type0(mem, base_index, count, write_one, packet.total_dwords)
+            }
+            PacketKind::Type1 { reg_index_1, reg_index_2 } => {
+                self.handle_type1(mem, reg_index_1, reg_index_2)
+            }
+            PacketKind::Type2 => 1,
+            PacketKind::Type3 {
+                opcode,
+                count,
+                predicated,
+            } => match self.handle_type3(mem, opcode, count, predicated, packet.total_dwords) {
+                Type3Result::Consumed(n) => n,
+                Type3Result::Blocked { rewind_to_header } => {
+                    // Re-park on this packet so the resume path re-reads it.
+                    if rewind_to_header {
+                        // We haven't moved read ptr yet, so this is a no-op —
+                        // documented to keep intent explicit.
+                    }
+                    return ExecOutcome::Blocked;
+                }
+            },
+        };
+        self.ring.advance_read(consumed);
+        self.writeback_read_ptr(mem);
+        self.stats.packets_executed += 1;
+        ExecOutcome::Stepped {
+            dwords_consumed: consumed,
+        }
+    }
+
+    /// First-Pixels M2b — kernel-side commit point for `VdSwap`. Prior to
+    /// M2b the kernel's [`vd_swap`] wrote a synthetic `PM4_XE_SWAP` packet
+    /// at the guest-provided `buffer_ptr` + advanced our ring `wptr` by 64
+    /// dwords, expecting the drain to pick it up. That mechanism misaligned:
+    /// the drain reads from `ring.base + rptr * 4` forward, not from the
+    /// game's out-of-band `buffer_ptr`. 512 ring packets executed through
+    /// 1 B guest instructions but `swaps_seen` stayed at 0.
+    ///
+    /// `VdSwap` is the kernel's commit point by definition — we don't need
+    /// to launder the event through the ring. Call this directly from the
+    /// kernel-side handler; the `PM4_XE_SWAP` opcode path still works for
+    /// the (rare) case of a game that emits the packet through its own ring
+    /// writes.
+    pub fn notify_xe_swap(&mut self, frontbuffer_phys: u32, width: u32, height: u32) {
+        self.stats.swaps_seen += 1;
+        self.swap_counter = self.swap_counter.wrapping_add(1);
+        self.last_swap = Some(SwapNotification {
+            frame_index: self.swap_counter,
+            frontbuffer_phys,
+            width,
+            height,
+        });
+        self.pending_interrupts.push(PendingInterrupt {
+            source: InterruptSource::Swap,
+            cpu_mask: 0x1,
+        });
+        tracing::info!(
+            frame = self.swap_counter,
+            fb = format_args!("{frontbuffer_phys:#010x}"),
+            width,
+            height,
+            "gpu: XE_SWAP (kernel-direct)"
+        );
+    }
+
+    /// Called by `VdInitializeRingBuffer` to give us the primary ring.
+    pub fn initialize_ring_buffer(&mut self, base: u32, size_log2: u32) {
+        let size_bytes = 1u32 << size_log2.min(31);
+        self.ring.base = base;
+        self.ring.size_dwords = size_bytes / 4;
+        self.ring.read_offset_dwords = 0;
+        // `write_offset` is driven by the guest — start at 0 so the ring
+        // appears empty until MMIO writes advance it.
+        self.ring.write_offset_dwords = 0;
+        tracing::info!(
+            base = format_args!("{base:#010x}"),
+            size_bytes,
+            size_dwords = self.ring.size_dwords,
+            "gpu: ring initialized"
+        );
+    }
+
+    /// Called by `VdEnableRingBufferRPtrWriteBack` to record where the guest
+    /// expects us to mirror `read_offset_dwords`.
+    pub fn enable_rptr_writeback(&mut self, addr: u32, block_log2: u32) {
+        self.ring.rptr_writeback_addr = addr;
+        self.ring.rptr_writeback_block_dwords = 1u32 << block_log2.min(31);
+        tracing::info!(
+            addr = format_args!("{addr:#010x}"),
+            block_dwords = self.ring.rptr_writeback_block_dwords,
+            "gpu: rptr writeback enabled"
+        );
+    }
+
+    /// Drain the pending-interrupt queue. The kernel calls this once per
+    /// scheduler round and queues each entry into `interrupts.queue_interrupt`.
+    ///
+    /// M1 step 6 swaps the `Vec`-backed implementation for a
+    /// `crossbeam_channel::Sender<PendingInterrupt>`. Routing every external
+    /// reader through this single accessor in step 2 means that swap is a
+    /// localized change — no call site changes.
+    ///
+    /// Returns the previously-queued interrupts and leaves the internal queue
+    /// empty. Cheap (`Vec::take`); no allocation when the queue is already
+    /// empty.
+    pub fn take_pending_interrupts(&mut self) -> Vec<PendingInterrupt> {
+        std::mem::take(&mut self.pending_interrupts)
+    }
+
+    /// True when the pending-interrupt queue has at least one entry. Used
+    /// by callers that want to short-circuit an empty drain (saving the
+    /// `Vec::new()` allocation that `take` would otherwise force on every
+    /// scheduler round).
+    pub fn has_pending_interrupts(&self) -> bool {
+        !self.pending_interrupts.is_empty()
+    }
+
+    /// Extend the logical write pointer by `dwords` (cumulative). `VdSwap`
+    /// reserves 64 dwords then calls this; MMIO writes to `CP_RB_WPTR` will
+    /// do the same in P2+.
+    pub fn extend_write_ptr(&mut self, new_wptr_dwords: u32) {
+        if self.ring.size_dwords == 0 {
+            return;
+        }
+        self.ring.write_offset_dwords = new_wptr_dwords % self.ring.size_dwords;
+    }
+
+    /// Write the current read pointer back to the guest-registered
+    /// address. M1.8 uses the fenced variant: when the GPU runs on its
+    /// own host thread, the CPU can poll this RPTR mirror to learn how
+    /// far the GPU has consumed the ring; the Release fence ensures any
+    /// upstream packet effects (memory writes, register file updates
+    /// the guest reads via subsequent MMIO) happen-before the
+    /// CPU-visible RPTR bump.
+    fn writeback_read_ptr(&mut self, mem: &dyn MemoryAccess) {
+        if self.ring.rptr_writeback_addr != 0 && self.ring.is_initialized() {
+            mem.write_u32_fence(
+                self.ring.rptr_writeback_addr,
+                self.ring.read_offset_dwords,
+            );
+        }
+    }
+
+    // ── Type-0/1 handlers ─────────────────────────────────────────────────
+
+    fn handle_type0(
+        &mut self,
+        mem: &dyn MemoryAccess,
+        base_index: u32,
+        count: u32,
+        write_one: bool,
+        total_dwords: u32,
+    ) -> u32 {
+        for i in 0..count {
+            let dword_addr = self.ring.addr_at_offset(1 + i).unwrap();
+            let value = mem.read_u32(dword_addr);
+            let target = if write_one { base_index } else { base_index + i };
+            self.register_file.write(target, value);
+        }
+        tracing::trace!(
+            base = format_args!("{base_index:#x}"),
+            count,
+            write_one,
+            "gpu: Type0 reg write run"
+        );
+        total_dwords
+    }
+
+    fn handle_type1(
+        &mut self,
+        mem: &dyn MemoryAccess,
+        reg_index_1: u32,
+        reg_index_2: u32,
+    ) -> u32 {
+        let a_addr = self.ring.addr_at_offset(1).unwrap();
+        let b_addr = self.ring.addr_at_offset(2).unwrap();
+        let a = mem.read_u32(a_addr);
+        let b = mem.read_u32(b_addr);
+        self.register_file.write(reg_index_1, a);
+        self.register_file.write(reg_index_2, b);
+        tracing::trace!(
+            r1 = format_args!("{reg_index_1:#x}"),
+            r2 = format_args!("{reg_index_2:#x}"),
+            "gpu: Type1 dual reg write"
+        );
+        3
+    }
+
+    // ── Type-3 dispatch ───────────────────────────────────────────────────
+
+    fn handle_type3(
+        &mut self,
+        mem: &dyn MemoryAccess,
+        opcode: u8,
+        count: u32,
+        predicated: bool,
+        total_dwords: u32,
+    ) -> Type3Result {
+        metrics::counter!("gpu.packet", "opcode" => pm4::type3_opcode_name(opcode)).increment(1);
+        tracing::trace!(
+            opcode = format_args!("{opcode:#x}"),
+            name = pm4::type3_opcode_name(opcode),
+            count,
+            predicated,
+            "gpu: Type3"
+        );
+        // If predicated and the bin mask/select combo evaluates to "skip",
+        // consume the whole packet (including data dwords) and move on. We
+        // don't emulate binning so bin_mask & bin_select is always 0 → we
+        // keep predicated packets in simplest form: execute them anyway. Most
+        // games don't use binning on Xenos. Observed in canary:
+        // `pm4_command_processor_implement.h:440-460`.
+        let _ = predicated;
+
+        match opcode {
+            pm4::PM4_NOP
+            | pm4::PM4_WAIT_FOR_IDLE
+            | pm4::PM4_CONTEXT_UPDATE
+            | pm4::PM4_INVALIDATE_STATE
+            | pm4::PM4_ME_INIT
+            | pm4::PM4_VIZ_QUERY
+            | pm4::PM4_SET_SHADER_BASES => {
+                // Classify-and-skip. State side effects (if any) are deferred.
+            }
+            pm4::PM4_INDIRECT_BUFFER | pm4::PM4_INDIRECT_BUFFER_PFD => {
+                self.stats.indirect_buffer_jumps += 1;
+                let ib_ptr = self.read_payload(mem, 1);
+                let ib_size = self.read_payload(mem, 2);
+                // Advance past the IB header + payload before recursing so
+                // the return location is correct.
+                self.ring.advance_read(total_dwords);
+                self.writeback_read_ptr(mem);
+                // Push current ring, switch to IB view.
+                let caller = self.ring;
+                self.ib_stack.push(caller);
+                self.ring = RingBufferView {
+                    base: ib_ptr & !3,
+                    size_dwords: ib_size,
+                    read_offset_dwords: 0,
+                    write_offset_dwords: ib_size, // IB is fully-written at jump time
+                    rptr_writeback_addr: 0,
+                    rptr_writeback_block_dwords: 0,
+                };
+                tracing::debug!(
+                    ib_ptr = format_args!("{ib_ptr:#010x}"),
+                    ib_size,
+                    "gpu: jump to indirect buffer"
+                );
+                return Type3Result::Consumed(0); // we already advanced
+            }
+            pm4::PM4_WAIT_REG_MEM => {
+                // Canary layout (pm4_command_processor_implement.h:699-755):
+                //   payload[0] = wait_info (bit 4 = is_memory, bits 2:0 = cmp)
+                //   payload[1] = poll address (register idx OR memory addr; bottom 2 bits = endian for memory)
+                //   payload[2] = ref value
+                //   payload[3] = mask
+                //   payload[4] = wait (sleep hint, ignored)
+                let wait_info = self.read_payload(mem, 1);
+                let poll_addr_raw = self.read_payload(mem, 2);
+                let reference = self.read_payload(mem, 3);
+                let mask = self.read_payload(mem, 4);
+                let is_memory = (wait_info & 0x10) != 0;
+                let cmp = WaitCmp::from_wait_info(wait_info);
+                let poll_addr = if is_memory {
+                    poll_addr_raw & !3
+                } else {
+                    poll_addr_raw
+                };
+                let block = GpuBlock::WaitRegMem {
+                    poll_addr,
+                    is_memory,
+                    reference,
+                    mask,
+                    cmp,
+                };
+                if block.is_satisfied(mem, &self.register_file) {
+                    // Condition already true; proceed past this packet.
+                    tracing::trace!(?block, "gpu: WAIT_REG_MEM immediately satisfied");
+                } else {
+                    self.stats.wait_reg_mem_blocks += 1;
+                    tracing::debug!(?block, "gpu: WAIT_REG_MEM parking");
+                    self.pending_block = Some(block);
+                    return Type3Result::Blocked { rewind_to_header: true };
+                }
+            }
+            pm4::PM4_REG_RMW => {
+                // payload[0] = rmw_info (bit 31 = and_from_reg, bit 30 = or_from_reg)
+                // payload[1] = and mask (or register index)
+                // payload[2] = or mask (or register index)
+                let rmw_info = self.read_payload(mem, 1);
+                let and_or_reg = (rmw_info & 0x8000_0000) != 0;
+                let or_from_reg = (rmw_info & 0x4000_0000) != 0;
+                let reg_index = rmw_info & 0x1FFF;
+                let p2 = self.read_payload(mem, 2);
+                let p3 = self.read_payload(mem, 3);
+                let and_mask = if and_or_reg {
+                    self.register_file.read(p2 & 0x1FFF)
+                } else {
+                    p2
+                };
+                let or_mask = if or_from_reg {
+                    self.register_file.read(p3 & 0x1FFF)
+                } else {
+                    p3
+                };
+                let cur = self.register_file.read(reg_index);
+                let new_value = (cur & and_mask) | or_mask;
+                self.register_file.write(reg_index, new_value);
+                tracing::trace!(
+                    reg = format_args!("{reg_index:#x}"),
+                    cur = format_args!("{cur:#x}"),
+                    new = format_args!("{new_value:#x}"),
+                    "gpu: REG_RMW"
+                );
+            }
+            pm4::PM4_REG_TO_MEM => {
+                // payload[0] = reg_index, payload[1] = mem addr
+                let reg_index = self.read_payload(mem, 1) & 0x1FFF;
+                let dst = self.read_payload(mem, 2) & !3;
+                let value = self.register_file.read(reg_index);
+                mem.write_u32(dst, value);
+                tracing::trace!(
+                    reg = format_args!("{reg_index:#x}"),
+                    dst = format_args!("{dst:#010x}"),
+                    value = format_args!("{value:#x}"),
+                    "gpu: REG_TO_MEM"
+                );
+            }
+            pm4::PM4_MEM_WRITE => {
+                // payload[0] = dst, payload[1..=count-1] = values
+                let mut dst = self.read_payload(mem, 1) & !3;
+                for i in 2..=count {
+                    let val = self.read_payload(mem, i);
+                    mem.write_u32(dst, val);
+                    dst = dst.wrapping_add(4);
+                }
+            }
+            pm4::PM4_COND_WRITE => {
+                // payload[0] = wait_info, [1] = poll addr, [2] = ref, [3] = mask,
+                // [4] = write addr/reg, [5] = write data
+                let wait_info = self.read_payload(mem, 1);
+                let poll_raw = self.read_payload(mem, 2);
+                let reference = self.read_payload(mem, 3);
+                let mask = self.read_payload(mem, 4);
+                let is_memory = (wait_info & 0x10) != 0;
+                let cmp = WaitCmp::from_wait_info(wait_info);
+                let poll_addr = if is_memory { poll_raw & !3 } else { poll_raw };
+                let cur_raw = if is_memory {
+                    mem.read_u32(poll_addr)
+                } else {
+                    self.register_file.read(poll_addr)
+                };
+                if cmp.evaluate(cur_raw & mask, reference) {
+                    let write_addr = self.read_payload(mem, 5);
+                    let write_data = self.read_payload(mem, 6);
+                    if (wait_info & 0x100) != 0 {
+                        mem.write_u32(write_addr & !3, write_data);
+                    } else {
+                        self.register_file
+                            .write(write_addr & 0x1FFF, write_data);
+                    }
+                }
+            }
+            pm4::PM4_EVENT_WRITE => {
+                // payload[0] = initiator (written to VGT_EVENT_INITIATOR).
+                let initiator = self.read_payload(mem, 1);
+                self.register_file
+                    .write(reg::VGT_EVENT_INITIATOR, initiator & 0x3F);
+                self.handle_event_initiator(initiator & 0x3F, mem);
+                tracing::trace!(initiator = format_args!("{:#x}", initiator & 0x3F), "gpu: EVENT_WRITE");
+            }
+            pm4::PM4_EVENT_WRITE_SHD => {
+                // payload[0] = initiator (bit 31: write counter, else write `value`)
+                // payload[1] = address, payload[2] = value
+                let initiator = self.read_payload(mem, 1);
+                let address = self.read_payload(mem, 2);
+                let value = self.read_payload(mem, 3);
+                self.register_file
+                    .write(reg::VGT_EVENT_INITIATOR, initiator & 0x3F);
+                self.handle_event_initiator(initiator & 0x3F, mem);
+                let data = if (initiator & 0x8000_0000) != 0 {
+                    self.swap_counter as u32
+                } else {
+                    value
+                };
+                // M1.8: fenced write. The CPU thread busy-polls this
+                // address as a GPU completion fence. The Release fence
+                // emitted here pairs with `read_u32_fence`'s Acquire on
+                // the polling side: any earlier writes the worker
+                // performed (RPTR writeback, resolve target writes,
+                // etc.) are visible to the CPU once it sees the new
+                // fence value.
+                mem.write_u32_fence(address & !3, data);
+                tracing::trace!(
+                    addr = format_args!("{:#010x}", address & !3),
+                    value = format_args!("{data:#x}"),
+                    "gpu: EVENT_WRITE_SHD"
+                );
+            }
+            pm4::PM4_EVENT_WRITE_EXT => {
+                // payload[0] = initiator, [1] = address. Writes 6 u16 extents
+                // (min/max x/y/z) — we're not tracking scissors yet, so write zeros.
+                let initiator = self.read_payload(mem, 1);
+                let address = self.read_payload(mem, 2) & !3;
+                self.register_file
+                    .write(reg::VGT_EVENT_INITIATOR, initiator & 0x3F);
+                self.handle_event_initiator(initiator & 0x3F, mem);
+                for i in 0..6u32 {
+                    mem.write_u16(address + i * 2, 0);
+                }
+            }
+            pm4::PM4_EVENT_WRITE_ZPD => {
+                // Occlusion query writeback — always write zeros (no query).
+                let initiator = self.read_payload(mem, 1);
+                self.register_file
+                    .write(reg::VGT_EVENT_INITIATOR, initiator & 0x3F);
+                self.handle_event_initiator(initiator & 0x3F, mem);
+            }
+            pm4::PM4_DRAW_INDX | pm4::PM4_DRAW_INDX_2 => {
+                self.stats.draws_seen += 1;
+                // Canary (`pm4_command_processor_implement.h:1128-1151`):
+                //   DRAW_INDX: payload[0] = viz_query, payload[1] = vgt_draw_initiator,
+                //              [2] = dma_base (if source=DMA), [3] = dma_size
+                //   DRAW_INDX_2: payload[0] = vgt_draw_initiator (indices follow inline).
+                let (vgt, dma_base, dma_size) = if opcode == pm4::PM4_DRAW_INDX {
+                    let _viz = self.read_payload(mem, 1);
+                    let vgt = self.read_payload(mem, 2);
+                    let (db, ds) = if count >= 4 {
+                        (Some(self.read_payload(mem, 3)), Some(self.read_payload(mem, 4)))
+                    } else {
+                        (None, None)
+                    };
+                    (vgt, db, ds)
+                } else {
+                    (self.read_payload(mem, 1), None, None)
+                };
+                let mut ds = draw_state::extract(&self.register_file, vgt, dma_base, dma_size);
+                ds.vs_blob_key = self.active_vs_key;
+                ds.ps_blob_key = self.active_ps_key;
+                let processed = primitive::process(ds.primitive, ds.vertex_count, None);
+                metrics::counter!(
+                    "gpu.draw",
+                    "prim" => format!("{:?}", ds.primitive),
+                )
+                .increment(1);
+                if processed.rejected {
+                    metrics::counter!("gpu.draw.rejected").increment(1);
+                }
+                // P4: update the render-target cache with every bound RT
+                // from this draw. Each bind either inserts a new key or
+                // refreshes an existing descriptor's bind_count. `msaa` is
+                // still hardcoded to 1× because we don't yet decode
+                // `PA_SC_AA_CONFIG`; P4b can add that.
+                let msaa = crate::render_target_cache::MsaaSamples::X1;
+                let mut viewport_height = ds.viewport.scale_y.abs() * 2.0;
+                if viewport_height <= 0.0 {
+                    viewport_height = 720.0;
+                }
+                // 16 samples per tile row (64-sample 8×8 macroblocks pack
+                // 16 vertical samples per EDRAM tile).
+                let rows_of_tiles = (viewport_height as u32).div_ceil(16);
+                for (i, ci_opt) in ds.color_info.iter().enumerate() {
+                    if let Some(ci) = ci_opt {
+                        let pitch32 = ds.scissor.br_x.div_ceil(32);
+                        let key = crate::render_target_cache::RenderTargetKey {
+                            base_tiles: ci.base_tiles,
+                            pitch_tiles_at_32bpp: pitch32,
+                            msaa_samples: msaa,
+                            is_depth: false,
+                            resource_format: ci.format & 0xF,
+                        };
+                        let rt_idx = self.rt_cache.bind(key, self.stats.draws_seen as u32);
+                        self.rt_cache.claim_tiles(rt_idx, rows_of_tiles as u16);
+                        metrics::counter!(
+                            "gpu.rt.bind",
+                            "slot" => format!("{i}"),
+                            "fmt" => format!("{}", ci.format & 0xF),
+                        )
+                        .increment(1);
+                    }
+                }
+                if let Some(depth) = ds.depth_info {
+                    let pitch32 = ds.scissor.br_x.div_ceil(32);
+                    let key = crate::render_target_cache::RenderTargetKey {
+                        base_tiles: depth.base_tiles,
+                        pitch_tiles_at_32bpp: pitch32,
+                        msaa_samples: msaa,
+                        is_depth: true,
+                        resource_format: depth.format & 0xF,
+                    };
+                    let rt_idx = self.rt_cache.bind(key, self.stats.draws_seen as u32);
+                    self.rt_cache.claim_tiles(rt_idx, rows_of_tiles as u16);
+                }
+                self.stats.unique_render_targets = self.rt_cache.len() as u64;
+                tracing::debug!(
+                    opcode = format_args!("{opcode:#x}"),
+                    prim = ?ds.primitive,
+                    verts = ds.vertex_count,
+                    ?processed.topology,
+                    rewritten = processed.rewritten_indices.is_some(),
+                    "gpu: DRAW_INDX captured"
+                );
+                self.last_draw = Some(ds);
+                self.last_primitive = Some(processed);
+            }
+            pm4::PM4_SET_CONSTANT | pm4::PM4_SET_SHADER_CONSTANTS => {
+                // payload[0] = offset_type — bits[10:0] index, bits[23:16] type
+                let offset_type = self.read_payload(mem, 1);
+                let index = offset_type & 0x7FF;
+                let const_type = (offset_type >> 16) & 0xFF;
+                let base = match const_type {
+                    0 => CONST_BASE_ALU,
+                    1 => CONST_BASE_FETCH,
+                    2 => CONST_BASE_BOOL,
+                    3 => CONST_BASE_LOOP,
+                    4 => CONST_BASE_REGISTERS,
+                    _ => CONST_BASE_ALU, // defensive default
+                };
+                for i in 0..(count - 1) {
+                    let v = self.read_payload(mem, 2 + i);
+                    self.register_file.write(base + index + i, v);
+                }
+            }
+            pm4::PM4_SET_CONSTANT2 => {
+                // payload[0] = 16-bit index; subsequent payloads write consecutive regs.
+                let index = self.read_payload(mem, 1) & 0xFFFF;
+                for i in 0..(count - 1) {
+                    let v = self.read_payload(mem, 2 + i);
+                    self.register_file.write(index + i, v);
+                }
+            }
+            pm4::PM4_LOAD_ALU_CONSTANT => {
+                // payload[0] = source mem addr, [1] = offset_type, [2] = size_dwords
+                let src = self.read_payload(mem, 1) & !3;
+                let offset_type = self.read_payload(mem, 2);
+                let size_dwords = self.read_payload(mem, 3);
+                let index = offset_type & 0x7FF;
+                let const_type = (offset_type >> 16) & 0xFF;
+                let base = match const_type {
+                    0 => CONST_BASE_ALU,
+                    1 => CONST_BASE_FETCH,
+                    2 => CONST_BASE_BOOL,
+                    3 => CONST_BASE_LOOP,
+                    4 => CONST_BASE_REGISTERS,
+                    _ => CONST_BASE_ALU,
+                };
+                for i in 0..size_dwords {
+                    let v = mem.read_u32(src + i * 4);
+                    self.register_file.write(base + index + i, v);
+                }
+            }
+            pm4::PM4_IM_LOAD | pm4::PM4_IM_LOAD_IMMEDIATE => {
+                // Canary (pm4_command_processor_implement.h:1271-1330):
+                //   IM_LOAD payload: [0] addr_type, [1] start_size
+                //   IM_LOAD_IMMEDIATE payload: [0] shader_type, [1] start_size, [2..count] = microcode
+                let shader_type = self.read_payload(mem, 1) as u8 & 0x3;
+                let start_size = self.read_payload(mem, 2);
+                let size_dwords = start_size & 0xFFFF;
+                let blob = if opcode == pm4::PM4_IM_LOAD_IMMEDIATE {
+                    let mut v = Vec::with_capacity(size_dwords as usize);
+                    for i in 0..size_dwords {
+                        v.push(self.read_payload(mem, 3 + i));
+                    }
+                    v
+                } else {
+                    let addr = self.read_payload(mem, 1) & !3;
+                    let mut v = Vec::with_capacity(size_dwords as usize);
+                    for i in 0..size_dwords {
+                        v.push(mem.read_u32(addr + i * 4));
+                    }
+                    v
+                };
+                // For IM_LOAD the payload already carries an address that
+                // uniquely identifies this shader in guest memory, so the
+                // full `addr_type` dword (address | stage bits) makes a
+                // good cache key. For IM_LOAD_IMMEDIATE payload[0] is just
+                // the 2-bit shader_type — without a content-derived key
+                // every immediate upload would collide on 0 or 1 and
+                // thrash a single slot. Fold the microcode through a
+                // stable FNV-1a hash so per-content dedup still works.
+                let key = if opcode == pm4::PM4_IM_LOAD_IMMEDIATE {
+                    fnv1a_u32_dwords(shader_type as u32, &blob)
+                } else {
+                    self.read_payload(mem, 1)
+                };
+                self.insert_shader_blob(
+                    key,
+                    ShaderBlob {
+                        shader_type,
+                        dwords: blob,
+                    },
+                );
+                // P3b M1: record which blob is now "active" for the
+                // current stage. The uber-shader dispatch (xenia-ui) reads
+                // `active_vs_key`/`active_ps_key` at draw time to upload
+                // the right microcode. `shader_type`: 0 = vertex, 1 = pixel
+                // (per Xenos `ShaderType`).
+                match shader_type {
+                    0 => self.active_vs_key = Some(key),
+                    1 => self.active_ps_key = Some(key),
+                    _ => {}
+                }
+                metrics::counter!(
+                    "gpu.shader.blob_seen",
+                    "stage" => if shader_type == 0 { "vs" } else { "ps" },
+                )
+                .increment(1);
+                tracing::debug!(
+                    shader_type,
+                    size_dwords,
+                    key = format_args!("{key:#x}"),
+                    "gpu: IM_LOAD (shader blob cached)"
+                );
+            }
+            pm4::PM4_SET_BIN_MASK_LO => {
+                self.bin_mask = (self.bin_mask & 0xFFFF_FFFF_0000_0000)
+                    | (self.read_payload(mem, 1) as u64);
+            }
+            pm4::PM4_SET_BIN_MASK_HI => {
+                self.bin_mask = (self.bin_mask & 0x0000_0000_FFFF_FFFF)
+                    | ((self.read_payload(mem, 1) as u64) << 32);
+            }
+            pm4::PM4_SET_BIN_MASK => {
+                let lo = self.read_payload(mem, 1) as u64;
+                let hi = self.read_payload(mem, 2) as u64;
+                self.bin_mask = (hi << 32) | lo;
+            }
+            pm4::PM4_SET_BIN_SELECT_LO => {
+                self.bin_select = (self.bin_select & 0xFFFF_FFFF_0000_0000)
+                    | (self.read_payload(mem, 1) as u64);
+            }
+            pm4::PM4_SET_BIN_SELECT_HI => {
+                self.bin_select = (self.bin_select & 0x0000_0000_FFFF_FFFF)
+                    | ((self.read_payload(mem, 1) as u64) << 32);
+            }
+            pm4::PM4_SET_BIN_SELECT => {
+                let lo = self.read_payload(mem, 1) as u64;
+                let hi = self.read_payload(mem, 2) as u64;
+                self.bin_select = (hi << 32) | lo;
+            }
+            pm4::PM4_INTERRUPT => {
+                let cpu_mask = self.read_payload(mem, 1);
+                self.stats.interrupts_emitted += 1;
+                self.pending_interrupts.push(PendingInterrupt {
+                    source: InterruptSource::CommandProcessor,
+                    cpu_mask,
+                });
+                tracing::debug!(
+                    cpu_mask = format_args!("{cpu_mask:#x}"),
+                    "gpu: PM4_INTERRUPT queued"
+                );
+            }
+            pm4::PM4_XE_SWAP => {
+                // Payload: [0] signature, [1] frontbuffer_phys, [2] width, [3] height
+                let _signature = self.read_payload(mem, 1);
+                let frontbuffer_phys = self.read_payload(mem, 2);
+                let width = self.read_payload(mem, 3);
+                let height = self.read_payload(mem, 4);
+                self.notify_xe_swap(frontbuffer_phys, width, height);
+            }
+            _ => {
+                // Unknown opcode — log once per opcode but don't stall.
+                tracing::warn!(
+                    opcode = format_args!("{opcode:#x}"),
+                    count,
+                    "gpu: unhandled Type3 opcode"
+                );
+            }
+        }
+        Type3Result::Consumed(total_dwords)
+    }
+
+    /// Read dword at payload-relative offset `i` (where `i=0` is the header).
+    fn read_payload(&self, mem: &dyn MemoryAccess, i: u32) -> u32 {
+        let addr = self.ring.addr_at_offset(i).unwrap();
+        mem.read_u32(addr)
+    }
+
+    /// Drain up to `max_packets` (used by the kernel's VdSwap handler when we
+    /// don't yet have MMIO-triggered draining). Returns the number of
+    /// packets consumed.
+    pub fn drain(&mut self, mem: &dyn MemoryAccess, max_packets: u32) -> u32 {
+        let mut n = 0;
+        for _ in 0..max_packets {
+            match self.execute_one(mem) {
+                ExecOutcome::Stepped { .. } => n += 1,
+                ExecOutcome::Idle | ExecOutcome::Blocked => break,
+            }
+        }
+        n
+    }
+}
+
+impl Default for GpuSystem {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+/// Subset of Xenos registers we reference by name. Full table at
+/// `xenia-canary/src/xenia/gpu/registers.h`.
+pub mod reg {
+    //! All values below are Xenos *register indices* (the number you find in
+    //! canary's `register_table.inc`, i.e. the byte offset within the
+    //! aperture divided by 4). The MMIO aperture at `0x7FC8_0000` maps each
+    //! register at `APERTURE_BASE + index * 4`; the MMIO callbacks recover
+    //! the index with `(addr & 0xFFFF) / 4` before matching against these
+    //! constants.
+
+    /// `XE_GPU_REG_CP_RB_BASE` — ring buffer base address.
+    pub const CP_RB_BASE: u32 = 0x01C0;
+    /// `XE_GPU_REG_CP_RB_CNTL` — ring buffer control.
+    pub const CP_RB_CNTL: u32 = 0x01C1;
+    /// `XE_GPU_REG_CP_RB_RPTR_ADDR` — where to mirror read pointer.
+    pub const CP_RB_RPTR_ADDR: u32 = 0x01C3;
+    /// `XE_GPU_REG_CP_RB_RPTR` — read pointer (GPU → CPU).
+    pub const CP_RB_RPTR: u32 = 0x01C4;
+    /// `XE_GPU_REG_CP_RB_WPTR` — write pointer (CPU → GPU); MMIO side effect.
+    pub const CP_RB_WPTR: u32 = 0x01C5;
+    /// `XE_GPU_REG_CP_INT_STATUS` — interrupt status bits.
+    pub const CP_INT_STATUS: u32 = 0x01F3;
+    /// `XE_GPU_REG_CP_INT_ACK` — write-to-ack interrupt bits.
+    pub const CP_INT_ACK: u32 = 0x01F4;
+    /// `XE_GPU_REG_D1MODE_VBLANK_VLINE_STATUS` (Canary register_table.inc:1126).
+    /// Bit 0 = VBLANK_INT_OCCURRED.
+    pub const D1MODE_VBLANK_VLINE_STATUS: u32 = 0x1951;
+    /// `XE_GPU_REG_VGT_EVENT_INITIATOR` — set by EVENT_WRITE.
+    pub const VGT_EVENT_INITIATOR: u32 = 0x21F9;
+    /// `XE_GPU_REG_COHER_STATUS_HOST` — coherency bits
+    /// (Canary `register_table.inc:530`).
+    pub const COHER_STATUS_HOST: u32 = 0x0A31;
+}
+
+/// 32-bit FNV-1a over a u32 seed + a slice of u32s. Used to derive a
+/// stable, collision-resistant cache key for `IM_LOAD_IMMEDIATE` shader
+/// blobs (where the guest supplies no natural address to key on).
+fn fnv1a_u32_dwords(seed: u32, dwords: &[u32]) -> u32 {
+    const FNV_OFFSET: u32 = 0x811C_9DC5;
+    const FNV_PRIME: u32 = 0x0100_0193;
+    let mut hash = FNV_OFFSET;
+    for byte in seed.to_le_bytes() {
+        hash ^= byte as u32;
+        hash = hash.wrapping_mul(FNV_PRIME);
+    }
+    for dw in dwords {
+        for byte in dw.to_le_bytes() {
+            hash ^= byte as u32;
+            hash = hash.wrapping_mul(FNV_PRIME);
+        }
+    }
+    hash
+}
+
+/// Internal Type-3 handler result. Distinguishes "consumed a packet (by N
+/// dwords)" from "blocked; don't advance read ptr".
+enum Type3Result {
+    Consumed(u32),
+    Blocked { rewind_to_header: bool },
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use xenia_memory::GuestMemory;
+    use xenia_memory::page_table::MemoryProtect;
+
+    fn build_mem() -> GuestMemory {
+        let mut mem = GuestMemory::new().unwrap();
+        let rw = MemoryProtect::READ | MemoryProtect::WRITE;
+        mem.alloc(0x4000_0000, 0x4000, rw).unwrap();
+        mem
+    }
+
+    #[test]
+    fn ready_when_ring_has_pending() {
+        let mut gpu = GpuSystem::new();
+        let mem = build_mem();
+        assert!(!gpu.is_ready(&mem));
+        gpu.initialize_ring_buffer(0x4000_0000, 10); // 1024 bytes = 256 dwords
+        assert!(!gpu.is_ready(&mem));
+        gpu.extend_write_ptr(4);
+        assert!(gpu.is_ready(&mem));
+    }
+
+    #[test]
+    fn type2_nop_advances_read_pointer() {
+        let mut gpu = GpuSystem::new();
+        let mut mem = build_mem();
+        // 256 dwords ring at 0x40000000
+        gpu.initialize_ring_buffer(0x4000_0000, 10);
+        // Push 3 Type-2 NOPs
+        for i in 0..3u32 {
+            mem.write_u32(0x4000_0000 + i * 4, 0x8000_0000);
+        }
+        gpu.extend_write_ptr(3);
+        for _ in 0..3 {
+            match gpu.execute_one(&mut mem) {
+                ExecOutcome::Stepped { dwords_consumed } => assert_eq!(dwords_consumed, 1),
+                other => panic!("unexpected {:?}", other),
+            }
+        }
+        assert_eq!(gpu.ring.read_offset_dwords, 3);
+        assert_eq!(gpu.stats.packets_executed, 3);
+    }
+
+    #[test]
+    fn type0_reg_run_writes_register_file() {
+        let mut gpu = GpuSystem::new();
+        let mut mem = build_mem();
+        gpu.initialize_ring_buffer(0x4000_0000, 10);
+        // Type0 header: 2 dwords, base_index=0x100 → write_one=0 → count field = 1 (count-1)
+        let hdr = (1u32 << 16) | 0x100;
+        mem.write_u32(0x4000_0000, hdr);
+        mem.write_u32(0x4000_0004, 0xDEAD_BEEF);
+        mem.write_u32(0x4000_0008, 0xCAFE_BABE);
+        gpu.extend_write_ptr(3);
+        assert!(matches!(gpu.execute_one(&mut mem), ExecOutcome::Stepped { .. }));
+        assert_eq!(gpu.register_file.read(0x100), 0xDEAD_BEEF);
+        assert_eq!(gpu.register_file.read(0x101), 0xCAFE_BABE);
+    }
+
+    #[test]
+    fn wait_reg_mem_blocks_then_unblocks_when_mem_changes() {
+        let mut gpu = GpuSystem::new();
+        let mut mem = build_mem();
+        gpu.initialize_ring_buffer(0x4000_0000, 10);
+        // WAIT_REG_MEM: wait until *0x40001000 == 0x42
+        // header
+        let hdr = (3u32 << 30) | ((5u32 - 1) << 16) | ((pm4::PM4_WAIT_REG_MEM as u32) << 8);
+        mem.write_u32(0x4000_0000, hdr);
+        // wait_info: is_memory=1 (bit 4), cmp=equal (bits 2:0 = 2)
+        mem.write_u32(0x4000_0004, 0x12);
+        mem.write_u32(0x4000_0008, 0x4000_1000);
+        mem.write_u32(0x4000_000C, 0x42);
+        mem.write_u32(0x4000_0010, 0xFFFF_FFFF);
+        mem.write_u32(0x4000_0014, 0);
+        gpu.extend_write_ptr(6);
+        // First exec: poll addr reads 0 → blocked.
+        assert_eq!(gpu.execute_one(&mut mem), ExecOutcome::Blocked);
+        assert_eq!(gpu.ring.read_offset_dwords, 0, "read ptr must not advance while blocked");
+        // Make the wait satisfied.
+        mem.write_u32(0x4000_1000, 0x42);
+        match gpu.execute_one(&mut mem) {
+            ExecOutcome::Stepped { dwords_consumed } => {
+                // The WAIT_REG_MEM packet is 1 (header) + 5 (count) = 6 dwords.
+                assert_eq!(dwords_consumed, 6);
+            }
+            other => panic!("expected Stepped after wait satisfied, got {:?}", other),
+        }
+        assert_eq!(gpu.ring.read_offset_dwords, 6);
+    }
+
+    #[test]
+    fn mem_write_writes_all_payload_dwords() {
+        let mut gpu = GpuSystem::new();
+        let mut mem = build_mem();
+        gpu.initialize_ring_buffer(0x4000_0000, 10);
+        // MEM_WRITE: count=3 → 1 header + 1 dst + 2 data
+        let hdr = (3u32 << 30) | ((3u32 - 1) << 16) | ((pm4::PM4_MEM_WRITE as u32) << 8);
+        mem.write_u32(0x4000_0000, hdr);
+        mem.write_u32(0x4000_0004, 0x4000_1000); // dst
+        mem.write_u32(0x4000_0008, 0x1111_1111);
+        mem.write_u32(0x4000_000C, 0x2222_2222);
+        gpu.extend_write_ptr(4);
+        assert!(matches!(gpu.execute_one(&mut mem), ExecOutcome::Stepped { .. }));
+        assert_eq!(mem.read_u32(0x4000_1000), 0x1111_1111);
+        assert_eq!(mem.read_u32(0x4000_1004), 0x2222_2222);
+    }
+
+    #[test]
+    fn mmio_write_to_cp_rb_wptr_reflects_into_ring() {
+        use std::sync::atomic::Ordering;
+        let mut gpu = GpuSystem::new();
+        let mem = build_mem();
+        gpu.initialize_ring_buffer(0x4000_0000, 10);
+        // Guest writes wptr=8 via MMIO.
+        gpu.mmio.cp_rb_wptr.store(8, Ordering::Relaxed);
+        // Before sync, ring has no pending work.
+        assert!(!gpu.is_ready(&mem));
+        gpu.sync_with_mmio();
+        assert_eq!(gpu.ring.write_offset_dwords, 8);
+        assert!(gpu.is_ready(&mem));
+        // After sync, rptr is mirrored back to mmio for the guest to read.
+        assert_eq!(gpu.mmio.cp_rb_rptr.load(Ordering::Relaxed), 0);
+    }
+
+    /// End-to-end: feed two DRAW_INDX_2 packets through `execute_one` and
+    /// verify the GPU system reports the expected `draws_seen` / `last_draw`
+    /// state that the UI's Xenos pipeline consumes. Acts as the "draw
+    /// dispatch integration" check mentioned in the P3 verification plan.
+    #[test]
+    fn successive_draws_accumulate_in_stats() {
+        let mut gpu = GpuSystem::new();
+        let mut mem = build_mem();
+        gpu.initialize_ring_buffer(0x4000_0000, 10);
+        let mk_draw = |addr: u32, vgt: u32, mem: &xenia_memory::GuestMemory| {
+            let hdr = (3u32 << 30) | ((1u32 - 1) << 16) | ((pm4::PM4_DRAW_INDX_2 as u32) << 8);
+            mem.write_u32(addr, hdr);
+            mem.write_u32(addr + 4, vgt);
+        };
+        // Draw #1: TriangleList, 6 verts.
+        mk_draw(0x4000_0000, (6u32 << 16) | (2 << 6) | 4, &mut mem);
+        // Draw #2: TriangleStrip, 4 verts.
+        mk_draw(0x4000_0008, (4u32 << 16) | (2 << 6) | 6, &mut mem);
+        gpu.extend_write_ptr(4);
+        assert!(matches!(gpu.execute_one(&mut mem), ExecOutcome::Stepped { .. }));
+        assert!(matches!(gpu.execute_one(&mut mem), ExecOutcome::Stepped { .. }));
+        assert_eq!(gpu.stats.draws_seen, 2);
+        let ds = gpu.last_draw.expect("last_draw set");
+        assert_eq!(ds.primitive, crate::draw_state::PrimitiveType::TriangleStrip);
+        assert_eq!(ds.vertex_count, 4);
+    }
+
+    #[test]
+    fn draw_indx_2_captures_last_draw() {
+        let mut gpu = GpuSystem::new();
+        let mut mem = build_mem();
+        gpu.initialize_ring_buffer(0x4000_0000, 10);
+        // DRAW_INDX_2 packet with 1 payload dword = vgt_draw_initiator:
+        //   prim=4 (TriangleList), source=2 (auto), count=3 verts.
+        let vgt = (3u32 << 16) | (2 << 6) | 4;
+        let hdr = (3u32 << 30) | ((1u32 - 0) << 16) | ((pm4::PM4_DRAW_INDX_2 as u32) << 8);
+        // count field in header = (total_dwords_after - 1); we have 1 payload = count=1 → encoded 0.
+        let hdr = (hdr & !0x3FFF_0000) | ((1u32 - 1) << 16);
+        mem.write_u32(0x4000_0000, hdr);
+        mem.write_u32(0x4000_0004, vgt);
+        gpu.extend_write_ptr(2);
+        assert!(matches!(
+            gpu.execute_one(&mut mem),
+            ExecOutcome::Stepped { .. }
+        ));
+        assert_eq!(gpu.stats.draws_seen, 1);
+        let ds = gpu.last_draw.expect("last_draw set");
+        assert_eq!(
+            ds.primitive,
+            crate::draw_state::PrimitiveType::TriangleList
+        );
+        assert_eq!(ds.vertex_count, 3);
+        let p = gpu.last_primitive.as_ref().expect("last_primitive set");
+        assert_eq!(p.topology, crate::primitive::HostTopology::TriangleList);
+        assert!(!p.rejected);
+    }
+
+    /// P3b M1: IM_LOAD_IMMEDIATE must set `active_vs_key` / `active_ps_key`
+    /// based on `shader_type`, and a subsequent DRAW_INDX must carry those
+    /// P8: shader-blob FIFO evicts the oldest non-active blob when the
+    /// cache crosses `SHADER_BLOB_CAP`. Active keys are protected.
+    #[test]
+    fn shader_blob_cap_evicts_oldest() {
+        let mut gpu = GpuSystem::new();
+        gpu.active_vs_key = Some(u32::MAX);
+        // Insert unique keys (starting at 1_000 to avoid colliding with
+        // the active-key sentinel) up to `CAP + 10`; every insert fires
+        // the eviction path once len > CAP.
+        gpu.insert_shader_blob(
+            u32::MAX,
+            ShaderBlob {
+                shader_type: 0,
+                dwords: vec![0xAA; 4],
+            },
+        );
+        let first_key = 1_000u32;
+        for k in first_key..(first_key + SHADER_BLOB_CAP as u32 + 10) {
+            gpu.insert_shader_blob(
+                k,
+                ShaderBlob {
+                    shader_type: 0,
+                    dwords: vec![k; 2],
+                },
+            );
+        }
+        assert!(gpu.shader_blobs.len() <= SHADER_BLOB_CAP);
+        // Active key (u32::MAX) must still be present.
+        assert!(gpu.shader_blobs.contains_key(&u32::MAX));
+        // Earliest non-active key must have been evicted (at least one of
+        // the first 10 we inserted is gone).
+        let evicted = (first_key..first_key + 10)
+            .filter(|k| !gpu.shader_blobs.contains_key(k))
+            .count();
+        assert!(
+            evicted > 0,
+            "expected at least one of the first 10 keys to be evicted, \
+             got shader_blobs.len() = {}",
+            gpu.shader_blobs.len()
+        );
+    }
+
+    /// `IM_LOAD_IMMEDIATE` uploads a vertex + pixel shader inline; draw
+    /// state must then carry whichever keys the executor minted. With the
+    /// content-hashed key scheme, vs and ps keys differ because their
+    /// microcode bytes differ — the concrete values are derived, so the
+    /// test just asserts both are non-zero and not equal.
+    #[test]
+    fn im_load_records_active_blob_and_draw_carries_it() {
+        let mut gpu = GpuSystem::new();
+        let mut mem = build_mem();
+        gpu.initialize_ring_buffer(0x4000_0000, 10);
+
+        // PM4_IM_LOAD_IMMEDIATE VS: 4 data dwords = shader_type + start_size
+        // + 2 code. Header count field = data_count - 1 = 3.
+        let hdr_vs = (3u32 << 30) | ((4u32 - 1) << 16) | ((pm4::PM4_IM_LOAD_IMMEDIATE as u32) << 8);
+        mem.write_u32(0x4000_0000, hdr_vs);
+        mem.write_u32(0x4000_0004, 0); // shader_type=0 (vertex)
+        mem.write_u32(0x4000_0008, 2); // start_size: size=2
+        mem.write_u32(0x4000_000C, 0xAAAA_AAAA);
+        mem.write_u32(0x4000_0010, 0xBBBB_BBBB);
+
+        // Second IM_LOAD_IMMEDIATE PS (shader_type=1); 5 dwords total.
+        let hdr_ps = (3u32 << 30) | ((4u32 - 1) << 16) | ((pm4::PM4_IM_LOAD_IMMEDIATE as u32) << 8);
+        mem.write_u32(0x4000_0014, hdr_ps);
+        mem.write_u32(0x4000_0018, 1); // shader_type=1 (pixel)
+        mem.write_u32(0x4000_001C, 2);
+        mem.write_u32(0x4000_0020, 0xCCCC_CCCC);
+        mem.write_u32(0x4000_0024, 0xDDDD_DDDD);
+
+        // DRAW_INDX_2: 1 data dword, count field = 0.
+        let vgt = (3u32 << 16) | (2 << 6) | 4;
+        let hdr_draw = (3u32 << 30) | (0u32 << 16) | ((pm4::PM4_DRAW_INDX_2 as u32) << 8);
+        mem.write_u32(0x4000_0028, hdr_draw);
+        mem.write_u32(0x4000_002C, vgt);
+
+        // Total dwords consumed by the 3 packets: 5 + 5 + 2 = 12.
+        gpu.extend_write_ptr(12);
+        // Drain all three packets.
+        for _ in 0..3 {
+            assert!(matches!(
+                gpu.execute_one(&mut mem),
+                ExecOutcome::Stepped { .. }
+            ));
+        }
+        let vs_key = gpu.active_vs_key.expect("vs key set by IM_LOAD_IMMEDIATE");
+        let ps_key = gpu.active_ps_key.expect("ps key set by IM_LOAD_IMMEDIATE");
+        assert_ne!(vs_key, ps_key, "VS and PS keys must be distinct");
+        let ds = gpu.last_draw.expect("DRAW_INDX_2 captured");
+        assert_eq!(ds.vs_blob_key, Some(vs_key));
+        assert_eq!(ds.ps_blob_key, Some(ps_key));
+    }
+
+    /// Regression: before the content-hash keying, two distinct vertex
+    /// shaders uploaded via `IM_LOAD_IMMEDIATE` both mapped to key `0`
+    /// (the shader_type dword) and overwrote each other in `shader_blobs`.
+    /// With FNV-1a over the microcode, different blobs get different keys
+    /// and the cache retains both.
+    #[test]
+    fn im_load_immediate_distinct_microcode_does_not_collide() {
+        let mut gpu = GpuSystem::new();
+        let mut mem = build_mem();
+        gpu.initialize_ring_buffer(0x4000_0000, 10);
+
+        let hdr = (3u32 << 30) | ((4u32 - 1) << 16) | ((pm4::PM4_IM_LOAD_IMMEDIATE as u32) << 8);
+
+        // VS shader A.
+        mem.write_u32(0x4000_0000, hdr);
+        mem.write_u32(0x4000_0004, 0); // shader_type = vertex
+        mem.write_u32(0x4000_0008, 2); // size = 2
+        mem.write_u32(0x4000_000C, 0x1111_1111);
+        mem.write_u32(0x4000_0010, 0x2222_2222);
+
+        // VS shader B — same stage, different microcode.
+        mem.write_u32(0x4000_0014, hdr);
+        mem.write_u32(0x4000_0018, 0);
+        mem.write_u32(0x4000_001C, 2);
+        mem.write_u32(0x4000_0020, 0x3333_3333);
+        mem.write_u32(0x4000_0024, 0x4444_4444);
+
+        gpu.extend_write_ptr(10);
+        for _ in 0..2 {
+            assert!(matches!(
+                gpu.execute_one(&mut mem),
+                ExecOutcome::Stepped { .. }
+            ));
+        }
+        assert_eq!(
+            gpu.shader_blobs.len(),
+            2,
+            "two distinct VS shaders must not collide on the same cache key"
+        );
+    }
+
+    /// P4: `EVENT_WRITE` with initiator `15` (TILE_FLUSH) must route
+    /// through the resolve handler — captured `last_resolve` + incremented
+    /// `stats.resolves_total` proves the dispatch works.
+    #[test]
+    fn tile_flush_event_records_resolve() {
+        let mut gpu = GpuSystem::new();
+        let mut mem = build_mem();
+        gpu.initialize_ring_buffer(0x4000_0000, 10);
+        // Pre-seed RB_COPY_DEST_BASE / DEST_PITCH / DEST_INFO so
+        // ResolveInfo captures recognisable values.
+        gpu.register_file
+            .write(draw_state::reg::RB_COPY_DEST_BASE, 0xDEAD_0000);
+        gpu.register_file.write(
+            draw_state::reg::RB_COPY_DEST_PITCH,
+            (720u32 << 16) | 1280u32,
+        );
+        // copy_dest_format=6 (k_8_8_8_8), copy_dest_endian=0.
+        gpu.register_file
+            .write(draw_state::reg::RB_COPY_DEST_INFO, 6u32 << 7);
+        gpu.register_file.write(
+            draw_state::reg::RB_COPY_CONTROL,
+            (1u32 << 20) /* copy_command=1 */ | (1u32 << 8), /* color_clear_enable */
+        );
+
+        // PM4_EVENT_WRITE: 1 data dword — the initiator.
+        let hdr = (3u32 << 30) | (0u32 << 16) | ((pm4::PM4_EVENT_WRITE as u32) << 8);
+        mem.write_u32(0x4000_0000, hdr);
+        mem.write_u32(0x4000_0004, 15); // TILE_FLUSH
+        gpu.extend_write_ptr(2);
+        assert!(matches!(
+            gpu.execute_one(&mut mem),
+            ExecOutcome::Stepped { .. }
+        ));
+        assert_eq!(gpu.stats.resolves_total, 1);
+        let info = gpu.last_resolve.expect("TILE_FLUSH captured resolve");
+        // `0xDEAD_0000 & 0x1FFF_FFFF = 0x1EAD_0000` — `dest_base` is now
+        // masked to the Xenon 29-bit physical range at decode time.
+        assert_eq!(info.dest_base, 0x1EAD_0000);
+        assert_eq!(info.dest_pitch_pixels, 1280);
+        assert_eq!(info.dest_height_pixels, 720);
+        assert_eq!(info.dest_format, 6);
+        assert_eq!(info.copy_command, 1);
+        assert!(info.color_clear_enable);
+    }
+
+    /// P4: DRAW_INDX* with a bound color target should populate
+    /// `rt_cache` so downstream stages (HUD, resolve) can look up the RT.
+    #[test]
+    fn draw_indx_populates_rt_cache() {
+        let mut gpu = GpuSystem::new();
+        let mut mem = build_mem();
+        gpu.initialize_ring_buffer(0x4000_0000, 10);
+        // color0 enabled + format=2 (k_8_8_8_8 or similar), base=0x10.
+        gpu.register_file.write(draw_state::reg::RB_MODECONTROL, 0x1);
+        gpu.register_file
+            .write(draw_state::reg::RB_COLOR_INFO_0, (2u32 << 16) | 0x10);
+        // Non-zero scissor so pitch32 calc is meaningful.
+        gpu.register_file.write(
+            draw_state::reg::PA_SC_WINDOW_SCISSOR_BR,
+            (720u32 << 16) | 1280u32,
+        );
+        let vgt = (3u32 << 16) | (2 << 6) | 4;
+        let hdr = (3u32 << 30) | (0u32 << 16) | ((pm4::PM4_DRAW_INDX_2 as u32) << 8);
+        mem.write_u32(0x4000_0000, hdr);
+        mem.write_u32(0x4000_0004, vgt);
+        gpu.extend_write_ptr(2);
+        assert!(matches!(
+            gpu.execute_one(&mut mem),
+            ExecOutcome::Stepped { .. }
+        ));
+        assert_eq!(gpu.rt_cache.len(), 1);
+        assert_eq!(gpu.stats.unique_render_targets, 1);
+    }
+
+    #[test]
+    fn xe_swap_records_notification() {
+        let mut gpu = GpuSystem::new();
+        let mut mem = build_mem();
+        gpu.initialize_ring_buffer(0x4000_0000, 10);
+        let hdr = (3u32 << 30) | ((4u32 - 1) << 16) | ((pm4::PM4_XE_SWAP as u32) << 8);
+        mem.write_u32(0x4000_0000, hdr);
+        mem.write_u32(0x4000_0004, pm4::SWAP_SIGNATURE);
+        mem.write_u32(0x4000_0008, 0xCAFE_0000);
+        mem.write_u32(0x4000_000C, 1280);
+        mem.write_u32(0x4000_0010, 720);
+        gpu.extend_write_ptr(5);
+        assert!(matches!(gpu.execute_one(&mut mem), ExecOutcome::Stepped { .. }));
+        let swap = gpu.last_swap.unwrap();
+        assert_eq!(swap.frame_index, 1);
+        assert_eq!(swap.frontbuffer_phys, 0xCAFE_0000);
+        assert_eq!(swap.width, 1280);
+        assert_eq!(swap.height, 720);
+        assert_eq!(gpu.stats.swaps_seen, 1);
+    }
+}
diff --git a/crates/xenia-gpu/src/handle.rs b/crates/xenia-gpu/src/handle.rs
new file mode 100644
index 0000000..4205f77
--- /dev/null
+++ b/crates/xenia-gpu/src/handle.rs
@@ -0,0 +1,1010 @@
+//! GPU thread skeleton — types only, no thread spawned yet.
+//!
+//! M1 step 1 of the concurrency rollout (see
+//! `/home/fabi/.claude/plans/good-plese-implement-in-zesty-hickey.md`).
+//! This module introduces the **shapes** that the eventual GPU host thread
+//! will use, without changing any runtime behavior:
+//!
+//! - [`GpuCommand`] — the control-plane RPC enum sent CPU→GPU.
+//! - [`GpuHandle`] — the CPU-side proxy: command sender + cloned MMIO
+//!   atomics + an interrupt receiver. Eventually it'll also carry the
+//!   worker thread's `JoinHandle`.
+//! - [`GpuWorker`] — the GPU-side owned state (the `GpuSystem` itself plus
+//!   the receive end of the command channel and the sender for interrupts).
+//!   It does not yet have a `run()` method; that lands in step 4.
+//!
+//! The construction is done via [`GpuSystem::into_handle`], which splits a
+//! freshly-built `GpuSystem` into `(GpuWorker, GpuHandle)`. The worker keeps
+//! the actual GPU state plus `cmd_rx`/`int_tx`; the handle carries `cmd_tx`,
+//! `int_rx`, and clones of the `Arc<AtomicU32>` MMIO mailboxes so the CPU
+//! producer side can write WPTR / read RPTR without going through the
+//! channel.
+//!
+//! Until step 4 wires the worker into a real thread, no caller invokes
+//! `into_handle` on the live `KernelState.gpu` — the constructor exists for
+//! the unit test below and for the synthetic-test path.
+
+use std::sync::Arc;
+use std::sync::atomic::{AtomicBool, Ordering};
+use std::thread::{self, JoinHandle};
+use std::time::{Duration, Instant};
+
+use crossbeam_channel::{Receiver, Sender, bounded, unbounded};
+
+use xenia_memory::GuestMemory;
+
+use crate::gpu_system::{ExecOutcome, GpuMmio, GpuStats, GpuSystem, PendingInterrupt};
+
+/// Reply channel for a [`GpuCommand::DrainFence`]. Single-shot
+/// `bounded(1)` — the GPU sends `()` once it's drained the ring up to the
+/// requested wptr, the CPU thread blocks on `recv` until then. M1 step 5
+/// is the first user of this; step 1 only validates the type fits.
+pub type DrainReply = crossbeam_channel::Sender<()>;
+
+/// Control-plane RPC the CPU thread sends to the GPU thread. Data-plane
+/// signals (WPTR/RPTR/INT_STATUS) ride atomic mailboxes instead — see
+/// [`GpuMmio`]. Channels are for events that need ordered delivery and
+/// (sometimes) a reply.
+#[derive(Debug)]
+pub enum GpuCommand {
+    /// `VdInitializeRingBuffer(base, size_log2)`. The kernel hands the GPU
+    /// the guest-physical base address and dword-size of the primary ring.
+    /// Today's [`GpuSystem::initialize_ring_buffer`] does this synchronously;
+    /// from step 4 onward the kernel sends this command instead.
+    InitializeRing {
+        base: u32,
+        size_log2: u32,
+    },
+    /// `VdEnableRingBufferRPtrWriteBack(addr, block_size)`. The kernel
+    /// supplies a guest-memory address into which the GPU should mirror its
+    /// internal `read_offset_dwords` after each packet.
+    EnableRptrWriteback {
+        addr: u32,
+        block_size_log2: u32,
+    },
+    /// Block until the GPU's read pointer has caught up with the supplied
+    /// `target_wptr` (in dwords). The reply is sent on `reply_tx` once the
+    /// drain completes (or the worker hits its internal deadline). Used by
+    /// `vd_swap` to preserve the synchronous "GPU has caught up to the
+    /// guest's wptr at swap time" semantics.
+    DrainFence {
+        target_wptr: u32,
+        reply_tx: DrainReply,
+    },
+    /// Bump the swap counter and post a swap interrupt. Sent by `vd_swap`
+    /// when the guest commits a frame; the worker's [`GpuSystem::notify_xe_swap`]
+    /// updates `swaps_seen`/`last_swap` and pushes an `InterruptSource::Swap`
+    /// onto the (M1.6) `int_tx` channel. Fire-and-forget — no reply.
+    NotifyXeSwap {
+        frontbuffer_phys: u32,
+        width: u32,
+        height: u32,
+    },
+    /// Tear-down signal. The worker drains any in-flight reply channels,
+    /// drops its `GpuSystem`, and the host thread joins.
+    Shutdown,
+}
+
+/// CPU-side proxy that the kernel and the interpreter loop hold. It owns:
+///
+/// - The send end of the GPU command channel (`cmd_tx`).
+/// - The receive end of the GPU→CPU interrupt channel (`int_rx`). Step 6
+///   migrates `kernel.gpu.pending_interrupts` onto this channel.
+/// - Cloned `Arc<AtomicU32>` MMIO mailboxes so MMIO write callbacks (which
+///   already capture these `Arc`s) can keep working unchanged after the
+///   `GpuSystem` itself moves to the worker thread.
+///
+/// `GpuHandle` is `Send + Sync` by virtue of every field being so already
+/// (`Sender`/`Receiver` from crossbeam are `Send + Sync` for `Send` payloads;
+/// `Arc<AtomicU32>` is `Send + Sync`). No explicit impl is needed.
+///
+/// Use [`Self::send_cmd`] to post commands so the parker wake invariant is
+/// preserved (see M1.7): every channel send must be paired with a wake of
+/// the worker thread, otherwise the worker can sleep through new commands
+/// even if `cmd_rx.is_empty()` was momentarily false.
+#[derive(Debug, Clone)]
+pub struct GpuHandle {
+    /// Control-plane sender. `clone()`-able so multiple call sites can post
+    /// commands; crossbeam's `Sender` is `Send + Sync + Clone`.
+    pub cmd_tx: Sender<GpuCommand>,
+    /// Interrupt drain channel. Step 6 moves `pending_interrupts` onto this.
+    /// Until then the `int_tx` half on the worker side is unused.
+    pub int_rx: Receiver<PendingInterrupt>,
+    /// Direct access to the MMIO mailbox arcs. The CP_RB_WPTR write that
+    /// the guest does inside the MMIO region callback already lands in
+    /// `mmio.cp_rb_wptr` (an `Arc<AtomicU32>`); cloning these here lets
+    /// CPU-side code do the same atomic read/write that the inline path
+    /// did, without channel hops.
+    pub mmio: GpuMmio,
+    /// Read-side snapshot of GPU stats / cache sizes. Refreshed by the
+    /// worker each outer loop iteration. Read by [`GpuBackend::digest_snapshot`]
+    /// and the HUD; cheap copy-out under a brief lock acquisition.
+    pub digest: Arc<std::sync::Mutex<GpuDigestSnapshot>>,
+    /// Shared shutdown flag — set by the CPU side during teardown, read
+    /// by the worker each loop iteration. Cloned here so callers without
+    /// access to the worker side (e.g. drop guards) can still signal exit.
+    pub shutdown: Arc<AtomicBool>,
+}
+
+impl GpuHandle {
+    /// Post a command to the worker and wake it. Wraps the raw
+    /// `cmd_tx.send` with the M1.7 parker discipline: set
+    /// `wake_pending=true` (Release) and `unpark()` the worker thread.
+    /// Without the wake, channel sends would not surface to a parked
+    /// worker — `crossbeam_channel::Sender::send` doesn't unpark by
+    /// itself.
+    pub fn send_cmd(
+        &self,
+        cmd: GpuCommand,
+    ) -> Result<(), crossbeam_channel::SendError<GpuCommand>> {
+        let r = self.cmd_tx.send(cmd);
+        if r.is_ok() {
+            self.mmio.wake_pending.store(true, Ordering::Release);
+            if let Ok(g) = self.mmio.worker_thread.lock() {
+                if let Some(t) = g.as_ref() {
+                    t.unpark();
+                }
+            }
+        }
+        r
+    }
+}
+
+/// Periodically-refreshed read-side snapshot of GPU state. Updated by the
+/// worker thread on each outer loop iteration; consumed by the CPU side
+/// for the run-digest at end-of-run, the HUD, etc. Held in an
+/// `Arc<Mutex<GpuDigestSnapshot>>` shared between worker and handle.
+///
+/// Snapshot is intentionally restricted to `u64` counters — they're
+/// cheap to copy and survive past the worker's lifetime (so the digest
+/// can be computed even after the worker has shut down). For the
+/// inline backend [`GpuBackend::digest_snapshot`] computes the same view
+/// directly from the live `GpuSystem` without any locking.
+#[derive(Debug, Clone, Default)]
+pub struct GpuDigestSnapshot {
+    pub stats: GpuStats,
+    pub shader_blobs_live: u64,
+    pub texture_cache_entries: u64,
+    pub texture_decodes: u64,
+}
+
+/// Reverse end of the command and interrupt channels. The GPU thread
+/// (`GpuWorker::run`) reads from `cmd_rx`, pushes onto `int_tx` (M1.6+),
+/// owns the actual `GpuSystem`, and refreshes the shared
+/// `GpuDigestSnapshot`. Built by [`GpuSystem::into_handle`] alongside its
+/// matching [`GpuHandle`].
+///
+/// (No `#[derive(Debug)]` because `GpuSystem` itself isn't `Debug`; we
+/// don't need it on the worker for any production purpose.)
+pub struct GpuWorker {
+    /// The GPU subsystem itself. The worker thread is its exclusive
+    /// owner once spawned.
+    pub system: GpuSystem,
+    /// Receive end of the control channel.
+    pub cmd_rx: Receiver<GpuCommand>,
+    /// Send end of the interrupt channel. M1.6 wires this in.
+    pub int_tx: Sender<PendingInterrupt>,
+    /// Shared digest snapshot, refreshed each outer loop iteration.
+    pub digest: Arc<std::sync::Mutex<GpuDigestSnapshot>>,
+    /// Shutdown flag. Set by `shutdown_and_join_with_timeout`; the worker
+    /// loop checks `Acquire` each iteration.
+    pub shutdown: Arc<AtomicBool>,
+}
+
+impl GpuSystem {
+    /// Split a freshly-built `GpuSystem` into a `(GpuWorker, GpuHandle)`
+    /// pair. The handle keeps cloned `Arc<AtomicU32>` MMIO mailboxes plus
+    /// the channel sender; the worker keeps the system itself plus the
+    /// channel receiver and the interrupt sender.
+    ///
+    /// Channels are unbounded (`crossbeam_channel::unbounded`) because the
+    /// CPU side never blocks on a control-plane send — guest-driven export
+    /// rates are bounded by the interpreter throughput, and interrupts are
+    /// already coalesced upstream by the kernel.
+    ///
+    /// Caller supplies a shared `shutdown: Arc<AtomicBool>` so the worker
+    /// and the CPU side can coordinate teardown. For unit tests that don't
+    /// care about lifecycle, [`Self::into_handle_test`] supplies a fresh
+    /// flag.
+    pub fn into_handle_with_shutdown(
+        self,
+        shutdown: Arc<AtomicBool>,
+    ) -> (GpuWorker, GpuHandle) {
+        let mmio = self.mmio.clone();
+        let (cmd_tx, cmd_rx) = unbounded::<GpuCommand>();
+        let (int_tx, int_rx) = unbounded::<PendingInterrupt>();
+        let digest = Arc::new(std::sync::Mutex::new(GpuDigestSnapshot::default()));
+        let worker = GpuWorker {
+            system: self,
+            cmd_rx,
+            int_tx,
+            digest: digest.clone(),
+            shutdown: shutdown.clone(),
+        };
+        let handle = GpuHandle {
+            cmd_tx,
+            int_rx,
+            mmio,
+            digest,
+            shutdown,
+        };
+        (worker, handle)
+    }
+
+    /// Convenience for tests: allocate a fresh shutdown flag and split.
+    pub fn into_handle(self) -> (GpuWorker, GpuHandle) {
+        self.into_handle_with_shutdown(Arc::new(AtomicBool::new(false)))
+    }
+}
+
+/// Polling interval for the no-op worker's shutdown check. A short sleep
+/// avoids burning a host core while still keeping shutdown latency under
+/// 10 ms, well below the 1 s defensive timeout in
+/// [`shutdown_and_join_with_timeout`].
+const NOOP_WORKER_POLL: Duration = Duration::from_millis(2);
+
+/// Maximum time the worker waits in `park_timeout` before re-checking
+/// shutdown / commands / ring state. With `unpark()` on every guest WPTR
+/// write the typical wake latency is microseconds; this is the upper
+/// bound for the shutdown / quiescent-state polling cadence. 16 ms aligns
+/// with vsync cadence on a 60 Hz host and bounds shutdown latency at the
+/// same value.
+const WORKER_PARK_TIMEOUT: Duration = Duration::from_millis(16);
+
+/// Cap on packets executed per outer-loop iteration before the worker
+/// re-checks shutdown / commands / digest publish. Mirrors the inline-mode
+/// `gpu_runs = max(1, min(64, executed_this_round / 6))` pacer ceiling.
+const WORKER_PACKETS_PER_ITER: u32 = 64;
+
+/// Backend for the kernel's `gpu` field. The two variants share a thin
+/// dispatch layer (forwarding methods on this enum) so call sites in
+/// `xenia-kernel` exports stay terse.
+///
+/// - [`GpuBackend::Inline`] keeps the legacy synchronous path: the CPU
+///   thread calls `kernel.gpu.execute_one(mem)` directly each scheduler
+///   round. Selected by `--gpu-inline` (rollback flag) or implied by
+///   `--ui` until the UI worker is migrated.
+/// - [`GpuBackend::Threaded`] (**default at M1.9**) hands `GpuSystem`
+///   ownership to a dedicated host thread; the CPU thread holds a
+///   [`GpuHandle`] proxy and talks to the worker via channels + the
+///   shared MMIO atomics.
+///
+/// `GpuBackend` itself is `Send` (the inline variant carries `GpuSystem`,
+/// which is Send-able as long as nothing inside it is `!Send` — it isn't);
+/// the threaded variant carries a `Send + Sync` handle.
+pub enum GpuBackend {
+    Inline(GpuSystem),
+    Threaded(GpuHandle),
+}
+
+impl GpuBackend {
+    /// Read the MMIO mailbox struct (cheap — `GpuMmio` is `Clone` cloning
+    /// only `Arc<AtomicU32>`s; we hand back a borrow). The result is the
+    /// same `Arc<AtomicU32>` set on either backend, so MMIO region
+    /// callbacks installed via [`crate::build_mmio_region`] route guest
+    /// writes to the same atomics the worker reads.
+    pub fn mmio(&self) -> &GpuMmio {
+        match self {
+            GpuBackend::Inline(s) => &s.mmio,
+            GpuBackend::Threaded(h) => &h.mmio,
+        }
+    }
+
+    /// Convenience: borrow the inline `GpuSystem` for code paths that
+    /// haven't been generalized to the `Threaded` variant yet (vd_swap's
+    /// drain, the various `state.gpu.X` reads in M1.5+ work). Returns
+    /// `None` in threaded mode; the caller's responsibility is to handle
+    /// that gracefully — typically by treating the operation as a no-op
+    /// or routing it through a command (see `vd_swap` notes).
+    pub fn as_inline(&self) -> Option<&GpuSystem> {
+        match self {
+            GpuBackend::Inline(s) => Some(s),
+            GpuBackend::Threaded(_) => None,
+        }
+    }
+
+    /// Mutable counterpart of [`Self::as_inline`].
+    pub fn as_inline_mut(&mut self) -> Option<&mut GpuSystem> {
+        match self {
+            GpuBackend::Inline(s) => Some(s),
+            GpuBackend::Threaded(_) => None,
+        }
+    }
+
+    /// Forward `VdInitializeRingBuffer`. Inline mode applies it directly;
+    /// threaded mode posts an `InitializeRing` command onto the worker
+    /// channel.
+    pub fn initialize_ring_buffer(&mut self, base: u32, size_log2: u32) {
+        match self {
+            GpuBackend::Inline(s) => s.initialize_ring_buffer(base, size_log2),
+            GpuBackend::Threaded(h) => {
+                let _ = h.send_cmd(GpuCommand::InitializeRing { base, size_log2 });
+            }
+        }
+    }
+
+    /// Forward `VdEnableRingBufferRPtrWriteBack`.
+    pub fn enable_rptr_writeback(&mut self, addr: u32, block_log2: u32) {
+        match self {
+            GpuBackend::Inline(s) => s.enable_rptr_writeback(addr, block_log2),
+            GpuBackend::Threaded(h) => {
+                let _ = h.send_cmd(GpuCommand::EnableRptrWriteback {
+                    addr,
+                    block_size_log2: block_log2,
+                });
+            }
+        }
+    }
+
+    /// Bump `CP_RB_WPTR` by `dwords`. Both backends route the bump through
+    /// the shared MMIO atomic mailbox (`Acquire`-load → wrap-add →
+    /// `Release`-store). Inline mode then picks up the new value on its
+    /// next `sync_with_mmio`; threaded mode's worker observes the same
+    /// atomic and folds it into its ring view.
+    ///
+    /// Note: the value stored is unmodulo'd. The reading side's
+    /// `sync_with_mmio` does the `% ring.size_dwords` step before
+    /// updating the local ring view, which is the only place a `size_dwords`
+    /// reference exists. Practical wptr drift before u32 wraps is
+    /// `2^32 / 64 ≈ 67M` VdSwap-style bumps — safely above any plausible
+    /// single-run total.
+    pub fn extend_write_ptr_by(&mut self, dwords: u32) {
+        let mmio = self.mmio();
+        // Relaxed is sufficient for the load — we re-store with Release
+        // and the readers (worker `sync_with_mmio` / inline next round)
+        // do their own Acquire. The load here is just a value source.
+        let cur = mmio.cp_rb_wptr.load(Ordering::Relaxed);
+        mmio.cp_rb_wptr
+            .store(cur.wrapping_add(dwords), Ordering::Release);
+    }
+
+    /// Drain any PM4 packets currently exposed by the ring (i.e., up to
+    /// the current `CP_RB_WPTR`). Inline mode runs the synchronous
+    /// drain. Threaded mode posts a [`GpuCommand::DrainFence`] and blocks
+    /// on the reply channel up to a 1 s defensive timeout — the worker
+    /// has its own ~900 ms internal deadline so the reply is bounded.
+    ///
+    /// The CPU thread blocking here is sound: the only thread that
+    /// satisfies the reply is the GPU worker, which never tries to
+    /// acquire any CPU-side primitive (it talks back exclusively through
+    /// channels and atomics). The lock-ordering argument from the M1.4
+    /// plan holds: T_cpu → cmd_tx → T_gpu → reply_tx → T_cpu, no cycle.
+    pub fn drain_to_current_wptr(&mut self, mem: &dyn xenia_memory::MemoryAccess) -> u32 {
+        match self {
+            GpuBackend::Inline(s) => {
+                s.sync_with_mmio();
+                s.drain(mem, 4096)
+            }
+            GpuBackend::Threaded(h) => {
+                let target_wptr = h.mmio.cp_rb_wptr.load(Ordering::Acquire);
+                let (reply_tx, reply_rx) = bounded::<()>(1);
+                if h
+                    .send_cmd(GpuCommand::DrainFence {
+                        target_wptr,
+                        reply_tx,
+                    })
+                    .is_err()
+                {
+                    // Worker disconnected; treat as drained.
+                    return 0;
+                }
+                match reply_rx.recv_timeout(Duration::from_secs(1)) {
+                    Ok(()) => {
+                        // We don't currently track the exact packet count
+                        // drained on the threaded path — the worker drains
+                        // by `is_ready` predicate. Return 1 as a "drain
+                        // happened" sentinel; the inline mode's exact
+                        // count is a debug-trace nicety.
+                        1
+                    }
+                    Err(_) => {
+                        tracing::warn!(
+                            target: "gpu",
+                            target_wptr,
+                            "vd_swap drain fence timed out at 1s; continuing teardown",
+                        );
+                        0
+                    }
+                }
+            }
+        }
+    }
+
+    /// Bump `swaps_seen` + record `last_swap` + push a swap interrupt.
+    /// Inline calls directly. Threaded sends `NotifyXeSwap` over the
+    /// command channel — fire-and-forget; the worker handles it on its
+    /// next loop iteration.
+    pub fn notify_xe_swap(&mut self, frontbuffer_phys: u32, width: u32, height: u32) {
+        match self {
+            GpuBackend::Inline(s) => s.notify_xe_swap(frontbuffer_phys, width, height),
+            GpuBackend::Threaded(h) => {
+                let _ = h.send_cmd(GpuCommand::NotifyXeSwap {
+                    frontbuffer_phys,
+                    width,
+                    height,
+                });
+            }
+        }
+    }
+
+    /// Forward [`GpuSystem::has_pending_interrupts`] under inline mode;
+    /// under threaded mode peek the `int_rx` channel.
+    pub fn has_pending_interrupts(&self) -> bool {
+        match self {
+            GpuBackend::Inline(s) => s.has_pending_interrupts(),
+            GpuBackend::Threaded(h) => !h.int_rx.is_empty(),
+        }
+    }
+
+    /// Drain pending interrupts. Inline path forwards to
+    /// [`GpuSystem::take_pending_interrupts`]; threaded path drains the
+    /// channel non-blockingly. (M1 step 6 fully wires the threaded path
+    /// when the worker starts pushing onto `int_tx`; for now, the channel
+    /// is empty in threaded mode at -n 2M, so this still returns an empty
+    /// `Vec`.)
+    pub fn take_pending_interrupts(&mut self) -> Vec<PendingInterrupt> {
+        match self {
+            GpuBackend::Inline(s) => s.take_pending_interrupts(),
+            GpuBackend::Threaded(h) => {
+                let mut out = Vec::new();
+                while let Ok(pi) = h.int_rx.try_recv() {
+                    out.push(pi);
+                }
+                out
+            }
+        }
+    }
+
+    /// End-of-run snapshot used by the run-digest. Inline mode reads
+    /// directly; threaded mode pulls the latest published mirror under a
+    /// brief lock. Returns owned data — safe to use after the worker has
+    /// shut down.
+    pub fn digest_snapshot(&self) -> GpuDigestSnapshot {
+        match self {
+            GpuBackend::Inline(s) => GpuDigestSnapshot {
+                stats: s.stats.clone(),
+                shader_blobs_live: s.shader_blobs.len() as u64,
+                texture_cache_entries: s.texture_cache.len() as u64,
+                texture_decodes: s.texture_cache.decodes_total,
+            },
+            GpuBackend::Threaded(h) => h
+                .digest
+                .lock()
+                .expect("GpuDigestSnapshot mutex poisoned")
+                .clone(),
+        }
+    }
+}
+
+impl GpuWorker {
+    /// Run loop body for the GPU host thread.
+    ///
+    /// Each iteration:
+    ///   1. Check the `Acquire`-loaded shutdown flag — exit if set.
+    ///   2. Drain any pending control-plane commands non-blockingly.
+    ///   3. Sample MMIO (refreshes WPTR / RPTR mailboxes into the live ring).
+    ///   4. Execute up to [`WORKER_PACKETS_PER_ITER`] PM4 packets while the
+    ///      ring is non-empty / not blocked.
+    ///   5. Refresh the shared digest snapshot under a brief lock.
+    ///   6. If no work was done this iteration, sleep
+    ///      [`WORKER_IDLE_SLEEP`]. Step 7 swaps this for `park_timeout`.
+    ///
+    /// `memory: Arc<GuestMemory>` is shared with the CPU thread. The
+    /// worker only ever reads through `&*memory`, which deref-coerces to
+    /// `&GuestMemory` and then to `&dyn MemoryAccess`. All mutations on
+    /// `MemoryAccess` are `&self` post-trait-flip, so concurrent CPU and
+    /// GPU writes are sound under the trait's contract (callers must not
+    /// concurrently read/write the same byte range from different
+    /// threads — vd_swap's RPTR writeback / EVENT_WRITE_SHD writes target
+    /// guest-thread-private addresses by construction).
+    pub fn run(mut self, memory: Arc<GuestMemory>) {
+        // M1.7 parker registration: publish our `Thread` handle so the
+        // MMIO `CP_RB_WPTR` write callback can `unpark()` us. Only one
+        // worker thread per `GpuMmio`; we replace whatever was there.
+        if let Ok(mut g) = self.system.mmio.worker_thread.lock() {
+            *g = Some(thread::current());
+        }
+        loop {
+            // (1) shutdown
+            if self.shutdown.load(Ordering::Acquire) {
+                break;
+            }
+            // (2) drain commands
+            let mut did_work = false;
+            while let Ok(cmd) = self.cmd_rx.try_recv() {
+                did_work = true;
+                match cmd {
+                    GpuCommand::InitializeRing { base, size_log2 } => {
+                        self.system.initialize_ring_buffer(base, size_log2);
+                    }
+                    GpuCommand::EnableRptrWriteback {
+                        addr,
+                        block_size_log2,
+                    } => {
+                        self.system.enable_rptr_writeback(addr, block_size_log2);
+                    }
+                    GpuCommand::DrainFence {
+                        target_wptr: _,
+                        reply_tx,
+                    } => {
+                        // Drain the ring up to whatever WPTR the MMIO
+                        // atomic currently exposes (the CPU side bumped
+                        // it before sending the fence). Bounded by an
+                        // internal 900 ms deadline — 100 ms tighter than
+                        // the CPU's `recv_timeout(1s)` so the timeout
+                        // surfaces on the CPU side as a clean
+                        // `RecvTimeout` rather than a partial drain that
+                        // looks complete.
+                        //
+                        // The drain loop polls `is_ready` after each
+                        // packet; `sync_with_mmio` between packets is
+                        // what folds late guest WPTR writes into the
+                        // local ring view. Loop exits when the ring is
+                        // empty (rptr == wptr after modulo) or a packet
+                        // returns `Idle`/`Blocked`.
+                        self.system.sync_with_mmio();
+                        let deadline = Instant::now() + Duration::from_millis(900);
+                        while self.system.is_ready(&*memory) {
+                            if Instant::now() >= deadline {
+                                break;
+                            }
+                            match self.system.execute_one(&*memory) {
+                                ExecOutcome::Stepped { .. } => {
+                                    self.system.sync_with_mmio();
+                                }
+                                ExecOutcome::Idle | ExecOutcome::Blocked => break,
+                            }
+                        }
+                        let _ = reply_tx.send(());
+                    }
+                    GpuCommand::NotifyXeSwap {
+                        frontbuffer_phys,
+                        width,
+                        height,
+                    } => {
+                        self.system
+                            .notify_xe_swap(frontbuffer_phys, width, height);
+                    }
+                    GpuCommand::Shutdown => {
+                        self.shutdown.store(true, Ordering::Release);
+                        return;
+                    }
+                }
+            }
+            // (3,4) drive the GPU
+            self.system.sync_with_mmio();
+            let mut budget = WORKER_PACKETS_PER_ITER;
+            while budget > 0 && self.system.is_ready(&*memory) {
+                match self.system.execute_one(&*memory) {
+                    ExecOutcome::Stepped { .. } => {
+                        did_work = true;
+                        self.system.sync_with_mmio();
+                    }
+                    ExecOutcome::Idle | ExecOutcome::Blocked => break,
+                }
+                budget -= 1;
+            }
+            // (5a) M1.6: forward `PM4_INTERRUPT` / `XE_SWAP` events from
+            //      `system.pending_interrupts` onto `int_tx`. The Vec
+            //      lives on this thread; the channel is the cross-thread
+            //      delivery primitive. Send is non-blocking (unbounded)
+            //      and the receive end is drained by the CPU thread's
+            //      per-round queue at `main.rs::run_execution`.
+            //      `int_tx.send` returns `Err` only if the receiver was
+            //      dropped — which means the CPU side is gone, in which
+            //      case we'd be torn down momentarily anyway.
+            for pi in self.system.take_pending_interrupts() {
+                if self.int_tx.send(pi).is_err() {
+                    break;
+                }
+                did_work = true;
+            }
+            // (5b) publish digest snapshot
+            if did_work {
+                let snap = GpuDigestSnapshot {
+                    stats: self.system.stats.clone(),
+                    shader_blobs_live: self.system.shader_blobs.len() as u64,
+                    texture_cache_entries: self.system.texture_cache.len() as u64,
+                    texture_decodes: self.system.texture_cache.decodes_total,
+                };
+                if let Ok(mut g) = self.digest.lock() {
+                    *g = snap;
+                }
+            }
+            // (6) M1.7 parker — `park_timeout` replaces the polling
+            //     sleep. The standard parker idiom defends against the
+            //     producer-races-park lost-wakeup:
+            //
+            //     1. Swap `wake_pending` to false (claim "we're going to
+            //        park"). If `was_pending` is true, a producer
+            //        signaled us between the last work and now — skip
+            //        the park, loop and re-process.
+            //     2. Re-check side conditions (cmd channel, shutdown).
+            //        These may have changed after step 1.
+            //     3. `park_timeout`. If the producer's `unpark()` runs
+            //        between our re-check and our park call, std's token
+            //        records it and the next park returns immediately.
+            //        If neither happens within `WORKER_PARK_TIMEOUT`, we
+            //        wake on our own and re-evaluate.
+            if !did_work {
+                let was_pending =
+                    self.system.mmio.wake_pending.swap(false, Ordering::AcqRel);
+                if !was_pending
+                    && self.cmd_rx.is_empty()
+                    && !self.shutdown.load(Ordering::Acquire)
+                {
+                    thread::park_timeout(WORKER_PARK_TIMEOUT);
+                }
+            }
+        }
+        // Clear the wake target on exit so post-shutdown MMIO writes
+        // don't try to unpark a dead thread (sound — `Thread::unpark`
+        // on an exited thread is a no-op — but it keeps the invariant
+        // tidy).
+        if let Ok(mut g) = self.system.mmio.worker_thread.lock() {
+            *g = None;
+        }
+    }
+}
+
+/// Spawn the real GPU worker thread. Returns its `JoinHandle`; the
+/// matching `GpuHandle` (caller's existing one from
+/// [`GpuSystem::into_handle_with_shutdown`]) is what the CPU thread keeps.
+pub fn spawn_gpu_worker(
+    worker: GpuWorker,
+    memory: Arc<GuestMemory>,
+) -> JoinHandle<()> {
+    thread::Builder::new()
+        .name("xenia-gpu".to_string())
+        .spawn(move || worker.run(memory))
+        .expect("spawn xenia-gpu worker thread")
+}
+
+/// M1 step 3 — spawn a placeholder GPU worker thread that does nothing
+/// except poll `shutdown` on a short cadence and exit cleanly when it sees
+/// `true`. Verifies thread lifecycle, signal propagation, and clean
+/// teardown. **Not used in production paths**: in step 4 the worker grows a
+/// real `GpuWorker::run` body that owns a `GpuSystem`. Until then this
+/// function is the only spawn site, gated behind `--gpu-thread` in the CLI.
+///
+/// The function returns the `JoinHandle` so the caller can block on
+/// teardown via [`shutdown_and_join_with_timeout`]. Ownership of the
+/// `Arc<AtomicBool>` is shared: the caller keeps a clone for signaling,
+/// the thread takes another clone for polling.
+///
+/// Rationale for `Acquire`/`Release` ordering on the bool: the spawning
+/// thread may set up shared state *before* the worker reads it once we
+/// expand the worker in step 4 — the `Release` store on shutdown then
+/// pairs with the `Acquire` load here so any prior writes the spawner did
+/// (e.g. populating channels with farewell messages) are visible to the
+/// worker. For the no-op stage there's no shared state, but using the
+/// stricter ordering now means step 4 inherits a correctly-fenced
+/// shutdown protocol with no further changes.
+pub fn spawn_noop_worker(shutdown: Arc<AtomicBool>) -> JoinHandle<()> {
+    thread::Builder::new()
+        .name("xenia-gpu-noop".to_string())
+        .spawn(move || {
+            while !shutdown.load(Ordering::Acquire) {
+                thread::sleep(NOOP_WORKER_POLL);
+            }
+        })
+        .expect("spawn xenia-gpu-noop worker thread")
+}
+
+/// Signal `shutdown` to the worker and join its thread, with a defensive
+/// timeout so a misbehaving worker can't wedge the entire process. Logs at
+/// `error!` if the timeout fires (which would indicate either the worker
+/// loop ignoring `shutdown` or being parked on a primitive that wasn't
+/// woken — both are bugs the user should hear about).
+///
+/// Returns `Ok(())` on a clean join inside the timeout, `Err(())` on
+/// timeout. The caller decides whether to continue process teardown anyway
+/// (typically yes — the worker's only state is its own stack).
+pub fn shutdown_and_join_with_timeout(
+    shutdown: &Arc<AtomicBool>,
+    handle: JoinHandle<()>,
+    timeout: Duration,
+) -> Result<(), ()> {
+    shutdown.store(true, Ordering::Release);
+    // No `JoinHandle::join_timeout` in std; emulate via a side-channel
+    // signal from the polling. We use a sentinel-clone of the JoinHandle
+    // approach: spawn a watchdog that, after the timeout, sets a
+    // "give up" flag — but `join` is a blocking call we can't preempt.
+    // Instead use a parking helper: spawn a hop-thread to call join, and
+    // wait on it via a `crossbeam_channel::after` select.
+    let (tx, rx) = unbounded::<()>();
+    let join_thread = thread::Builder::new()
+        .name("xenia-gpu-joiner".to_string())
+        .spawn(move || {
+            let _ = handle.join();
+            // `_ = tx.send(())` — receiver may already be dropped if
+            // we timed out, in which case Err is fine.
+            let _ = tx.send(());
+        })
+        .expect("spawn xenia-gpu-joiner thread");
+    crossbeam_channel::select! {
+        recv(rx) -> _ => {
+            // Joiner finished within the budget. Reap it (no work — the
+            // thread already returned). `join` here is fast.
+            let _ = join_thread.join();
+            Ok(())
+        }
+        recv(crossbeam_channel::after(timeout)) -> _ => {
+            tracing::error!(
+                target: "gpu",
+                ?timeout,
+                "GPU worker did not exit in time; leaking thread to avoid wedging shutdown",
+            );
+            // Detach the joiner; it leaks but at least we proceed. Will
+            // get cleaned up when the process exits.
+            std::mem::drop(join_thread);
+            Err(())
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Roundtrip an `InitializeRing` command: CPU side sends, worker side
+    /// receives, payload bytes match. This validates the channel plumbing
+    /// without touching any GPU semantics.
+    #[test]
+    fn initialize_ring_roundtrips_through_handle() {
+        let (worker, handle) = GpuSystem::new().into_handle();
+        let GpuWorker { cmd_rx, .. } = worker;
+        handle
+            .cmd_tx
+            .send(GpuCommand::InitializeRing {
+                base: 0x1000_0000,
+                size_log2: 18,
+            })
+            .expect("cmd_tx send");
+        match cmd_rx.recv().expect("cmd_rx recv") {
+            GpuCommand::InitializeRing { base, size_log2 } => {
+                assert_eq!(base, 0x1000_0000);
+                assert_eq!(size_log2, 18);
+            }
+            other => panic!("unexpected cmd: {other:?}"),
+        }
+    }
+
+    /// MMIO atomics on the handle and the worker's GpuSystem must be the
+    /// same Arc (clone). A guest write through the MMIO region callback
+    /// stores into `handle.mmio.cp_rb_wptr`; the worker observes the same
+    /// value via `worker.system.mmio.cp_rb_wptr`. If we accidentally
+    /// allocated a fresh atomic for either side, the worker would never see
+    /// guest writes.
+    #[test]
+    fn mmio_arcs_are_shared_between_handle_and_worker() {
+        use std::sync::atomic::Ordering;
+        let (worker, handle) = GpuSystem::new().into_handle();
+        handle.mmio.cp_rb_wptr.store(0xC0FFEE, Ordering::Release);
+        assert_eq!(
+            worker.system.mmio.cp_rb_wptr.load(Ordering::Acquire),
+            0xC0FFEE,
+            "worker side did not observe handle-side atomic store",
+        );
+    }
+
+    /// `GpuHandle` must be `Send + Sync`. Compile-time assertion via type
+    /// constraint — won't link if the bound is violated.
+    #[test]
+    fn handle_is_send_sync() {
+        fn assert_send_sync<T: Send + Sync>() {}
+        assert_send_sync::<GpuHandle>();
+    }
+
+    /// `GpuWorker` must be `Send` so we can move it onto a thread in step 3.
+    /// (`Sync` is not required — only one thread ever owns the worker.)
+    #[test]
+    fn worker_is_send() {
+        fn assert_send<T: Send>() {}
+        assert_send::<GpuWorker>();
+    }
+
+    /// Dropping the handle (CPU side) must not block recv on the worker
+    /// side; instead it must surface as `Disconnected`. This is the
+    /// standard crossbeam guarantee but we pin it down with an explicit
+    /// test so a future refactor (e.g. switching channel backends) can't
+    /// silently change semantics.
+    #[test]
+    fn dropping_handle_disconnects_command_channel() {
+        let (worker, handle) = GpuSystem::new().into_handle();
+        drop(handle);
+        let result = worker.cmd_rx.recv();
+        assert!(
+            matches!(result, Err(crossbeam_channel::RecvError)),
+            "expected Disconnected after handle drop, got {result:?}",
+        );
+    }
+
+    /// Spawn the no-op worker, signal shutdown immediately, and join. Must
+    /// complete within a generous timeout (the polling cadence is 2 ms,
+    /// so 250 ms gives plenty of headroom even on a loaded test runner).
+    #[test]
+    fn noop_worker_shuts_down_cleanly() {
+        let shutdown = Arc::new(AtomicBool::new(false));
+        let handle = spawn_noop_worker(shutdown.clone());
+        // Brief gap so the worker enters its loop at least once before we
+        // signal — exercises the in-loop exit path rather than the never-
+        // entered-loop case.
+        thread::sleep(Duration::from_millis(5));
+        let outcome =
+            shutdown_and_join_with_timeout(&shutdown, handle, Duration::from_millis(250));
+        assert_eq!(outcome, Ok(()), "no-op worker did not join in budget");
+    }
+
+    /// A worker that never exits must surface as a timeout, not a hang.
+    /// Validates the `crossbeam_channel::after` budget. We construct a
+    /// pseudo-worker that ignores `shutdown` for the entire timeout
+    /// window, then exits — the test budget is short enough to force the
+    /// timeout path.
+    #[test]
+    fn shutdown_join_timeouts_on_misbehaving_worker() {
+        let shutdown = Arc::new(AtomicBool::new(false));
+        let handle = thread::Builder::new()
+            .name("test-misbehaving-worker".to_string())
+            .spawn(|| {
+                // Sleep longer than the test's join budget. The test
+                // proves the join helper returns `Err(())` rather than
+                // blocking the test process indefinitely.
+                thread::sleep(Duration::from_millis(500));
+            })
+            .expect("spawn misbehaving worker");
+        let outcome =
+            shutdown_and_join_with_timeout(&shutdown, handle, Duration::from_millis(50));
+        assert_eq!(outcome, Err(()), "expected timeout signal");
+    }
+
+    /// M1.8 — `write_u32_fence` / `read_u32_fence` ordering test. A
+    /// producer thread writes a "data" value, then a "fence" value via
+    /// `write_u32_fence`. A consumer thread spin-reads the fence via
+    /// `read_u32_fence` and, on observing the producer's update, reads
+    /// the data via plain `read_u32`. The data must always equal the
+    /// producer's pre-fence write — never an older value or a torn read.
+    ///
+    /// On x86_64 (TSO) this would pass even without the fences; on
+    /// weaker architectures it would fail without them. We pin down the
+    /// invariant here so future ports / refactors can't silently weaken
+    /// it. Uses a small synthetic memory implementing `MemoryAccess`.
+    #[test]
+    fn write_u32_fence_publishes_prior_writes() {
+        use std::sync::atomic::{AtomicBool, AtomicU32};
+        use std::time::Instant;
+        use xenia_memory::MemoryAccess;
+
+        // The test's MemoryAccess impl uses `AtomicU32` slots so the
+        // multi-byte u32 reads/writes are torn-read-free. The fence
+        // helper layers Release/Acquire on top — without atomic
+        // storage, byte-by-byte reads on a writer-racing buffer would
+        // see torn values regardless of the fence.
+        const SLOT_COUNT: usize = 16;
+        struct ScopedMem([AtomicU32; SLOT_COUNT]);
+        impl ScopedMem {
+            fn slot(&self, addr: u32) -> &AtomicU32 {
+                &self.0[(addr / 4) as usize]
+            }
+        }
+        impl MemoryAccess for ScopedMem {
+            fn read_u8(&self, addr: u32) -> u8 {
+                let v = self.slot(addr & !3).load(Ordering::Relaxed);
+                let shift = (addr & 3) * 8;
+                (v >> shift) as u8
+            }
+            fn read_u16(&self, addr: u32) -> u16 {
+                u16::from_le_bytes([self.read_u8(addr), self.read_u8(addr + 1)])
+            }
+            fn read_u32(&self, addr: u32) -> u32 {
+                self.slot(addr).load(Ordering::Relaxed)
+            }
+            fn read_u64(&self, addr: u32) -> u64 {
+                let lo = self.read_u32(addr) as u64;
+                let hi = self.read_u32(addr + 4) as u64;
+                lo | (hi << 32)
+            }
+            fn write_u8(&self, _addr: u32, _val: u8) {
+                unimplemented!("test fixture only writes u32")
+            }
+            fn write_u16(&self, _addr: u32, _val: u16) {
+                unimplemented!("test fixture only writes u32")
+            }
+            fn write_u32(&self, addr: u32, val: u32) {
+                self.slot(addr).store(val, Ordering::Relaxed);
+            }
+            fn write_u64(&self, addr: u32, val: u64) {
+                self.write_u32(addr, val as u32);
+                self.write_u32(addr + 4, (val >> 32) as u32);
+            }
+            fn translate(&self, _addr: u32) -> Option<*const u8> {
+                None
+            }
+            fn translate_mut(&self, _addr: u32) -> Option<*mut u8> {
+                None
+            }
+        }
+
+        let mem: Arc<ScopedMem> = Arc::new(ScopedMem(std::array::from_fn(|_| {
+            AtomicU32::new(0)
+        })));
+        // Initialize fence and data slots to zero.
+        mem.write_u32(0, 0); // data
+        mem.write_u32(16, 0); // fence
+
+        let stop = Arc::new(AtomicBool::new(false));
+
+        let mem_p = mem.clone();
+        let stop_p = stop.clone();
+        let producer = thread::Builder::new()
+            .name("fence-producer".into())
+            .spawn(move || {
+                for i in 1u32..=10_000 {
+                    if stop_p.load(Ordering::Relaxed) {
+                        break;
+                    }
+                    mem_p.write_u32(0, i); // data
+                    mem_p.write_u32_fence(16, i); // fence (Release)
+                    thread::yield_now();
+                }
+            })
+            .expect("spawn producer");
+
+        let mem_c = mem.clone();
+        let consumer = thread::Builder::new()
+            .name("fence-consumer".into())
+            .spawn(move || {
+                let deadline = Instant::now() + Duration::from_millis(500);
+                let mut last_seen = 0u32;
+                let mut iters = 0u32;
+                while Instant::now() < deadline {
+                    let f = mem_c.read_u32_fence(16); // Acquire
+                    if f != last_seen {
+                        let d = mem_c.read_u32(0);
+                        // The data we read after the fence must be at
+                        // least as new as the fence value (producer
+                        // wrote `data = i; fence(i)` in that order).
+                        assert!(
+                            d >= f,
+                            "fence ordering violated: data={d} fence={f}"
+                        );
+                        last_seen = f;
+                        iters += 1;
+                    }
+                }
+                iters
+            })
+            .expect("spawn consumer");
+
+        let observed = consumer.join().expect("consumer join");
+        stop.store(true, Ordering::Relaxed);
+        let _ = producer.join();
+        assert!(
+            observed > 0,
+            "consumer never observed a fence transition (race scheduler too unfair?)",
+        );
+    }
+
+    /// Spawning two no-op workers in parallel and joining both must
+    /// succeed without interference — proves the joiner side-thread
+    /// pattern doesn't accidentally serialize teardown.
+    #[test]
+    fn two_concurrent_noop_workers_both_shut_down() {
+        let shutdown_a = Arc::new(AtomicBool::new(false));
+        let handle_a = spawn_noop_worker(shutdown_a.clone());
+        let shutdown_b = Arc::new(AtomicBool::new(false));
+        let handle_b = spawn_noop_worker(shutdown_b.clone());
+        let r_a = shutdown_and_join_with_timeout(
+            &shutdown_a,
+            handle_a,
+            Duration::from_millis(250),
+        );
+        let r_b = shutdown_and_join_with_timeout(
+            &shutdown_b,
+            handle_b,
+            Duration::from_millis(250),
+        );
+        assert_eq!(r_a, Ok(()));
+        assert_eq!(r_b, Ok(()));
+    }
+}
diff --git a/crates/xenia-gpu/src/lib.rs b/crates/xenia-gpu/src/lib.rs
index 8adce9c..74f906f 100644
--- a/crates/xenia-gpu/src/lib.rs
+++ b/crates/xenia-gpu/src/lib.rs
@@ -1,21 +1,49 @@
+//! Xenos GPU emulation for xenia-rs.
+//!
+//! Modules:
+//! - [`pm4`]: packet format decoder + Type-3 opcode set.
+//! - [`ring_view`]: ring-buffer bookkeeping (base/size/read/write pointers).
+//! - [`register_file`]: 0x6000-entry register array backing the CP + state.
+//! - [`gpu_system`]: top-level `GpuSystem` + PM4 executor running one packet
+//!   per call (see the plan's P2 for the design rationale).
+//!
+//! Legacy module `ring_drain` and `command_processor` are retained while P3+
+//! migrations finish; they will be removed once every caller is on
+//! [`gpu_system::GpuSystem`].
+
 pub mod command_processor;
+pub mod draw_state;
+pub mod edram;
+pub mod gpu_system;
+pub mod handle;
+pub mod mmio_region;
+pub mod pm4;
+pub mod primitive;
 pub mod register_file;
+pub mod ring_drain;
+pub mod ring_view;
+pub mod render_target_cache;
+pub mod resolve;
+pub mod shader_metrics;
+pub mod shaders;
+pub mod texture_cache;
+pub mod tiled_address;
+pub mod translator;
+pub mod ucode;
+pub mod xenos_constants;
 
-/// Stub GPU system for initial implementation.
-pub struct GpuSystem {
-    pub register_file: register_file::RegisterFile,
-}
-
-impl GpuSystem {
-    pub fn new() -> Self {
-        Self {
-            register_file: register_file::RegisterFile::new(),
-        }
-    }
-}
-
-impl Default for GpuSystem {
-    fn default() -> Self {
-        Self::new()
-    }
-}
+pub use gpu_system::{
+    ExecOutcome, GpuBlock, GpuMmio, GpuStats, GpuSystem, InterruptSource, PendingInterrupt,
+    ShaderBlob, SwapNotification, WaitCmp,
+};
+pub use handle::{
+    DrainReply, GpuBackend, GpuCommand, GpuDigestSnapshot, GpuHandle, GpuWorker,
+    shutdown_and_join_with_timeout, spawn_gpu_worker, spawn_noop_worker,
+};
+pub use mmio_region::build_region as build_mmio_region;
+pub use pm4::{
+    PacketHeader, PacketKind, PM4_INTERRUPT, PM4_NOP, PM4_XE_SWAP, SWAP_SIGNATURE,
+    type3_opcode_name,
+};
+pub use ring_drain::{DrainResult, drain};
+pub use ring_view::RingBufferView;
diff --git a/crates/xenia-gpu/src/mmio_region.rs b/crates/xenia-gpu/src/mmio_region.rs
new file mode 100644
index 0000000..fe32c62
--- /dev/null
+++ b/crates/xenia-gpu/src/mmio_region.rs
@@ -0,0 +1,217 @@
+//! Construct an `xenia_memory::MmioRegion` that backs the Xenos GPU register
+//! aperture at guest physical `0x7FC80000` (per canary
+//! `graphics_system.cc:141-144` — `memory_->AddVirtualMappedRange(0x7FC80000,
+//! 0xFFFF0000, 0x0000FFFF, …)`).
+//!
+//! Only a handful of registers need a round-trip over the bus; everything
+//! else (the ALU / fetch constants, the RBBM state machine, …) lives inside
+//! `GpuSystem::register_file` and is driven by PM4 packets from the CP on
+//! the same host thread.
+//!
+//! The read/write closures capture `Arc<AtomicU32>` mailboxes cloned from
+//! [`crate::GpuMmio`]; [`crate::GpuSystem::sync_with_mmio`] samples them
+//! each scheduler round.
+
+use std::sync::atomic::Ordering;
+
+use xenia_memory::MmioRegion;
+
+use crate::gpu_system::{reg, GpuMmio};
+
+/// Xenos GPU register aperture base (guest physical address). Matches
+/// canary's `graphics_system.cc:141`.
+pub const APERTURE_BASE: u32 = 0x7FC8_0000;
+/// Mask used by `MmioRegion::contains` so any `0x7FC8xxxx` address hits.
+pub const APERTURE_MASK: u32 = 0xFFFF_0000;
+/// Total aperture size in bytes (enough for the low 16-bit register window).
+pub const APERTURE_SIZE: u32 = 0x0001_0000;
+
+/// Build the [`MmioRegion`] to install on the guest memory.
+pub fn build_region(mmio: &GpuMmio) -> MmioRegion {
+    let read_wptr = mmio.cp_rb_wptr.clone();
+    let read_rptr = mmio.cp_rb_rptr.clone();
+    let read_int_status = mmio.cp_int_status.clone();
+    let read_int_ack = mmio.cp_int_ack.clone();
+    let read_vblank_status = mmio.d1mode_vblank_vline_status.clone();
+    let write_wptr = mmio.cp_rb_wptr.clone();
+    let write_int_ack = mmio.cp_int_ack.clone();
+    let write_vblank_status = mmio.d1mode_vblank_vline_status.clone();
+    // M1.7 parker — captured into the WPTR write closure to wake a
+    // parked GPU worker on every guest WPTR write. In inline mode the
+    // mutex holds `None`, so the unpark site is a brief lock + no-op.
+    let wake_pending = mmio.wake_pending.clone();
+    let worker_thread = mmio.worker_thread.clone();
+
+    MmioRegion {
+        base_address: APERTURE_BASE,
+        mask: APERTURE_MASK,
+        size: APERTURE_SIZE,
+        read_callback: Box::new(move |addr: u32| {
+            let reg_index = (addr & 0xFFFF) / 4;
+            match reg_index {
+                reg::CP_RB_WPTR => read_wptr.load(Ordering::Relaxed),
+                reg::CP_RB_RPTR => read_rptr.load(Ordering::Relaxed),
+                reg::CP_INT_STATUS => read_int_status.load(Ordering::Relaxed),
+                // Games sometimes read-back the ack register to check interrupt ownership
+                // — serve the last-written value.
+                reg::CP_INT_ACK => read_int_ack.load(Ordering::Relaxed),
+                reg::D1MODE_VBLANK_VLINE_STATUS => {
+                    read_vblank_status.load(Ordering::Relaxed)
+                }
+                _ => {
+                    tracing::trace!(
+                        reg = format_args!("{reg_index:#x}"),
+                        addr = format_args!("{addr:#010x}"),
+                        "gpu mmio: unmapped read (returning 0)"
+                    );
+                    0
+                }
+            }
+        }),
+        write_callback: Box::new(move |addr: u32, value: u32| {
+            let reg_index = (addr & 0xFFFF) / 4;
+            match reg_index {
+                reg::CP_RB_WPTR => {
+                    // Release: any prior writes to ring memory the guest
+                    // performed before bumping WPTR must be visible to
+                    // the GPU consumer that Acquire-loads this atomic.
+                    write_wptr.store(value, Ordering::Release);
+                    // M1.7 parker wake: set the pending bit (Release) so
+                    // a worker swapping it on its way to `park_timeout`
+                    // sees `was_pending == true` and skips the park; AND
+                    // unpark the worker if it's already parked. Both are
+                    // necessary to defend against the race window between
+                    // the worker's `swap(false)` and `park_timeout()`.
+                    wake_pending.store(true, Ordering::Release);
+                    if let Ok(g) = worker_thread.lock() {
+                        if let Some(t) = g.as_ref() {
+                            t.unpark();
+                        }
+                    }
+                    tracing::trace!(
+                        value,
+                        addr = format_args!("{addr:#010x}"),
+                        "gpu mmio: CP_RB_WPTR write"
+                    );
+                }
+                // CP_INT_ACK clears interrupt bits; we just echo the value.
+                reg::CP_INT_ACK => {
+                    write_int_ack.store(value, Ordering::Relaxed);
+                }
+                // D1MODE_VBLANK_VLINE_STATUS is write-1-to-clear per the
+                // AMD M56 display-controller ref. Clear any bit the guest
+                // writes a 1 to (leaving other bits untouched).
+                reg::D1MODE_VBLANK_VLINE_STATUS => {
+                    let prev = write_vblank_status.load(Ordering::Relaxed);
+                    write_vblank_status.store(prev & !value, Ordering::Relaxed);
+                }
+                _ => {
+                    tracing::trace!(
+                        reg = format_args!("{reg_index:#x}"),
+                        addr = format_args!("{addr:#010x}"),
+                        value = format_args!("{value:#x}"),
+                        "gpu mmio: unmapped write (dropping)"
+                    );
+                }
+            }
+        }),
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn build() -> (GpuMmio, MmioRegion) {
+        let mmio = GpuMmio::new();
+        let region = build_region(&mmio);
+        (mmio, region)
+    }
+
+    /// `D1MODE_VBLANK_VLINE_STATUS` read must surface the atomic's current
+    /// value — Sylpheed's graphics-interrupt callback reads bit 0 to decide
+    /// whether vblank actually fired; if we always return 0 the callback
+    /// silently skips every frame's work.
+    #[test]
+    fn vblank_status_read_returns_stored_value() {
+        let (mmio, region) = build();
+        mmio.d1mode_vblank_vline_status
+            .store(0x1, Ordering::Relaxed);
+        let offset = APERTURE_BASE + reg::D1MODE_VBLANK_VLINE_STATUS * 4;
+        assert_eq!((region.read_callback)(offset), 0x1);
+    }
+
+    /// Guest clears the flag by writing 1 back. Classic write-1-to-clear —
+    /// AMD M56 display-controller ref and Canary's behavior. We preserve
+    /// unrelated bits so higher-bit status (VLINE_INT_OCCURRED etc.) can
+    /// coexist with a concurrent clear of bit 0.
+    #[test]
+    fn vblank_status_write_1_to_clear() {
+        let (mmio, region) = build();
+        mmio.d1mode_vblank_vline_status
+            .store(0b11, Ordering::Relaxed);
+        let offset = APERTURE_BASE + reg::D1MODE_VBLANK_VLINE_STATUS * 4;
+        (region.write_callback)(offset, 0b01);
+        assert_eq!(
+            mmio.d1mode_vblank_vline_status.load(Ordering::Relaxed),
+            0b10,
+            "bit 0 cleared, bit 1 preserved"
+        );
+    }
+
+    /// Write-0-to-a-bit must NOT clear that bit — classic W1TC semantics.
+    #[test]
+    fn vblank_status_write_0_is_noop() {
+        let (mmio, region) = build();
+        mmio.d1mode_vblank_vline_status
+            .store(0b11, Ordering::Relaxed);
+        let offset = APERTURE_BASE + reg::D1MODE_VBLANK_VLINE_STATUS * 4;
+        (region.write_callback)(offset, 0x0);
+        assert_eq!(
+            mmio.d1mode_vblank_vline_status.load(Ordering::Relaxed),
+            0b11
+        );
+    }
+
+    /// Regression: prior to the fix, `reg::CP_RB_WPTR` held a byte offset
+    /// (`0x0714`) while the match arm compared against a *register index*
+    /// (`(addr & 0xFFFF) / 4 == 0x01C5`). Guest MMIO writes to the WPTR
+    /// therefore fell through to "unmapped" and the atomic never moved;
+    /// only `VdInitializeRingBuffer` / `extend_write_ptr` paths worked.
+    ///
+    /// Verify every CP register lands in its atomic when the guest writes
+    /// at the canonical `APERTURE_BASE + index*4` byte address.
+    #[test]
+    fn cp_rb_wptr_write_via_mmio_bus_reaches_atomic() {
+        let (mmio, region) = build();
+        let offset = APERTURE_BASE + reg::CP_RB_WPTR * 4;
+        assert_eq!(offset, 0x7FC8_0714, "byte offset must match Canary CP_RB_WPTR");
+        (region.write_callback)(offset, 0x1234_5678);
+        assert_eq!(mmio.cp_rb_wptr.load(Ordering::Relaxed), 0x1234_5678);
+    }
+
+    #[test]
+    fn cp_int_ack_write_via_mmio_bus_reaches_atomic() {
+        let (mmio, region) = build();
+        let offset = APERTURE_BASE + reg::CP_INT_ACK * 4;
+        assert_eq!(offset, 0x7FC8_07D0, "byte offset must match Canary CP_INT_ACK");
+        (region.write_callback)(offset, 0xDEAD_BEEF);
+        assert_eq!(mmio.cp_int_ack.load(Ordering::Relaxed), 0xDEAD_BEEF);
+    }
+
+    #[test]
+    fn cp_rb_rptr_read_via_mmio_bus_returns_atomic() {
+        let (mmio, region) = build();
+        mmio.cp_rb_rptr.store(0xCAFE_F00D, Ordering::Relaxed);
+        let offset = APERTURE_BASE + reg::CP_RB_RPTR * 4;
+        assert_eq!((region.read_callback)(offset), 0xCAFE_F00D);
+    }
+
+    #[test]
+    fn cp_int_status_read_via_mmio_bus_returns_atomic() {
+        let (mmio, region) = build();
+        mmio.cp_int_status.store(0x0000_0001, Ordering::Relaxed);
+        let offset = APERTURE_BASE + reg::CP_INT_STATUS * 4;
+        assert_eq!((region.read_callback)(offset), 0x0000_0001);
+    }
+}
diff --git a/crates/xenia-gpu/src/pm4.rs b/crates/xenia-gpu/src/pm4.rs
new file mode 100644
index 0000000..370b0ba
--- /dev/null
+++ b/crates/xenia-gpu/src/pm4.rs
@@ -0,0 +1,232 @@
+//! PM4 packet format — header decoding + Type-3 opcode set.
+//!
+//! Xenos PM4 packet layout mirrors `xenia-canary/src/xenia/gpu/packet_disassembler.cc`:
+//!
+//! - **Type 0** (`packet >> 30 == 0`): register-write run.
+//!   `count = ((packet >> 16) & 0x3FFF) + 1`. Total dwords = `1 + count`.
+//!   With `(packet >> 15) & 1 == 1`, all writes target the same register.
+//! - **Type 1** (`packet >> 30 == 1`): two-register write. Total dwords = 3.
+//! - **Type 2** (`packet >> 30 == 2`): NOP — a single skipped dword.
+//! - **Type 3** (`packet >> 30 == 3`): command.
+//!   `opcode = (packet >> 8) & 0x7F`,
+//!   `count  = ((packet >> 16) & 0x3FFF) + 1`.
+//!   Total dwords = `1 + count`.
+
+/// The cookie canary writes alongside `PM4_XE_SWAP` so tooling can recognize
+/// swap packets. `'X','E','N','X'` big-endian (`kSwapSignature`).
+pub const SWAP_SIGNATURE: u32 = 0x584E_4558;
+
+// ── Named Type-3 opcodes (from xenia-canary/src/xenia/gpu/xenos.h:1617-1679) ──
+
+pub const PM4_ME_INIT: u8 = 0x48;
+pub const PM4_NOP: u8 = 0x10;
+pub const PM4_INDIRECT_BUFFER: u8 = 0x3F;
+pub const PM4_INDIRECT_BUFFER_PFD: u8 = 0x37;
+pub const PM4_WAIT_FOR_IDLE: u8 = 0x26;
+pub const PM4_WAIT_REG_MEM: u8 = 0x3C;
+pub const PM4_REG_RMW: u8 = 0x21;
+pub const PM4_REG_TO_MEM: u8 = 0x3E;
+pub const PM4_MEM_WRITE: u8 = 0x3D;
+pub const PM4_COND_WRITE: u8 = 0x45;
+pub const PM4_EVENT_WRITE: u8 = 0x46;
+pub const PM4_EVENT_WRITE_SHD: u8 = 0x58;
+pub const PM4_EVENT_WRITE_EXT: u8 = 0x5A;
+pub const PM4_EVENT_WRITE_ZPD: u8 = 0x5B;
+pub const PM4_DRAW_INDX: u8 = 0x22;
+pub const PM4_DRAW_INDX_2: u8 = 0x36;
+pub const PM4_VIZ_QUERY: u8 = 0x23;
+pub const PM4_SET_CONSTANT: u8 = 0x2D;
+pub const PM4_SET_CONSTANT2: u8 = 0x55;
+pub const PM4_SET_SHADER_CONSTANTS: u8 = 0x56;
+pub const PM4_LOAD_ALU_CONSTANT: u8 = 0x2F;
+pub const PM4_IM_LOAD: u8 = 0x27;
+pub const PM4_IM_LOAD_IMMEDIATE: u8 = 0x2B;
+pub const PM4_LOAD_CONSTANT_CONTEXT: u8 = 0x2E;
+pub const PM4_INVALIDATE_STATE: u8 = 0x3B;
+pub const PM4_INTERRUPT: u8 = 0x54;
+pub const PM4_SET_SHADER_BASES: u8 = 0x4A;
+pub const PM4_SET_BIN_MASK_LO: u8 = 0x60;
+pub const PM4_SET_BIN_MASK_HI: u8 = 0x61;
+pub const PM4_SET_BIN_SELECT_LO: u8 = 0x62;
+pub const PM4_SET_BIN_SELECT_HI: u8 = 0x63;
+pub const PM4_SET_BIN_MASK: u8 = 0x50;
+pub const PM4_SET_BIN_SELECT: u8 = 0x51;
+pub const PM4_CONTEXT_UPDATE: u8 = 0x5E;
+/// Xenia-specific: `VdSwap` writes this to trigger a present.
+pub const PM4_XE_SWAP: u8 = 0x64;
+
+/// Human-readable name for a Type-3 opcode. Used for tracing spans.
+pub fn type3_opcode_name(op: u8) -> &'static str {
+    match op {
+        PM4_ME_INIT => "ME_INIT",
+        PM4_NOP => "NOP",
+        PM4_INDIRECT_BUFFER => "INDIRECT_BUFFER",
+        PM4_INDIRECT_BUFFER_PFD => "INDIRECT_BUFFER_PFD",
+        PM4_WAIT_FOR_IDLE => "WAIT_FOR_IDLE",
+        PM4_WAIT_REG_MEM => "WAIT_REG_MEM",
+        PM4_REG_RMW => "REG_RMW",
+        PM4_REG_TO_MEM => "REG_TO_MEM",
+        PM4_MEM_WRITE => "MEM_WRITE",
+        PM4_COND_WRITE => "COND_WRITE",
+        PM4_EVENT_WRITE => "EVENT_WRITE",
+        PM4_EVENT_WRITE_SHD => "EVENT_WRITE_SHD",
+        PM4_EVENT_WRITE_EXT => "EVENT_WRITE_EXT",
+        PM4_EVENT_WRITE_ZPD => "EVENT_WRITE_ZPD",
+        PM4_DRAW_INDX => "DRAW_INDX",
+        PM4_DRAW_INDX_2 => "DRAW_INDX_2",
+        PM4_VIZ_QUERY => "VIZ_QUERY",
+        PM4_SET_CONSTANT => "SET_CONSTANT",
+        PM4_SET_CONSTANT2 => "SET_CONSTANT2",
+        PM4_SET_SHADER_CONSTANTS => "SET_SHADER_CONSTANTS",
+        PM4_LOAD_ALU_CONSTANT => "LOAD_ALU_CONSTANT",
+        PM4_LOAD_CONSTANT_CONTEXT => "LOAD_CONSTANT_CONTEXT",
+        PM4_IM_LOAD => "IM_LOAD",
+        PM4_IM_LOAD_IMMEDIATE => "IM_LOAD_IMMEDIATE",
+        PM4_INVALIDATE_STATE => "INVALIDATE_STATE",
+        PM4_INTERRUPT => "INTERRUPT",
+        PM4_SET_SHADER_BASES => "SET_SHADER_BASES",
+        PM4_SET_BIN_MASK_LO => "SET_BIN_MASK_LO",
+        PM4_SET_BIN_MASK_HI => "SET_BIN_MASK_HI",
+        PM4_SET_BIN_SELECT_LO => "SET_BIN_SELECT_LO",
+        PM4_SET_BIN_SELECT_HI => "SET_BIN_SELECT_HI",
+        PM4_SET_BIN_MASK => "SET_BIN_MASK",
+        PM4_SET_BIN_SELECT => "SET_BIN_SELECT",
+        PM4_CONTEXT_UPDATE => "CONTEXT_UPDATE",
+        PM4_XE_SWAP => "XE_SWAP",
+        _ => "UNKNOWN",
+    }
+}
+
+/// Decoded single PM4 packet header.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub struct PacketHeader {
+    pub kind: PacketKind,
+    /// Total size of the packet (including header) in dwords.
+    pub total_dwords: u32,
+}
+
+/// Classification of a PM4 packet.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum PacketKind {
+    /// Type-0 register-write run. `base_index` is the first register index
+    /// (the register offset / 4). `write_one` is true if all `count` data
+    /// dwords write to the same register.
+    Type0 {
+        base_index: u32,
+        count: u32,
+        write_one: bool,
+    },
+    /// Type-1 two-register write.
+    Type1 { reg_index_1: u32, reg_index_2: u32 },
+    /// Type-2 NOP (a single skipped dword).
+    Type2,
+    /// Type-3 command.
+    Type3 {
+        opcode: u8,
+        count: u32,
+        predicated: bool,
+    },
+}
+
+/// Decode a single PM4 packet header.
+pub fn decode(header: u32) -> PacketHeader {
+    match header >> 30 {
+        0 => {
+            let count = ((header >> 16) & 0x3FFF) + 1;
+            PacketHeader {
+                kind: PacketKind::Type0 {
+                    base_index: header & 0x7FFF,
+                    count,
+                    write_one: (header >> 15) & 1 != 0,
+                },
+                total_dwords: 1 + count,
+            }
+        }
+        1 => PacketHeader {
+            kind: PacketKind::Type1 {
+                reg_index_1: header & 0x7FF,
+                reg_index_2: (header >> 11) & 0x7FF,
+            },
+            total_dwords: 3,
+        },
+        2 => PacketHeader {
+            kind: PacketKind::Type2,
+            total_dwords: 1,
+        },
+        3 => {
+            let count = ((header >> 16) & 0x3FFF) + 1;
+            PacketHeader {
+                kind: PacketKind::Type3 {
+                    opcode: ((header >> 8) & 0x7F) as u8,
+                    count,
+                    predicated: (header & 1) != 0,
+                },
+                total_dwords: 1 + count,
+            }
+        }
+        _ => unreachable!(),
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn type2_is_one_dword() {
+        // 0x80000000 == type 2 header (bits 31:30 = 10)
+        let hdr = decode(0x8000_0000);
+        assert_eq!(hdr.kind, PacketKind::Type2);
+        assert_eq!(hdr.total_dwords, 1);
+    }
+
+    #[test]
+    fn type0_count_is_inclusive() {
+        // count field (bits 29:16) = 5 → 6 data dwords. base_index = 0x100.
+        // write_one = 0.
+        let hdr = decode((5 << 16) | 0x100);
+        match hdr.kind {
+            PacketKind::Type0 {
+                base_index,
+                count,
+                write_one,
+            } => {
+                assert_eq!(base_index, 0x100);
+                assert_eq!(count, 6);
+                assert!(!write_one);
+            }
+            _ => panic!("expected Type0"),
+        }
+        assert_eq!(hdr.total_dwords, 7);
+    }
+
+    #[test]
+    fn type3_swap_packet() {
+        // Build the exact header canary's VdSwap emits:
+        //   MakePacketType3(PM4_XE_SWAP, 4) → ((3<<30) | ((4-1)<<16) | (0x64<<8))
+        let hdr_word = (3u32 << 30) | ((4u32 - 1) << 16) | ((PM4_XE_SWAP as u32) << 8);
+        let hdr = decode(hdr_word);
+        match hdr.kind {
+            PacketKind::Type3 {
+                opcode,
+                count,
+                predicated,
+            } => {
+                assert_eq!(opcode, PM4_XE_SWAP);
+                assert_eq!(count, 4);
+                assert!(!predicated);
+            }
+            _ => panic!("expected Type3"),
+        }
+        assert_eq!(hdr.total_dwords, 5);
+    }
+
+    #[test]
+    fn opcode_names_are_present_for_common_ops() {
+        assert_eq!(type3_opcode_name(PM4_NOP), "NOP");
+        assert_eq!(type3_opcode_name(PM4_DRAW_INDX), "DRAW_INDX");
+        assert_eq!(type3_opcode_name(PM4_XE_SWAP), "XE_SWAP");
+        assert_eq!(type3_opcode_name(PM4_WAIT_REG_MEM), "WAIT_REG_MEM");
+        assert_eq!(type3_opcode_name(0xFE), "UNKNOWN");
+    }
+}
diff --git a/crates/xenia-gpu/src/primitive.rs b/crates/xenia-gpu/src/primitive.rs
new file mode 100644
index 0000000..6bbafed
--- /dev/null
+++ b/crates/xenia-gpu/src/primitive.rs
@@ -0,0 +1,229 @@
+//! Primitive processor — normalize Xenos primitives into host-GPU forms.
+//!
+//! wgpu only exposes `PrimitiveTopology::{PointList, LineList, LineStrip,
+//! TriangleList, TriangleStrip}`. For everything else (fans, quads,
+//! rectangles) we rewrite indices on the CPU side so the host just sees a
+//! triangle list. Ground truth: `xenia-canary/src/xenia/gpu/primitive_processor.h/cc`.
+//!
+//! P3 scope: only the shapes Sylpheed's UI + early gameplay paths need
+//! (list, strip, fan). Rectangle + quad expansions are stubs logged via
+//! `tracing::warn!` for later.
+
+use crate::draw_state::{IndexSize, PrimitiveType};
+
+/// Host primitive topology — a subset of wgpu's that we commit to.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum HostTopology {
+    PointList,
+    LineList,
+    LineStrip,
+    TriangleList,
+    TriangleStrip,
+}
+
+/// Result of primitive processing.
+#[derive(Debug, Clone)]
+pub struct ProcessedPrimitive {
+    pub topology: HostTopology,
+    /// When the Xenos primitive needed client-side rewriting (fans, quads),
+    /// this buffer holds the rewritten 16-bit or 32-bit index sequence.
+    /// `None` means the input index buffer is usable as-is.
+    pub rewritten_indices: Option<Vec<u32>>,
+    /// Post-processing vertex count — equals the input count when indices
+    /// pass through unchanged.
+    pub host_vertex_count: u32,
+    /// `true` if we rejected the primitive (unsupported shape) and the
+    /// caller should skip this draw. Logged via `tracing::warn!`.
+    pub rejected: bool,
+}
+
+/// Normalize a draw.
+///
+/// `indices` is `None` for `AutoIndex` draws; otherwise it's the decoded
+/// index stream (already endian-converted / widened to u32 by the caller).
+pub fn process(
+    primitive: PrimitiveType,
+    vertex_count: u32,
+    indices: Option<&[u32]>,
+) -> ProcessedPrimitive {
+    match primitive {
+        PrimitiveType::PointList => pass_through(HostTopology::PointList, vertex_count),
+        PrimitiveType::LineList => pass_through(HostTopology::LineList, vertex_count),
+        PrimitiveType::LineStrip => pass_through(HostTopology::LineStrip, vertex_count),
+        PrimitiveType::TriangleList => pass_through(HostTopology::TriangleList, vertex_count),
+        PrimitiveType::TriangleStrip => pass_through(HostTopology::TriangleStrip, vertex_count),
+        PrimitiveType::TriangleFan => expand_fan(indices, vertex_count),
+        PrimitiveType::RectangleList => expand_rectangles(indices, vertex_count),
+        PrimitiveType::QuadList => expand_quads(indices, vertex_count),
+        PrimitiveType::None | PrimitiveType::Unknown(_) => {
+            tracing::warn!(?primitive, "gpu: rejecting unsupported primitive");
+            metrics::counter!("gpu.primitive.rejected").increment(1);
+            ProcessedPrimitive {
+                topology: HostTopology::TriangleList,
+                rewritten_indices: None,
+                host_vertex_count: 0,
+                rejected: true,
+            }
+        }
+    }
+}
+
+fn pass_through(topology: HostTopology, vertex_count: u32) -> ProcessedPrimitive {
+    ProcessedPrimitive {
+        topology,
+        rewritten_indices: None,
+        host_vertex_count: vertex_count,
+        rejected: false,
+    }
+}
+
+/// Convert a triangle fan to a triangle list. Fan indices `[0, 1, 2, 3, 4]`
+/// expand to triangles `(0,1,2), (0,2,3), (0,3,4)` — 3 × (n-2) host indices.
+fn expand_fan(indices: Option<&[u32]>, vertex_count: u32) -> ProcessedPrimitive {
+    if vertex_count < 3 {
+        return ProcessedPrimitive {
+            topology: HostTopology::TriangleList,
+            rewritten_indices: Some(Vec::new()),
+            host_vertex_count: 0,
+            rejected: false,
+        };
+    }
+    let mut out = Vec::with_capacity(3 * (vertex_count as usize - 2));
+    let get = |i: u32| -> u32 {
+        match indices {
+            Some(buf) => buf[i as usize],
+            None => i,
+        }
+    };
+    let apex = get(0);
+    for i in 1..vertex_count.saturating_sub(1) {
+        out.push(apex);
+        out.push(get(i));
+        out.push(get(i + 1));
+    }
+    let host_vertex_count = out.len() as u32;
+    ProcessedPrimitive {
+        topology: HostTopology::TriangleList,
+        rewritten_indices: Some(out),
+        host_vertex_count,
+        rejected: false,
+    }
+}
+
+/// Convert a quad list (groups of 4) to a triangle list (groups of 6).
+fn expand_quads(indices: Option<&[u32]>, vertex_count: u32) -> ProcessedPrimitive {
+    let quad_count = vertex_count / 4;
+    let mut out = Vec::with_capacity(6 * quad_count as usize);
+    let get = |i: u32| -> u32 {
+        match indices {
+            Some(buf) => buf[i as usize],
+            None => i,
+        }
+    };
+    for q in 0..quad_count {
+        let base = q * 4;
+        let a = get(base);
+        let b = get(base + 1);
+        let c = get(base + 2);
+        let d = get(base + 3);
+        out.extend_from_slice(&[a, b, c, a, c, d]);
+    }
+    let host_vertex_count = out.len() as u32;
+    ProcessedPrimitive {
+        topology: HostTopology::TriangleList,
+        rewritten_indices: Some(out),
+        host_vertex_count,
+        rejected: false,
+    }
+}
+
+/// Rectangle lists: a Xenos-specific primitive where each group of 3
+/// vertices defines a right-angle rectangle by its three non-repeated
+/// corners (the 4th is derived). The uber-shader doesn't support this yet;
+/// the ucode translator will emulate it as a geometry-stage fake. For P3
+/// we emit an empty draw.
+fn expand_rectangles(_indices: Option<&[u32]>, _vertex_count: u32) -> ProcessedPrimitive {
+    tracing::warn!("gpu: rectangle list primitive not yet implemented (P3 stub)");
+    metrics::counter!("gpu.primitive.rejected", "reason" => "rectangle_list").increment(1);
+    ProcessedPrimitive {
+        topology: HostTopology::TriangleList,
+        rewritten_indices: Some(Vec::new()),
+        host_vertex_count: 0,
+        rejected: true,
+    }
+}
+
+/// Widen a u16 index buffer to u32. The primitive processor normalizes to
+/// u32 so downstream wgpu pipeline descriptors stay simple.
+pub fn widen_indices(raw: &[u8], size: IndexSize, count: u32) -> Vec<u32> {
+    let mut out = Vec::with_capacity(count as usize);
+    match size {
+        IndexSize::Sixteen => {
+            for i in 0..count as usize {
+                let off = i * 2;
+                if off + 2 > raw.len() {
+                    break;
+                }
+                // Xenos indices are big-endian on the wire.
+                let be = u16::from_be_bytes([raw[off], raw[off + 1]]);
+                out.push(be as u32);
+            }
+        }
+        IndexSize::ThirtyTwo => {
+            for i in 0..count as usize {
+                let off = i * 4;
+                if off + 4 > raw.len() {
+                    break;
+                }
+                let be = u32::from_be_bytes([raw[off], raw[off + 1], raw[off + 2], raw[off + 3]]);
+                out.push(be);
+            }
+        }
+    }
+    out
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn triangle_list_passes_through() {
+        let p = process(PrimitiveType::TriangleList, 6, None);
+        assert_eq!(p.topology, HostTopology::TriangleList);
+        assert!(p.rewritten_indices.is_none());
+        assert_eq!(p.host_vertex_count, 6);
+        assert!(!p.rejected);
+    }
+
+    #[test]
+    fn fan_to_list_expands_correctly() {
+        // Fan of 5 vertices → triangles (0,1,2), (0,2,3), (0,3,4)
+        let p = process(PrimitiveType::TriangleFan, 5, None);
+        let idx = p.rewritten_indices.unwrap();
+        assert_eq!(idx, vec![0, 1, 2, 0, 2, 3, 0, 3, 4]);
+        assert_eq!(p.topology, HostTopology::TriangleList);
+        assert_eq!(p.host_vertex_count, 9);
+    }
+
+    #[test]
+    fn quad_list_expansion() {
+        let p = process(PrimitiveType::QuadList, 8, None);
+        let idx = p.rewritten_indices.unwrap();
+        assert_eq!(idx, vec![0, 1, 2, 0, 2, 3, 4, 5, 6, 4, 6, 7]);
+    }
+
+    #[test]
+    fn widen_u16_indices_big_endian() {
+        // 3 indices [1, 2, 0x1234] in BE u16.
+        let raw = [0, 1, 0, 2, 0x12, 0x34];
+        let out = widen_indices(&raw, IndexSize::Sixteen, 3);
+        assert_eq!(out, vec![1, 2, 0x1234]);
+    }
+
+    #[test]
+    fn rejects_unknown_primitive() {
+        let p = process(PrimitiveType::Unknown(0x2A), 3, None);
+        assert!(p.rejected);
+    }
+}
diff --git a/crates/xenia-gpu/src/render_target_cache.rs b/crates/xenia-gpu/src/render_target_cache.rs
new file mode 100644
index 0000000..43ee25d
--- /dev/null
+++ b/crates/xenia-gpu/src/render_target_cache.rs
@@ -0,0 +1,384 @@
+//! EDRAM tile book + render-target key bookkeeping.
+//!
+//! Mirrors `xenia-canary/src/xenia/gpu/render_target_cache.h` at the data-
+//! structure level. Xenos's 10 MiB EDRAM is divided into 2048 "tiles" of
+//! 80×16 samples each; render targets claim a contiguous range of those
+//! tiles based on `(base_tiles, pitch_tiles_at_32bpp, msaa_samples, format,
+//! is_depth)`. Two render targets with overlapping tile ranges share the
+//! underlying EDRAM — canary tracks this with per-tile "Host vs Shared"
+//! ownership, which is what this module's `TileOwner` captures.
+//!
+//! P4 ships the **bookkeeping**. Actual host texture allocation per key (so
+//! the host can draw into a wgpu texture matching the guest's RT) is left to
+//! a future host-side cache built on top of this module; the same for
+//! format-conversion compute shaders (the plan's P5 territory).
+
+use std::collections::HashMap;
+
+/// Number of EDRAM tiles on Xenos. Matches canary's `xenos::kEdramTileCount`.
+pub const EDRAM_TILE_COUNT: usize = 2048;
+
+/// MSAA sample count encoded into [`RenderTargetKey`]. Canary uses this as
+/// `xenos::MsaaSamples` (1×/2×/4×).
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub enum MsaaSamples {
+    X1 = 0,
+    X2 = 1,
+    X4 = 2,
+}
+
+impl MsaaSamples {
+    pub fn from_raw(raw: u32) -> Self {
+        match raw & 0x3 {
+            1 => MsaaSamples::X2,
+            2 => MsaaSamples::X4,
+            _ => MsaaSamples::X1,
+        }
+    }
+    pub fn count(self) -> u32 {
+        1u32 << (self as u32)
+    }
+}
+
+/// The packed EDRAM render-target identity. Bit layout matches
+/// `render_target_cache.h:251-321`'s `RenderTargetKey` union (26 bits used,
+/// stored as a single `u32` so it hashes cheaply). `pitch_tiles_at_32bpp`
+/// is always the 32bpp-equivalent pitch — 64bpp targets halve their tile
+/// pitch from the nominal tile grid (canary's `GetPitchTiles()` handles
+/// that).
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub struct RenderTargetKey {
+    pub base_tiles: u16,              // [0..2048)
+    pub pitch_tiles_at_32bpp: u16,    // 0..=256 in practice
+    pub msaa_samples: MsaaSamples,
+    pub is_depth: bool,
+    /// Color format: `xenos::ColorRenderTargetFormat` when !is_depth.
+    /// Depth format: `xenos::DepthRenderTargetFormat` when is_depth.
+    pub resource_format: u8,          // 4 bits
+}
+
+impl RenderTargetKey {
+    /// Pack into canary's 26-bit layout. Useful for compact storage /
+    /// hashing when we add a LRU cache later on.
+    pub fn pack(&self) -> u32 {
+        (self.base_tiles as u32 & 0x7FF)
+            | (((self.pitch_tiles_at_32bpp as u32) & 0xFF) << 11)
+            | (((self.msaa_samples as u32) & 0x3) << 19)
+            | ((self.is_depth as u32) << 21)
+            | (((self.resource_format as u32) & 0xF) << 22)
+    }
+
+    pub fn unpack(raw: u32) -> Self {
+        Self {
+            base_tiles: (raw & 0x7FF) as u16,
+            pitch_tiles_at_32bpp: ((raw >> 11) & 0xFF) as u16,
+            msaa_samples: MsaaSamples::from_raw((raw >> 19) & 0x3),
+            is_depth: ((raw >> 21) & 1) != 0,
+            resource_format: ((raw >> 22) & 0xF) as u8,
+        }
+    }
+
+    /// How many EDRAM tiles the whole surface occupies (rough estimate; a
+    /// real height-aware calc needs viewport info). We conservatively use
+    /// `pitch_tiles_at_32bpp * 1` until a draw tells us otherwise; callers
+    /// that know the height can call [`tile_footprint_with_height`].
+    pub fn tile_pitch(&self) -> u16 {
+        // 64bpp formats pack two 32bpp tiles into one 64bpp tile.
+        if self.is_64bpp() {
+            self.pitch_tiles_at_32bpp / 2
+        } else {
+            self.pitch_tiles_at_32bpp
+        }
+    }
+
+    pub fn is_64bpp(&self) -> bool {
+        if self.is_depth {
+            false
+        } else {
+            // Canary: `ColorRenderTargetFormat::{k_16_16_16_16,
+            // k_16_16_16_16_FLOAT, k_32_32_FLOAT}` are 64bpp; indices 4, 5, 7
+            // in the enum. (Kept narrow because the enum is 4 bits wide.)
+            matches!(self.resource_format, 4 | 5 | 7)
+        }
+    }
+
+    /// Tiles claimed by this RT if its surface height is `rows_of_tiles`
+    /// (i.e. `ceil(height_in_samples / 16)`).
+    pub fn tile_footprint_with_height(&self, rows_of_tiles: u16) -> u16 {
+        self.tile_pitch().saturating_mul(rows_of_tiles)
+    }
+}
+
+/// Who currently owns a tile of EDRAM.
+///
+/// `None`: untouched; free to claim.
+/// `Host(idx)`: a single RT has exclusive ownership.
+/// `Shared(idx)`: two+ RT keys map to the same tile (usually after a
+/// format change without an intervening clear); the named RT is the most
+/// recent owner whose format should be honored for readback.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+#[derive(Default)]
+pub enum TileOwner {
+    #[default]
+    None,
+    Host(u32),
+    Shared(u32),
+}
+
+
+/// Bookkeeping across the 2048 EDRAM tiles. Not a GPU resource by itself —
+/// tracks which render target (by index) currently owns each tile.
+pub struct EdramTileBook {
+    tiles: Vec<TileOwner>,
+}
+
+impl Default for EdramTileBook {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl EdramTileBook {
+    pub fn new() -> Self {
+        Self {
+            tiles: vec![TileOwner::None; EDRAM_TILE_COUNT],
+        }
+    }
+
+    pub fn who_owns(&self, tile: u16) -> TileOwner {
+        self.tiles
+            .get(tile as usize)
+            .copied()
+            .unwrap_or(TileOwner::None)
+    }
+
+    /// Mark `[base, base+count)` as owned by `rt_idx`. Pre-existing owners
+    /// in the range are demoted to `Shared` (format reinterpretation).
+    /// Returns the number of tiles newly claimed (not previously the same
+    /// owner).
+    pub fn claim(&mut self, base: u16, count: u16, rt_idx: u32) -> u32 {
+        let mut newly_claimed = 0u32;
+        for i in 0..(count as usize) {
+            let t = base as usize + i;
+            if t >= self.tiles.len() {
+                break;
+            }
+            let prev = self.tiles[t];
+            let already_ours = matches!(
+                prev,
+                TileOwner::Host(idx) | TileOwner::Shared(idx) if idx == rt_idx
+            );
+            match prev {
+                TileOwner::None => {
+                    self.tiles[t] = TileOwner::Host(rt_idx);
+                }
+                TileOwner::Host(idx) if idx == rt_idx => {
+                    // re-claim of same RT — no-op
+                }
+                _ => {
+                    // Format change / shared range.
+                    self.tiles[t] = TileOwner::Shared(rt_idx);
+                }
+            }
+            if !already_ours {
+                newly_claimed += 1;
+            }
+        }
+        newly_claimed
+    }
+
+    /// Drop `rt_idx` from any tile it owns; tiles revert to `None` unless
+    /// they were `Shared(rt_idx)` (in which case they also revert to
+    /// `None`; the other sharer's ownership is lost — `release` is a
+    /// coarse "this RT is gone" operation).
+    pub fn release(&mut self, rt_idx: u32) {
+        for t in self.tiles.iter_mut() {
+            if matches!(
+                *t,
+                TileOwner::Host(idx) | TileOwner::Shared(idx) if idx == rt_idx
+            ) {
+                *t = TileOwner::None;
+            }
+        }
+    }
+
+    /// Count tiles currently assigned to any RT (Host or Shared).
+    pub fn occupied_count(&self) -> u32 {
+        self.tiles
+            .iter()
+            .filter(|o| !matches!(o, TileOwner::None))
+            .count() as u32
+    }
+}
+
+/// Minimal per-RT descriptor stored alongside the tile book. P5's texture
+/// cache will expand this with the actual wgpu texture handle.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub struct RtDescriptor {
+    pub key: RenderTargetKey,
+    /// Number of times this key has been bound since creation. Rough
+    /// proxy for activity / hot-RT identification.
+    pub bind_count: u32,
+    /// Draw index on first bind — handy for debugging divergence.
+    pub first_draw_index: u32,
+}
+
+/// Top-level cache: maps packed keys to small descriptors + the tile book.
+pub struct RenderTargetCache {
+    next_idx: u32,
+    by_key: HashMap<u32, u32>,
+    descriptors: HashMap<u32, RtDescriptor>,
+    pub tiles: EdramTileBook,
+}
+
+impl Default for RenderTargetCache {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl RenderTargetCache {
+    pub fn new() -> Self {
+        Self {
+            next_idx: 0,
+            by_key: HashMap::new(),
+            descriptors: HashMap::new(),
+            tiles: EdramTileBook::new(),
+        }
+    }
+
+    /// Look up or allocate an RT descriptor for `key`. `draw_index` is the
+    /// current monotonic draw counter — recorded on first insert for
+    /// provenance.
+    pub fn bind(&mut self, key: RenderTargetKey, draw_index: u32) -> u32 {
+        let packed = key.pack();
+        if let Some(&idx) = self.by_key.get(&packed) {
+            if let Some(d) = self.descriptors.get_mut(&idx) {
+                d.bind_count += 1;
+            }
+            return idx;
+        }
+        let idx = self.next_idx;
+        self.next_idx += 1;
+        self.by_key.insert(packed, idx);
+        self.descriptors.insert(
+            idx,
+            RtDescriptor {
+                key,
+                bind_count: 1,
+                first_draw_index: draw_index,
+            },
+        );
+        idx
+    }
+
+    pub fn descriptor(&self, idx: u32) -> Option<&RtDescriptor> {
+        self.descriptors.get(&idx)
+    }
+
+    pub fn len(&self) -> usize {
+        self.descriptors.len()
+    }
+
+    pub fn is_empty(&self) -> bool {
+        self.descriptors.is_empty()
+    }
+
+    /// Claim tiles for the descriptor at `rt_idx`. `height_tiles` is
+    /// `ceil(viewport_height_samples / 16)` — callers supply it because
+    /// the key itself doesn't carry height.
+    pub fn claim_tiles(&mut self, rt_idx: u32, height_tiles: u16) -> u32 {
+        if let Some(d) = self.descriptors.get(&rt_idx) {
+            let footprint = d.key.tile_footprint_with_height(height_tiles);
+            self.tiles.claim(d.key.base_tiles, footprint, rt_idx)
+        } else {
+            0
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn render_target_key_pack_roundtrip() {
+        let k = RenderTargetKey {
+            base_tiles: 1600,
+            pitch_tiles_at_32bpp: 80,
+            msaa_samples: MsaaSamples::X4,
+            is_depth: true,
+            resource_format: 0b1010,
+        };
+        let packed = k.pack();
+        let round = RenderTargetKey::unpack(packed);
+        assert_eq!(round, k);
+    }
+
+    #[test]
+    fn tile_book_claim_marks_owners() {
+        let mut book = EdramTileBook::new();
+        assert_eq!(book.occupied_count(), 0);
+        let new_count = book.claim(100, 10, 42);
+        assert_eq!(new_count, 10);
+        assert_eq!(book.who_owns(100), TileOwner::Host(42));
+        assert_eq!(book.who_owns(109), TileOwner::Host(42));
+        assert_eq!(book.who_owns(110), TileOwner::None);
+    }
+
+    #[test]
+    fn tile_book_claim_demotes_to_shared() {
+        let mut book = EdramTileBook::new();
+        book.claim(100, 10, 1);
+        book.claim(105, 10, 2);
+        // Overlap: tiles 105..110 should be Shared(2); 100..105 stay Host(1);
+        // tiles 110..115 are fresh Host(2).
+        assert_eq!(book.who_owns(104), TileOwner::Host(1));
+        assert_eq!(book.who_owns(105), TileOwner::Shared(2));
+        assert_eq!(book.who_owns(110), TileOwner::Host(2));
+    }
+
+    #[test]
+    fn tile_book_release_frees_all() {
+        let mut book = EdramTileBook::new();
+        book.claim(0, 50, 7);
+        book.release(7);
+        assert_eq!(book.occupied_count(), 0);
+    }
+
+    #[test]
+    fn rt_cache_bind_is_idempotent_by_key() {
+        let mut cache = RenderTargetCache::new();
+        let k = RenderTargetKey {
+            base_tiles: 0,
+            pitch_tiles_at_32bpp: 80,
+            msaa_samples: MsaaSamples::X1,
+            is_depth: false,
+            resource_format: 0,
+        };
+        let a = cache.bind(k, 0);
+        let b = cache.bind(k, 1);
+        assert_eq!(a, b);
+        let d = cache.descriptor(a).unwrap();
+        assert_eq!(d.bind_count, 2);
+        assert_eq!(d.first_draw_index, 0);
+    }
+
+    #[test]
+    fn rt_cache_claim_tiles_tracks_footprint() {
+        let mut cache = RenderTargetCache::new();
+        let k = RenderTargetKey {
+            base_tiles: 0,
+            pitch_tiles_at_32bpp: 80, // 32bpp 1280-wide target
+            msaa_samples: MsaaSamples::X1,
+            is_depth: false,
+            resource_format: 0,
+        };
+        let idx = cache.bind(k, 0);
+        // 720 samples tall / 16 per tile = 45 rows → 80 * 45 = 3600 tiles;
+        // caps out at 2048. Verify clamping.
+        let newly = cache.claim_tiles(idx, 45);
+        assert_eq!(newly, 2048);
+        assert_eq!(cache.tiles.occupied_count(), 2048);
+    }
+}
diff --git a/crates/xenia-gpu/src/resolve.rs b/crates/xenia-gpu/src/resolve.rs
new file mode 100644
index 0000000..e81bfe0
--- /dev/null
+++ b/crates/xenia-gpu/src/resolve.rs
@@ -0,0 +1,1260 @@
+//! EDRAM→guest-memory resolve byte copy.
+//!
+//! Fires from [`crate::gpu_system::GpuSystem::handle_event_initiator`] on
+//! `TILE_FLUSH` (event 15). Reads samples out of the shadow EDRAM at the
+//! source tile range, applies the `Endian128` byte swap, and writes tiled
+//! u32 samples into guest memory via a 32bpp bitwise-equivalent fast path
+//! (Canary `IsColorResolveFormatBitwiseEquivalent` — `xenos.h:614-639`).
+//!
+//! Ground truth: `xenia-canary/src/xenia/gpu/draw_util.cc:1102-1370` and
+//! `xenos.h:1077-1114` (`GpuSwapInline`), `1039-1052` (`CopySampleSelect`).
+//!
+//! ## Endian ordering
+//!
+//! [`xenia_memory::access::MemoryAccess::write_u32`] stores big-endian
+//! bytes (it calls `val.to_be_bytes()`). The Xenon CPU sees memory as big-
+//! endian u32s, so `write_u32(addr, 0x11223344)` lands `[0x11, 0x22, 0x33,
+//! 0x44]` in memory — which is the `kNone` (no swap) byte order from the
+//! host's view of the sample.
+//!
+//! The resolve has an `Endian128` mode controlled by
+//! `RB_COPY_DEST_INFO.copy_dest_endian`: games typically set `k8in32` so
+//! that later texture fetches see little-endian bytes. We therefore
+//! pre-swap the sample *before* `write_u32` so the big-endian store yields
+//! the desired byte order in memory.
+
+use crate::draw_state::{ResolveInfo, ResolveSource};
+use crate::edram::ShadowEdram;
+use crate::render_target_cache::MsaaSamples;
+use crate::tiled_address::{align_pitch_to_macro_tile, tiled_2d_offset};
+
+use xenia_memory::access::MemoryAccess;
+
+/// Stats returned from one resolve copy. Aggregated by the caller into
+/// `GpuStats` counters so the HUD can surface them.
+#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
+pub struct ResolveCopyStats {
+    /// Number of 32bpp samples actually written to guest memory.
+    pub samples_written: u32,
+    /// Was the format path supported? `false` means we skipped.
+    pub supported: bool,
+}
+
+/// `xenos::CopyCommand::kNull` = 3 — resolve emits no copy (clear-only).
+pub const COPY_COMMAND_NULL: u8 = 3;
+
+/// Sanitized sample selector (`xenos::CopySampleSelect`, `xenos.h:1039`).
+/// We keep the *raw* enum value in `ResolveInfo` and pass a sanitized one
+/// here so callers can match on the effective mode rather than re-applying
+/// the MSAA/depth sanitation rules from Canary `draw_util.cc:839-876`.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum CopySampleSelect {
+    K0 = 0,
+    K1 = 1,
+    K2 = 2,
+    K3 = 3,
+    K01 = 4,
+    K23 = 5,
+    K0123 = 6,
+}
+
+impl CopySampleSelect {
+    pub fn from_raw(raw: u8) -> Self {
+        match raw & 0x7 {
+            1 => Self::K1,
+            2 => Self::K2,
+            3 => Self::K3,
+            4 => Self::K01,
+            5 => Self::K23,
+            6 | 7 => Self::K0123,
+            _ => Self::K0,
+        }
+    }
+
+    /// Single-sample picks return `Some(index 0..=3)`; averaging picks
+    /// return `None` (caller must synthesize via per-sample reads).
+    pub fn single_sample_index(self) -> Option<u8> {
+        match self {
+            Self::K0 => Some(0),
+            Self::K1 => Some(1),
+            Self::K2 => Some(2),
+            Self::K3 => Some(3),
+            _ => None,
+        }
+    }
+
+    /// `IsSingleCopySampleSelected` from `xenos.h:1049`.
+    pub fn is_single_sample(self) -> bool {
+        self.single_sample_index().is_some()
+    }
+}
+
+/// `SanitizeCopySampleSelect` (Canary `draw_util.cc:839-876`). MSAA
+/// modes + depth limit which sample selectors are valid; invalid ones
+/// are silently remapped. Returning the sanitized enum lets the resolve
+/// loop assume a single-sample pick for 1x MSAA, etc.
+pub fn sanitize_sample_select(
+    raw: u8,
+    msaa: MsaaSamples,
+    is_depth: bool,
+) -> CopySampleSelect {
+    let select = CopySampleSelect::from_raw(raw);
+    match msaa {
+        MsaaSamples::X1 => {
+            // Only sample 0 exists. Averaging modes → k0; >k0123 clamp.
+            match select {
+                CopySampleSelect::K0 => CopySampleSelect::K0,
+                _ => CopySampleSelect::K0,
+            }
+        }
+        MsaaSamples::X2 => {
+            // Samples 0 and 1 exist (stacked vertically). k2 → k0, k3 → k1;
+            // k23 → k01. Depth cannot average.
+            match select {
+                CopySampleSelect::K0 => CopySampleSelect::K0,
+                CopySampleSelect::K1 => CopySampleSelect::K1,
+                CopySampleSelect::K2 => CopySampleSelect::K0,
+                CopySampleSelect::K3 => CopySampleSelect::K1,
+                CopySampleSelect::K01 | CopySampleSelect::K23 | CopySampleSelect::K0123 => {
+                    if is_depth {
+                        CopySampleSelect::K0
+                    } else {
+                        CopySampleSelect::K01
+                    }
+                }
+            }
+        }
+        MsaaSamples::X4 => {
+            // All single-samples valid. Depth cannot average → pick
+            // representative single sample (k01→k0, k23→k2, k0123→k0).
+            if is_depth {
+                match select {
+                    CopySampleSelect::K01 => CopySampleSelect::K0,
+                    CopySampleSelect::K23 => CopySampleSelect::K2,
+                    CopySampleSelect::K0123 => CopySampleSelect::K0,
+                    other => other,
+                }
+            } else {
+                select
+            }
+        }
+    }
+}
+
+/// Sample-index to in-pixel (dx, dy) offset for the current MSAA mode.
+/// Matches the standard Xbox 360 MSAA sample layout (Canary
+/// `texture_util::GetMsaaSampleLocation` / the shader constants). For 1x,
+/// always `(0, 0)`.
+///
+/// * 2x MSAA: sample 0 = top line, sample 1 = bottom line.
+/// * 4x MSAA: 2×2 grid `{(0,0),(1,0),(0,1),(1,1)}`.
+#[inline]
+fn sample_offset_in_pixel(sample_idx: u8, msaa: MsaaSamples) -> (u32, u32) {
+    match msaa {
+        MsaaSamples::X1 => (0, 0),
+        MsaaSamples::X2 => (0, (sample_idx & 1) as u32),
+        MsaaSamples::X4 => ((sample_idx & 1) as u32, ((sample_idx >> 1) & 1) as u32),
+    }
+}
+
+/// Apply the `Endian128` byte swap to one 32-bit sample. Matches the cases
+/// inside `GpuSwapInline` plus the 64/128-bit variants from
+/// `xenos::Endian128`. The 64/128 modes cannot be expressed in a single u32
+/// so they fall through to `k8in32` and log at the call site.
+#[inline]
+pub fn apply_endian_128(value: u32, endian: u8) -> u32 {
+    match endian {
+        0 => value,
+        // k8in16: swap bytes within each 16-bit word.
+        1 => ((value & 0xFF00FF00) >> 8) | ((value & 0x00FF00FF) << 8),
+        // k8in32: full byte reversal.
+        2 => value.swap_bytes(),
+        // k16in32: swap 16-bit halves.
+        3 => value.rotate_left(16),
+        // k8in64 / k8in128: require cross-dword context. Approximate with
+        // k8in32 (byte-reverse each dword) so the bytes land in a sensible
+        // order; the caller logs the approximation.
+        4 | 5 => value.swap_bytes(),
+        _ => value,
+    }
+}
+
+/// `xenos::ColorFormat` values we use as destination formats for 32bpp
+/// resolves. Canary `xenos.h:582-609`.
+mod color_format {
+    pub const K_8_8_8_8: u8 = 6;
+    pub const K_2_10_10_10: u8 = 7;
+    pub const K_8_8_8_8_A: u8 = 14;
+    pub const K_16_16_FLOAT: u8 = 31;
+    pub const K_32_FLOAT: u8 = 36;
+    pub const K_8_8_8_8_AS_16_16_16_16: u8 = 50;
+    pub const K_2_10_10_10_AS_16_16_16_16: u8 = 54;
+    // ── 64bpp dest formats (Canary `xenos.h:582-609`) ──────────────────
+    /// `k_16_16_16_16` (4 channels × 16 bits, signed/unsigned variants
+    /// resolve identically — same bit layout).
+    pub const K_16_16_16_16: u8 = 26;
+    /// `k_16_16_16_16_FLOAT` (4 channels × half-float).
+    pub const K_16_16_16_16_FLOAT: u8 = 32;
+    /// `k_32_32_FLOAT` (R32 + G32, 64bpp). `xenos::TextureFormat = 37`.
+    pub const K_32_32_FLOAT: u8 = 37;
+    /// Depth textures (Canary `xenos::TextureFormat`).
+    pub const K_24_8: u8 = 22;
+    pub const K_24_8_FLOAT: u8 = 23;
+}
+
+/// 32-bit bitwise-equivalence check covering 32bpp color and depth resolves.
+/// Color side mirrors `xenos::IsColorResolveFormatBitwiseEquivalent`
+/// (`xenos.h:614-639`). Depth side maps `DepthRenderTargetFormat` to
+/// its textural form (`kD24S8 → k_24_8`, `kD24FS8 → k_24_8_FLOAT`).
+pub fn is_32bpp_bitwise_equivalent(
+    source: ResolveSource,
+    source_is_64bpp: bool,
+    source_format: u8,
+    dest_format: u8,
+) -> bool {
+    if source_is_64bpp {
+        return false;
+    }
+    match source {
+        ResolveSource::Color(_) => {
+            use color_format as cf;
+            match source_format {
+                // k_8_8_8_8 (0) and k_8_8_8_8_GAMMA (1). Gamma decode is
+                // applied by the sampler at texture-fetch time (TextureSign::
+                // kGamma); the bits are identical, so the copy path is the
+                // same.
+                0 | 1 => matches!(
+                    dest_format,
+                    cf::K_8_8_8_8 | cf::K_8_8_8_8_A | cf::K_8_8_8_8_AS_16_16_16_16
+                ),
+                // k_2_10_10_10 (2) and k_2_10_10_10_AS_10_10_10_10 (10).
+                2 | 10 => matches!(
+                    dest_format,
+                    cf::K_2_10_10_10 | cf::K_2_10_10_10_AS_16_16_16_16
+                ),
+                // k_16_16_FLOAT (6).
+                6 => dest_format == cf::K_16_16_FLOAT,
+                // k_32_FLOAT (14).
+                14 => dest_format == cf::K_32_FLOAT,
+                _ => false,
+            }
+        }
+        ResolveSource::Depth => match source_format {
+            // kD24S8 (0) → k_24_8 (22).
+            0 => dest_format == color_format::K_24_8,
+            // kD24FS8 (1) → k_24_8_FLOAT (23).
+            1 => dest_format == color_format::K_24_8_FLOAT,
+            _ => false,
+        },
+    }
+}
+
+/// 64-bit bitwise-equivalence check (Canary `xenos.h:614-639` 64bpp arms).
+/// Used when `info.source_is_64bpp == true`. Only color resolves go here —
+/// depth is always 32bpp.
+pub fn is_64bpp_bitwise_equivalent(source_format: u8, dest_format: u8) -> bool {
+    use color_format as cf;
+    match source_format {
+        // k_16_16_16_16 (5) — signed and unsigned variants resolve to the
+        // same bits because the resolve is a raw byte copy.
+        5 => dest_format == cf::K_16_16_16_16,
+        // k_16_16_16_16_FLOAT (7).
+        7 => dest_format == cf::K_16_16_16_16_FLOAT,
+        // k_32_32_FLOAT (15).
+        15 => dest_format == cf::K_32_32_FLOAT,
+        _ => false,
+    }
+}
+
+/// Run one resolve copy. Returns the number of samples successfully
+/// written and whether the dest format was supported; the caller updates
+/// `GpuStats::resolves_copied_total` / `resolves_skipped_total` accordingly.
+pub fn copy_to_memory(
+    info: &ResolveInfo,
+    edram: &ShadowEdram,
+    mem: &dyn MemoryAccess,
+) -> ResolveCopyStats {
+    // --- No-op paths (not a failure) ---
+    if info.coords.width == 0 || info.coords.height == 0 {
+        return ResolveCopyStats {
+            samples_written: 0,
+            supported: true,
+        };
+    }
+    if info.copy_command == COPY_COMMAND_NULL {
+        return ResolveCopyStats {
+            samples_written: 0,
+            supported: true,
+        };
+    }
+
+    // --- Supported-shape gates ---
+    if info.copy_dest_array {
+        tracing::warn!(
+            src = info.copy_src_select,
+            fmt = info.dest_format,
+            "gpu: resolve skipped — copy_dest_array (3D/stacked) not implemented"
+        );
+        return ResolveCopyStats::default();
+    }
+    if info.dest_exp_bias != 0 {
+        tracing::warn!(
+            bias = info.dest_exp_bias,
+            "gpu: resolve skipped — dest_exp_bias != 0 not implemented"
+        );
+        return ResolveCopyStats::default();
+    }
+    let supported = if info.source_is_64bpp {
+        // 64bpp color resolve. Depth is always 32bpp so this only fires
+        // for `ResolveSource::Color(_)`.
+        matches!(info.source, ResolveSource::Color(_))
+            && is_64bpp_bitwise_equivalent(info.source_format, info.dest_format)
+    } else {
+        is_32bpp_bitwise_equivalent(
+            info.source,
+            info.source_is_64bpp,
+            info.source_format,
+            info.dest_format,
+        )
+    };
+    if !supported {
+        tracing::warn!(
+            source = ?info.source,
+            source_format = info.source_format,
+            source_is_64bpp = info.source_is_64bpp,
+            dest_format = info.dest_format,
+            "gpu: resolve skipped — not a bitwise-equivalent pair"
+        );
+        return ResolveCopyStats::default();
+    }
+
+    if info.dest_endian >= 4 {
+        tracing::warn!(
+            endian = info.dest_endian,
+            "gpu: resolve endian k8in64/k8in128 approximated as k8in32"
+        );
+    }
+
+    // Destination pitch must be aligned to 32 texels per
+    // `kStoragePitchHeightAlignmentBlocks`. `align_pitch_to_macro_tile`
+    // rounds to 32 (it's `MACRO_TILE_WIDTH_LOG2 = 5`).
+    let pitch_aligned = align_pitch_to_macro_tile(info.dest_pitch_pixels);
+    if pitch_aligned == 0 {
+        return ResolveCopyStats {
+            samples_written: 0,
+            supported: true,
+        };
+    }
+    // bpp_log2: 2 for 32bpp, 3 for 64bpp. Drives the `tiled_2d_offset`
+    // stride calculation per Canary `texture_address.h:120-180`.
+    let bpp_log2: u32 = if info.source_is_64bpp { 3 } else { 2 };
+
+    let is_depth = matches!(info.source, ResolveSource::Depth);
+    let sanitized = sanitize_sample_select(info.copy_sample_select, info.msaa, is_depth);
+    // For averaging modes we'd previously fall back to sample 0 + warn.
+    // 3A wires real averaging via `read_pixel_averaged`; single-sample
+    // picks still take the fast path.
+    let single_sample_idx = sanitized.single_sample_index();
+
+    let mut samples_written: u32 = 0;
+    for dy in 0..info.coords.height {
+        let pixel_y = info.coords.y0 + dy;
+        for dx in 0..info.coords.width {
+            let pixel_x = info.coords.x0 + dx;
+            // Destination coordinates are 0-based against `dest_base` — the
+            // base already points at the top-left of the copy rectangle.
+            let dst_off = tiled_2d_offset(dx, dy, pitch_aligned, bpp_log2);
+            let dst_addr = info.dest_base.wrapping_add(dst_off);
+
+            if info.source_is_64bpp {
+                let (lo, hi) = match single_sample_idx {
+                    Some(idx) => {
+                        let (sx, sy) = sample_xy(pixel_x, pixel_y, idx, info.msaa, &info.coords);
+                        edram.read_sample_64bpp(
+                            info.source_base_tiles,
+                            info.surface_pitch_tiles,
+                            sx,
+                            sy,
+                        )
+                    }
+                    None => read_pixel_averaged_64bpp(edram, info, sanitized, pixel_x, pixel_y),
+                };
+                let lo_swapped = apply_endian_128(lo, info.dest_endian);
+                let hi_swapped = apply_endian_128(hi, info.dest_endian);
+                mem.write_u32(dst_addr, lo_swapped);
+                mem.write_u32(dst_addr.wrapping_add(4), hi_swapped);
+                samples_written += 1;
+            } else {
+                let sample = match single_sample_idx {
+                    Some(idx) => {
+                        let (sx, sy) = sample_xy(pixel_x, pixel_y, idx, info.msaa, &info.coords);
+                        edram.read_sample_32bpp(
+                            info.source_base_tiles,
+                            info.surface_pitch_tiles,
+                            sx,
+                            sy,
+                        )
+                    }
+                    None => read_pixel_averaged_32bpp(
+                        edram,
+                        info,
+                        sanitized,
+                        pixel_x,
+                        pixel_y,
+                    ),
+                };
+                let swapped = apply_endian_128(sample, info.dest_endian);
+                mem.write_u32(dst_addr, swapped);
+                samples_written += 1;
+            }
+        }
+    }
+
+    ResolveCopyStats {
+        samples_written,
+        supported: true,
+    }
+}
+
+/// Compute the EDRAM sample-space (x, y) for `(pixel_x, pixel_y)` and a
+/// single MSAA sample index.
+#[inline]
+fn sample_xy(
+    pixel_x: u32,
+    pixel_y: u32,
+    sample_idx: u8,
+    msaa: MsaaSamples,
+    coords: &crate::draw_state::ResolveCoordinates,
+) -> (u32, u32) {
+    let (sample_dx, sample_dy) = sample_offset_in_pixel(sample_idx, msaa);
+    let sx = (pixel_x << coords.sample_count_log2_x) + sample_dx;
+    let sy = (pixel_y << coords.sample_count_log2_y) + sample_dy;
+    (sx, sy)
+}
+
+/// Sample indices selected by an averaging `CopySampleSelect`.
+/// `K01 → [0, 1]`, `K23 → [2, 3]`, `K0123 → [0, 1, 2, 3]`. Single-sample
+/// picks should never reach this helper (caller checks `single_sample_index`).
+fn averaging_sample_set(select: CopySampleSelect) -> &'static [u8] {
+    match select {
+        CopySampleSelect::K01 => &[0, 1],
+        CopySampleSelect::K23 => &[2, 3],
+        CopySampleSelect::K0123 => &[0, 1, 2, 3],
+        // Single-sample picks: caller must never invoke this — fall back
+        // to sample 0 just to keep the function total.
+        _ => &[0],
+    }
+}
+
+/// Average N samples of a 32bpp pixel format. Each sample is read, decoded
+/// by `source_format`, averaged in the appropriate numeric space, then
+/// re-encoded back into the same 32bpp word. Mirrors Canary's resolve
+/// shader paths in `resolve.xesli:595-629` (per-format averaging) — we
+/// implement them on the CPU because the resolve runs on the host.
+fn read_pixel_averaged_32bpp(
+    edram: &ShadowEdram,
+    info: &ResolveInfo,
+    select: CopySampleSelect,
+    pixel_x: u32,
+    pixel_y: u32,
+) -> u32 {
+    let indices = averaging_sample_set(select);
+    let n = indices.len() as u32;
+    if n == 0 {
+        return 0;
+    }
+    // Pull every selected sample.
+    let mut raw = [0u32; 4];
+    for (i, &idx) in indices.iter().enumerate() {
+        let (sx, sy) = sample_xy(pixel_x, pixel_y, idx, info.msaa, &info.coords);
+        raw[i] = edram.read_sample_32bpp(info.source_base_tiles, info.surface_pitch_tiles, sx, sy);
+    }
+    let raw_slice = &raw[..indices.len()];
+    average_samples_32bpp(raw_slice, info.source_format, info.source)
+}
+
+/// Average N samples of a 64bpp pixel format, returning `(lo, hi)`.
+fn read_pixel_averaged_64bpp(
+    edram: &ShadowEdram,
+    info: &ResolveInfo,
+    select: CopySampleSelect,
+    pixel_x: u32,
+    pixel_y: u32,
+) -> (u32, u32) {
+    let indices = averaging_sample_set(select);
+    let n = indices.len();
+    if n == 0 {
+        return (0, 0);
+    }
+    let mut raw = [(0u32, 0u32); 4];
+    for (i, &idx) in indices.iter().enumerate() {
+        let (sx, sy) = sample_xy(pixel_x, pixel_y, idx, info.msaa, &info.coords);
+        raw[i] = edram.read_sample_64bpp(info.source_base_tiles, info.surface_pitch_tiles, sx, sy);
+    }
+    let raw_slice = &raw[..n];
+    average_samples_64bpp(raw_slice, info.source_format)
+}
+
+/// Per-format averaging for 32bpp color/depth resolves.
+fn average_samples_32bpp(samples: &[u32], source_format: u8, source: ResolveSource) -> u32 {
+    let n = samples.len() as u32;
+    debug_assert!(n > 0);
+    match source {
+        ResolveSource::Color(_) => match source_format {
+            // k_8_8_8_8 / k_8_8_8_8_GAMMA (0/1): per-channel rounded
+            // unsigned-int mean. Matches Canary's `resolve.xesli` per-component
+            // average for u8 — gamma is a sampler-time post-decode, the
+            // bits are identical for resolve purposes.
+            0 | 1 => average_8_8_8_8(samples, n),
+            // k_2_10_10_10 / k_2_10_10_10_AS_10_10_10_10: per-field rounded
+            // unsigned-int mean. Field widths 2/10/10/10 from low to high.
+            2 | 10 => average_2_10_10_10(samples, n),
+            // k_16_16_FLOAT (6): two half-floats packed in one u32.
+            6 => average_2_half_floats(samples, n),
+            // k_32_FLOAT (14): one f32 per sample.
+            14 => average_1_f32(samples, n),
+            // For any unsupported format, fall back to first sample —
+            // upstream gating already filtered to bitwise-equivalent pairs
+            // so this branch should be unreachable in practice.
+            _ => samples[0],
+        },
+        // Depth resolves never carry MSAA averaging (sanitize collapses to
+        // single-sample); reaching this branch is a degenerate caller.
+        ResolveSource::Depth => samples[0],
+    }
+}
+
+/// Per-format averaging for 64bpp color resolves. Returns `(lo, hi)`.
+fn average_samples_64bpp(samples: &[(u32, u32)], source_format: u8) -> (u32, u32) {
+    let n = samples.len() as u32;
+    debug_assert!(n > 0);
+    match source_format {
+        // k_16_16_16_16 (5): four 16-bit channels across (lo, hi). Per-
+        // channel rounded unsigned-int mean. Signed/unsigned variants
+        // resolve identically because the resolve is a raw byte copy —
+        // averaging signed values as unsigned still gives the correct
+        // bits because two's-complement addition of `n` values divided
+        // by `n` lands on the same bit pattern after truncation.
+        5 => average_4_u16(samples, n),
+        // k_16_16_16_16_FLOAT (7): four half-floats.
+        7 => average_4_half_floats(samples, n),
+        // k_32_32_FLOAT (15): two f32 (R32 = lo, G32 = hi).
+        15 => average_2_f32(samples, n),
+        _ => samples[0],
+    }
+}
+
+#[inline]
+fn average_8_8_8_8(samples: &[u32], n: u32) -> u32 {
+    // Per-byte rounded unsigned mean.
+    let mut sums = [0u32; 4];
+    for &s in samples {
+        sums[0] += s & 0xFF;
+        sums[1] += (s >> 8) & 0xFF;
+        sums[2] += (s >> 16) & 0xFF;
+        sums[3] += (s >> 24) & 0xFF;
+    }
+    let half = n / 2;
+    let avg = |sum: u32| ((sum + half) / n) & 0xFF;
+    avg(sums[0])
+        | (avg(sums[1]) << 8)
+        | (avg(sums[2]) << 16)
+        | (avg(sums[3]) << 24)
+}
+
+#[inline]
+fn average_2_10_10_10(samples: &[u32], n: u32) -> u32 {
+    // Field widths 2/10/10/10 (low to high).
+    let mut sum_a = 0u32; // 2 bits
+    let mut sum_b = 0u32; // 10 bits
+    let mut sum_g = 0u32; // 10 bits
+    let mut sum_r = 0u32; // 10 bits
+    for &s in samples {
+        sum_a += s & 0x3;
+        sum_b += (s >> 2) & 0x3FF;
+        sum_g += (s >> 12) & 0x3FF;
+        sum_r += (s >> 22) & 0x3FF;
+    }
+    let half = n / 2;
+    let avg = |sum: u32, width: u32| ((sum + half) / n) & ((1u32 << width) - 1);
+    avg(sum_a, 2) | (avg(sum_b, 10) << 2) | (avg(sum_g, 10) << 12) | (avg(sum_r, 10) << 22)
+}
+
+#[inline]
+fn half_to_f32(half: u16) -> f32 {
+    let sign = ((half >> 15) & 0x1) as u32;
+    let exp = ((half >> 10) & 0x1F) as i32;
+    let mant = (half & 0x3FF) as u32;
+    if exp == 0 {
+        if mant == 0 {
+            return f32::from_bits(sign << 31);
+        }
+        // Subnormal half → normalized f32.
+        let mut e = -14;
+        let mut m = mant;
+        while (m & 0x400) == 0 {
+            m <<= 1;
+            e -= 1;
+        }
+        m &= 0x3FF;
+        let f_exp = (e + 127) as u32;
+        return f32::from_bits((sign << 31) | (f_exp << 23) | (m << 13));
+    }
+    if exp == 31 {
+        let f_exp = 0xFFu32;
+        let f_mant = mant << 13;
+        return f32::from_bits((sign << 31) | (f_exp << 23) | f_mant);
+    }
+    let f_exp = (exp - 15 + 127) as u32;
+    f32::from_bits((sign << 31) | (f_exp << 23) | (mant << 13))
+}
+
+#[inline]
+fn f32_to_half(f: f32) -> u16 {
+    let bits = f.to_bits();
+    let sign = ((bits >> 31) & 0x1) as u16;
+    let exp = ((bits >> 23) & 0xFF) as i32;
+    let mant = bits & 0x7FFFFF;
+    if exp == 0xFF {
+        // Inf or NaN.
+        let h_mant = if mant != 0 { 0x200 } else { 0 };
+        return (sign << 15) | (0x1F << 10) | h_mant;
+    }
+    if exp == 0 {
+        return sign << 15;
+    }
+    let e = exp - 127 + 15;
+    if e >= 31 {
+        return (sign << 15) | (0x1F << 10);
+    }
+    if e <= 0 {
+        // Subnormal half. Round-to-nearest-even is overkill; truncate
+        // toward zero — averaging 4 floats then converting once is the
+        // dominant precision path anyway.
+        if e < -10 {
+            return sign << 15;
+        }
+        let m = (mant | 0x800000) >> ((1 - e) as u32 + 13);
+        return (sign << 15) | (m as u16);
+    }
+    let h_mant = (mant >> 13) as u16;
+    (sign << 15) | ((e as u16) << 10) | h_mant
+}
+
+#[inline]
+fn average_2_half_floats(samples: &[u32], n: u32) -> u32 {
+    // Each u32 = (lo: half, hi: half). Average as f32, re-encode.
+    let mut sum_lo = 0.0f32;
+    let mut sum_hi = 0.0f32;
+    for &s in samples {
+        sum_lo += half_to_f32((s & 0xFFFF) as u16);
+        sum_hi += half_to_f32(((s >> 16) & 0xFFFF) as u16);
+    }
+    let inv = 1.0f32 / n as f32;
+    let lo = f32_to_half(sum_lo * inv) as u32;
+    let hi = f32_to_half(sum_hi * inv) as u32;
+    lo | (hi << 16)
+}
+
+#[inline]
+fn average_1_f32(samples: &[u32], n: u32) -> u32 {
+    let mut sum = 0.0f32;
+    for &s in samples {
+        sum += f32::from_bits(s);
+    }
+    (sum / n as f32).to_bits()
+}
+
+#[inline]
+fn average_4_u16(samples: &[(u32, u32)], n: u32) -> (u32, u32) {
+    // (lo, hi) carry 4 × 16-bit channels. lo = (R, G), hi = (B, A) or similar
+    // packing — averaging is per-16-bit-field regardless of channel mapping.
+    let extract = |w: u32, shift: u32| (w >> shift) & 0xFFFF;
+    let mut sums = [0u32; 4];
+    for &(lo, hi) in samples {
+        sums[0] += extract(lo, 0);
+        sums[1] += extract(lo, 16);
+        sums[2] += extract(hi, 0);
+        sums[3] += extract(hi, 16);
+    }
+    let half = n / 2;
+    let avg = |sum: u32| ((sum + half) / n) & 0xFFFF;
+    let lo = avg(sums[0]) | (avg(sums[1]) << 16);
+    let hi = avg(sums[2]) | (avg(sums[3]) << 16);
+    (lo, hi)
+}
+
+#[inline]
+fn average_4_half_floats(samples: &[(u32, u32)], n: u32) -> (u32, u32) {
+    let mut sums = [0.0f32; 4];
+    for &(lo, hi) in samples {
+        sums[0] += half_to_f32((lo & 0xFFFF) as u16);
+        sums[1] += half_to_f32(((lo >> 16) & 0xFFFF) as u16);
+        sums[2] += half_to_f32((hi & 0xFFFF) as u16);
+        sums[3] += half_to_f32(((hi >> 16) & 0xFFFF) as u16);
+    }
+    let inv = 1.0f32 / n as f32;
+    let h0 = f32_to_half(sums[0] * inv) as u32;
+    let h1 = f32_to_half(sums[1] * inv) as u32;
+    let h2 = f32_to_half(sums[2] * inv) as u32;
+    let h3 = f32_to_half(sums[3] * inv) as u32;
+    (h0 | (h1 << 16), h2 | (h3 << 16))
+}
+
+#[inline]
+fn average_2_f32(samples: &[(u32, u32)], n: u32) -> (u32, u32) {
+    let mut sum_lo = 0.0f32;
+    let mut sum_hi = 0.0f32;
+    for &(lo, hi) in samples {
+        sum_lo += f32::from_bits(lo);
+        sum_hi += f32::from_bits(hi);
+    }
+    let inv = 1.0f32 / n as f32;
+    ((sum_lo * inv).to_bits(), (sum_hi * inv).to_bits())
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::draw_state::{ResolveCoordinates, ResolveInfo};
+    use crate::edram::ShadowEdram;
+    use crate::render_target_cache::MsaaSamples;
+    use crate::tiled_address::{align_pitch_to_macro_tile, tiled_2d_offset};
+    use xenia_memory::GuestMemory;
+
+    /// Build a minimally-populated [`ResolveInfo`] for tests.
+    fn minimal_info(dest_base: u32, pitch: u32, height: u32) -> ResolveInfo {
+        ResolveInfo {
+            copy_src_select: 0,
+            copy_sample_select: 0,
+            color_clear_enable: false,
+            depth_clear_enable: false,
+            copy_command: 0,
+            dest_base,
+            dest_pitch_pixels: pitch,
+            dest_height_pixels: height,
+            dest_format: color_format::K_8_8_8_8,
+            dest_endian: 0,
+            dest_exp_bias: 0,
+            source: ResolveSource::Color(0),
+            coords: ResolveCoordinates {
+                x0: 0,
+                y0: 0,
+                width: pitch,
+                height,
+                sample_count_log2_x: 0,
+                sample_count_log2_y: 0,
+            },
+            source_format: 0,
+            source_base_tiles: 0,
+            surface_pitch_tiles: pitch.div_ceil(80),
+            msaa: MsaaSamples::X1,
+            source_is_64bpp: false,
+            color_clear_value: 0,
+            color_clear_value_lo: 0,
+            depth_clear_value: 0,
+            copy_dest_array: false,
+        }
+    }
+
+    fn fresh_mem() -> GuestMemory {
+        use xenia_memory::page_table::MemoryProtect;
+        let mut mem = GuestMemory::new().expect("guest memory");
+        mem.alloc(
+            0x4000_0000,
+            0x0010_0000,
+            MemoryProtect::READ | MemoryProtect::WRITE,
+        )
+        .expect("alloc");
+        mem
+    }
+
+    #[test]
+    fn endian_k_none_is_identity() {
+        assert_eq!(apply_endian_128(0x11223344, 0), 0x11223344);
+    }
+
+    #[test]
+    fn endian_k8in16_swaps_byte_pairs() {
+        assert_eq!(apply_endian_128(0x11223344, 1), 0x22114433);
+    }
+
+    #[test]
+    fn endian_k8in32_is_full_byte_reverse() {
+        assert_eq!(apply_endian_128(0x11223344, 2), 0x44332211);
+    }
+
+    #[test]
+    fn endian_k16in32_swaps_halves() {
+        assert_eq!(apply_endian_128(0x11223344, 3), 0x33441122);
+    }
+
+    #[test]
+    fn color_clear_resolve_writes_le_bytes_with_k8in32() {
+        // Clear-resolve a 32x8 rectangle of k_8_8_8_8 samples to pattern
+        // 0x11223344 with endian k8in32. Memory should contain LE bytes
+        // [0x44, 0x33, 0x22, 0x11] at every tiled sample offset.
+        let mut mem = fresh_mem();
+        let mut edram = ShadowEdram::new();
+        edram.fill_rect_32bpp(0, 1, 0, 0, 32, 8, 0x11223344);
+
+        let mut info = minimal_info(0x4000_0000, 32, 8);
+        info.dest_endian = 2; // k8in32
+        info.source_base_tiles = 0;
+        info.surface_pitch_tiles = 1;
+
+        let stats = copy_to_memory(&info, &edram, &mut mem);
+        assert!(stats.supported);
+        assert_eq!(stats.samples_written, 32 * 8);
+
+        let pitch_aligned = align_pitch_to_macro_tile(32);
+        for y in 0..8u32 {
+            for x in 0..32u32 {
+                let off = tiled_2d_offset(x, y, pitch_aligned, 2);
+                let addr = 0x4000_0000u32.wrapping_add(off);
+                let bytes = [
+                    mem.read_u8(addr),
+                    mem.read_u8(addr.wrapping_add(1)),
+                    mem.read_u8(addr.wrapping_add(2)),
+                    mem.read_u8(addr.wrapping_add(3)),
+                ];
+                assert_eq!(
+                    bytes,
+                    [0x44, 0x33, 0x22, 0x11],
+                    "mismatch at ({x}, {y})"
+                );
+            }
+        }
+    }
+
+    #[test]
+    fn k_none_endian_keeps_big_endian_bytes() {
+        let mut mem = fresh_mem();
+        let mut edram = ShadowEdram::new();
+        edram.fill_rect_32bpp(0, 1, 0, 0, 16, 8, 0xAABBCCDD);
+
+        let mut info = minimal_info(0x4000_0000, 16, 8);
+        info.dest_endian = 0;
+        info.source_base_tiles = 0;
+        info.surface_pitch_tiles = 1;
+
+        let stats = copy_to_memory(&info, &edram, &mut mem);
+        assert!(stats.supported);
+
+        let pitch_aligned = align_pitch_to_macro_tile(16);
+        let off = tiled_2d_offset(0, 0, pitch_aligned, 2);
+        let addr = 0x4000_0000u32.wrapping_add(off);
+        assert_eq!(
+            [
+                mem.read_u8(addr),
+                mem.read_u8(addr.wrapping_add(1)),
+                mem.read_u8(addr.wrapping_add(2)),
+                mem.read_u8(addr.wrapping_add(3)),
+            ],
+            [0xAA, 0xBB, 0xCC, 0xDD]
+        );
+    }
+
+    #[test]
+    fn empty_rect_is_noop_and_no_page_version_bump() {
+        let mut mem = fresh_mem();
+        let edram = ShadowEdram::new();
+        let before = mem.page_version(0x4000_0000);
+
+        let mut info = minimal_info(0x4000_0000, 0, 0);
+        info.coords.width = 0;
+        info.coords.height = 0;
+        let stats = copy_to_memory(&info, &edram, &mut mem);
+        assert!(stats.supported);
+        assert_eq!(stats.samples_written, 0);
+        assert_eq!(mem.page_version(0x4000_0000), before);
+    }
+
+    #[test]
+    fn unsupported_dest_format_is_graceful() {
+        let mut mem = fresh_mem();
+        let edram = ShadowEdram::new();
+        let mut info = minimal_info(0x4000_0000, 16, 16);
+        // k_16_16_16_16 is 64bpp — not bitwise-equivalent to any 32bpp dest.
+        info.dest_format = 26;
+        let stats = copy_to_memory(&info, &edram, &mut mem);
+        assert!(!stats.supported);
+        assert_eq!(stats.samples_written, 0);
+    }
+
+    #[test]
+    fn resolve_bumps_page_version_for_texture_cache_invalidation() {
+        let mut mem = fresh_mem();
+        let mut edram = ShadowEdram::new();
+        edram.fill_rect_32bpp(0, 1, 0, 0, 16, 8, 0xDEADBEEF);
+
+        let before = mem.page_version(0x4000_0000);
+        let mut info = minimal_info(0x4000_0000, 16, 8);
+        info.source_base_tiles = 0;
+        info.surface_pitch_tiles = 1;
+        let stats = copy_to_memory(&info, &edram, &mut mem);
+        assert!(stats.supported);
+        assert!(mem.page_version(0x4000_0000) > before);
+    }
+
+    /// k_2_10_10_10 source ↔ k_2_10_10_10 dest is bitwise-equivalent per
+    /// Canary `xenos.h:624-627`. Same path, just different format bytes.
+    #[test]
+    fn k_2_10_10_10_is_bitwise_equivalent() {
+        assert!(is_32bpp_bitwise_equivalent(
+            ResolveSource::Color(0), false, /* source */ 2, /* dest */ 7,
+        ));
+        assert!(is_32bpp_bitwise_equivalent(
+            ResolveSource::Color(0),
+            false,
+            /* source k_2_10_10_10_AS_10_10_10_10 */ 10,
+            /* dest k_2_10_10_10_AS_16_16_16_16 */ 54,
+        ));
+    }
+
+    /// k_8_8_8_8_GAMMA source resolves identically to k_8_8_8_8 (gamma is
+    /// applied at sample time, not on store).
+    #[test]
+    fn k_8_8_8_8_gamma_source_is_bitwise_equivalent() {
+        assert!(is_32bpp_bitwise_equivalent(
+            ResolveSource::Color(0),
+            false,
+            /* source k_8_8_8_8_GAMMA */ 1,
+            /* dest k_8_8_8_8 */ 6,
+        ));
+    }
+
+    /// Depth resolve: kD24S8 → k_24_8, kD24FS8 → k_24_8_FLOAT.
+    #[test]
+    fn depth_resolve_format_equivalence() {
+        assert!(is_32bpp_bitwise_equivalent(
+            ResolveSource::Depth,
+            false,
+            /* kD24S8 */ 0,
+            /* k_24_8 */ 22,
+        ));
+        assert!(is_32bpp_bitwise_equivalent(
+            ResolveSource::Depth,
+            false,
+            /* kD24FS8 */ 1,
+            /* k_24_8_FLOAT */ 23,
+        ));
+        // Mismatched depth → texture format = not equivalent.
+        assert!(!is_32bpp_bitwise_equivalent(
+            ResolveSource::Depth,
+            false,
+            0,
+            23,
+        ));
+    }
+
+    /// 64bpp source is never equivalent to a 32bpp dest, even when the
+    /// source/dest format numbers might look compatible.
+    #[test]
+    fn sixty_four_bpp_source_is_never_equivalent() {
+        assert!(!is_32bpp_bitwise_equivalent(
+            ResolveSource::Color(0),
+            true,
+            5, // k_16_16_16_16
+            6,
+        ));
+    }
+
+    /// 64bpp bitwise-equivalent pairs per Canary `xenos.h:614-639`.
+    #[test]
+    fn sixty_four_bpp_equivalence_pairs() {
+        // k_16_16_16_16 (5) → k_16_16_16_16 (26)
+        assert!(is_64bpp_bitwise_equivalent(5, 26));
+        // k_16_16_16_16_FLOAT (7) → k_16_16_16_16_FLOAT (32)
+        assert!(is_64bpp_bitwise_equivalent(7, 32));
+        // k_32_32_FLOAT (15) → k_32_32_FLOAT (37)
+        assert!(is_64bpp_bitwise_equivalent(15, 37));
+        // Cross-format must reject.
+        assert!(!is_64bpp_bitwise_equivalent(5, 32));
+        assert!(!is_64bpp_bitwise_equivalent(0, 26));
+    }
+
+    /// End-to-end 64bpp resolve: paint a `k_16_16_16_16` pattern into EDRAM
+    /// and confirm `copy_to_memory` lands two u32s per pixel into guest mem.
+    #[test]
+    fn sixty_four_bpp_resolve_writes_two_words_per_pixel() {
+        let mut mem = fresh_mem();
+        let mut edram = ShadowEdram::new();
+        // 16x4 logical 64bpp samples; pitch = 1 32bpp tile.
+        edram.fill_rect_64bpp(0, 1, 0, 0, 16, 4, 0xAABB_CCDD, 0x1122_3344);
+
+        let mut info = minimal_info(0x4000_0000, 16, 4);
+        info.source = ResolveSource::Color(0);
+        info.source_format = 5; // k_16_16_16_16
+        info.dest_format = color_format::K_16_16_16_16;
+        info.source_is_64bpp = true;
+        info.dest_endian = 0; // kNone
+        info.source_base_tiles = 0;
+        info.surface_pitch_tiles = 1;
+        info.coords.width = 16;
+        info.coords.height = 4;
+
+        let stats = copy_to_memory(&info, &edram, &mut mem);
+        assert!(stats.supported);
+        assert_eq!(stats.samples_written, 16 * 4);
+
+        // First pixel: lo word at dst_off, hi word at dst_off + 4. With
+        // bpp_log2=3, pitch_aligned=32 (rounded from 16), tiled offset
+        // for (0,0) is 0.
+        let pitch_aligned = align_pitch_to_macro_tile(16);
+        let off = tiled_2d_offset(0, 0, pitch_aligned, 3);
+        let addr = 0x4000_0000u32.wrapping_add(off);
+        // BE store of 0xAABBCCDD = bytes [0xAA, 0xBB, 0xCC, 0xDD]
+        assert_eq!(mem.read_u8(addr), 0xAA);
+        assert_eq!(mem.read_u8(addr.wrapping_add(1)), 0xBB);
+        assert_eq!(mem.read_u8(addr.wrapping_add(2)), 0xCC);
+        assert_eq!(mem.read_u8(addr.wrapping_add(3)), 0xDD);
+        assert_eq!(mem.read_u8(addr.wrapping_add(4)), 0x11);
+        assert_eq!(mem.read_u8(addr.wrapping_add(7)), 0x44);
+    }
+
+    /// MSAA averaging — `k_8_8_8_8` per-channel rounded mean of 4 samples.
+    /// Build a 4x MSAA RT where the 4 samples per pixel hold (0, 64, 128,
+    /// 192) in the red channel and check the resolve produces the rounded
+    /// mean (96).
+    #[test]
+    fn msaa_4x_averaging_k_8_8_8_8() {
+        let mut mem = fresh_mem();
+        let mut edram = ShadowEdram::new();
+        // 4x MSAA: each pixel occupies a 2×2 sample grid.
+        // Pixel (0,0) sample positions (0..4) at sample-coords:
+        //   s0: (0, 0)
+        //   s1: (1, 0)
+        //   s2: (0, 1)
+        //   s3: (1, 1)
+        // Stuff R=[0, 64, 128, 192], G=B=A=0.
+        edram.write_sample_32bpp(0, 1, 0, 0, 0x00_00_00_00); // R=0
+        edram.write_sample_32bpp(0, 1, 1, 0, 0x00_00_00_40); // R=64
+        edram.write_sample_32bpp(0, 1, 0, 1, 0x00_00_00_80); // R=128
+        edram.write_sample_32bpp(0, 1, 1, 1, 0x00_00_00_C0); // R=192
+
+        let mut info = minimal_info(0x4000_0000, 1, 1);
+        info.source = ResolveSource::Color(0);
+        info.source_format = 0; // k_8_8_8_8
+        info.dest_format = color_format::K_8_8_8_8;
+        info.copy_sample_select = 6; // K0123
+        info.msaa = MsaaSamples::X4;
+        info.coords.sample_count_log2_x = 1;
+        info.coords.sample_count_log2_y = 1;
+        info.coords.width = 1;
+        info.coords.height = 1;
+        info.dest_endian = 0;
+        info.source_base_tiles = 0;
+        info.surface_pitch_tiles = 1;
+
+        let stats = copy_to_memory(&info, &edram, &mut mem);
+        assert!(stats.supported);
+        assert_eq!(stats.samples_written, 1);
+        // R = (0+64+128+192 + 2)/4 = 96 = 0x60. Big-endian store.
+        let addr = 0x4000_0000u32;
+        // The byte order in u32 is [byte0, byte1, byte2, byte3] where
+        // byte0 = R. After BE store of pixel 0x000000_60 (R=0x60), the
+        // bytes at the resolve-tile offset are [0x00, 0x00, 0x00, 0x60].
+        let bytes = [
+            mem.read_u8(addr),
+            mem.read_u8(addr.wrapping_add(1)),
+            mem.read_u8(addr.wrapping_add(2)),
+            mem.read_u8(addr.wrapping_add(3)),
+        ];
+        assert_eq!(bytes, [0x00, 0x00, 0x00, 0x60], "averaged R should be 0x60");
+    }
+
+    /// MSAA averaging — `k_32_FLOAT` averages 4 f32 samples linearly.
+    #[test]
+    fn msaa_4x_averaging_k_32_float() {
+        let mut mem = fresh_mem();
+        let mut edram = ShadowEdram::new();
+        let f = |v: f32| v.to_bits();
+        edram.write_sample_32bpp(0, 1, 0, 0, f(1.0));
+        edram.write_sample_32bpp(0, 1, 1, 0, f(2.0));
+        edram.write_sample_32bpp(0, 1, 0, 1, f(3.0));
+        edram.write_sample_32bpp(0, 1, 1, 1, f(4.0));
+
+        let mut info = minimal_info(0x4000_0000, 1, 1);
+        info.source = ResolveSource::Color(0);
+        info.source_format = 14; // k_32_FLOAT
+        info.dest_format = color_format::K_32_FLOAT;
+        info.copy_sample_select = 6; // K0123
+        info.msaa = MsaaSamples::X4;
+        info.coords.sample_count_log2_x = 1;
+        info.coords.sample_count_log2_y = 1;
+        info.coords.width = 1;
+        info.coords.height = 1;
+        info.dest_endian = 2; // k8in32 — game-typical for float sampling
+        info.source_base_tiles = 0;
+        info.surface_pitch_tiles = 1;
+
+        let stats = copy_to_memory(&info, &edram, &mut mem);
+        assert!(stats.supported);
+        // (1+2+3+4)/4 = 2.5
+        let expected = 2.5f32.to_bits();
+        // k8in32 swap = byte-reverse → BE store puts the LE-swapped bytes back
+        // in original (big-endian) order. Reconstruct guest-visible u32:
+        let bytes = [
+            mem.read_u8(0x4000_0000),
+            mem.read_u8(0x4000_0001),
+            mem.read_u8(0x4000_0002),
+            mem.read_u8(0x4000_0003),
+        ];
+        // After endian k8in32 (swap_bytes) and BE store, the bytes in memory
+        // are LE-from-CPU-perspective. So bytes here are u32::to_le_bytes(expected).
+        assert_eq!(bytes, expected.to_le_bytes());
+    }
+
+    /// MSAA averaging — `k_2_10_10_10` per-field rounded mean.
+    #[test]
+    fn msaa_2x_averaging_k_2_10_10_10() {
+        // 2x MSAA samples are stacked vertically (s0 at y=0, s1 at y=1).
+        let mut mem = fresh_mem();
+        let mut edram = ShadowEdram::new();
+        // Field widths 2/10/10/10. Pack two values per field (a/b/g/r).
+        let pack = |a: u32, b: u32, g: u32, r: u32| {
+            (a & 0x3) | ((b & 0x3FF) << 2) | ((g & 0x3FF) << 12) | ((r & 0x3FF) << 22)
+        };
+        edram.write_sample_32bpp(0, 1, 0, 0, pack(0, 100, 200, 300));
+        edram.write_sample_32bpp(0, 1, 0, 1, pack(2, 200, 300, 400));
+
+        let mut info = minimal_info(0x4000_0000, 1, 1);
+        info.source = ResolveSource::Color(0);
+        info.source_format = 2; // k_2_10_10_10
+        info.dest_format = color_format::K_2_10_10_10;
+        info.copy_sample_select = 4; // K01
+        info.msaa = MsaaSamples::X2;
+        info.coords.sample_count_log2_x = 0;
+        info.coords.sample_count_log2_y = 1;
+        info.coords.width = 1;
+        info.coords.height = 1;
+        info.dest_endian = 0;
+        info.source_base_tiles = 0;
+        info.surface_pitch_tiles = 1;
+
+        let stats = copy_to_memory(&info, &edram, &mut mem);
+        assert!(stats.supported);
+        // Expected per-field: a=(0+2+1)/2=1, b=(100+200+1)/2=150, g=(200+300+1)/2=250, r=(300+400+1)/2=350
+        let expected = pack(1, 150, 250, 350);
+        // Read back as BE u32 (big-endian byte ordering).
+        let bytes = [
+            mem.read_u8(0x4000_0000),
+            mem.read_u8(0x4000_0001),
+            mem.read_u8(0x4000_0002),
+            mem.read_u8(0x4000_0003),
+        ];
+        assert_eq!(bytes, expected.to_be_bytes());
+    }
+
+    /// End-to-end depth resolve: set up a depth RT at tile base 8, paint
+    /// it via clear value, and verify the copy emerges in guest memory
+    /// with the right bytes.
+    #[test]
+    fn depth_clear_resolve_end_to_end() {
+        let mut mem = fresh_mem();
+        let mut edram = ShadowEdram::new();
+        // Paint the depth tiles directly with a known pattern.
+        edram.fill_rect_32bpp(8, 1, 0, 0, 16, 8, 0x3FFF_FF00);
+
+        let mut info = minimal_info(0x4000_0000, 16, 8);
+        info.source = ResolveSource::Depth;
+        info.source_format = 0; // kD24S8
+        info.dest_format = color_format::K_24_8;
+        info.dest_endian = 2; // k8in32
+        info.source_base_tiles = 8;
+        info.surface_pitch_tiles = 1;
+
+        let stats = copy_to_memory(&info, &edram, &mut mem);
+        assert!(stats.supported);
+        assert_eq!(stats.samples_written, 16 * 8);
+
+        // First pixel should be the endian-swapped pattern: BE-store of
+        // 0x3FFF_FF00.swap_bytes() = 0x00FF_FF3F → bytes [0x00, 0xFF, 0xFF, 0x3F].
+        let pitch_aligned = align_pitch_to_macro_tile(16);
+        let off = tiled_2d_offset(0, 0, pitch_aligned, 2);
+        let addr = 0x4000_0000u32.wrapping_add(off);
+        assert_eq!(
+            [
+                mem.read_u8(addr),
+                mem.read_u8(addr.wrapping_add(1)),
+                mem.read_u8(addr.wrapping_add(2)),
+                mem.read_u8(addr.wrapping_add(3)),
+            ],
+            [0x00, 0xFF, 0xFF, 0x3F]
+        );
+    }
+
+    /// `sanitize_sample_select` for 1x MSAA collapses every select to K0.
+    #[test]
+    fn sanitize_1x_msaa_collapses_to_k0() {
+        for raw in 0..=7u8 {
+            let s = sanitize_sample_select(raw, MsaaSamples::X1, false);
+            assert_eq!(s, CopySampleSelect::K0, "raw={raw}");
+        }
+    }
+
+    /// 2x MSAA: k2→k0, k3→k1, k23→k01; depth averages sanitize to k0.
+    #[test]
+    fn sanitize_2x_msaa_obeys_canary_rules() {
+        assert_eq!(
+            sanitize_sample_select(2, MsaaSamples::X2, false),
+            CopySampleSelect::K0
+        );
+        assert_eq!(
+            sanitize_sample_select(3, MsaaSamples::X2, false),
+            CopySampleSelect::K1
+        );
+        assert_eq!(
+            sanitize_sample_select(5, MsaaSamples::X2, false),
+            CopySampleSelect::K01
+        );
+        // Depth — no averaging.
+        assert_eq!(
+            sanitize_sample_select(4, MsaaSamples::X2, true),
+            CopySampleSelect::K0
+        );
+        assert_eq!(
+            sanitize_sample_select(6, MsaaSamples::X2, true),
+            CopySampleSelect::K0
+        );
+    }
+
+    /// 4x MSAA: single-samples untouched for color; depth averages
+    /// collapse to a representative single sample (k0123 → k0).
+    #[test]
+    fn sanitize_4x_msaa_depth_collapses_averages() {
+        assert_eq!(
+            sanitize_sample_select(6, MsaaSamples::X4, true),
+            CopySampleSelect::K0
+        );
+        assert_eq!(
+            sanitize_sample_select(5, MsaaSamples::X4, true),
+            CopySampleSelect::K2
+        );
+        assert_eq!(
+            sanitize_sample_select(4, MsaaSamples::X4, true),
+            CopySampleSelect::K0
+        );
+        // Color keeps averages.
+        assert_eq!(
+            sanitize_sample_select(6, MsaaSamples::X4, false),
+            CopySampleSelect::K0123
+        );
+    }
+
+    /// Sample offsets follow the standard Xbox 360 MSAA layout.
+    #[test]
+    fn sample_offset_layout() {
+        // 1x
+        assert_eq!(sample_offset_in_pixel(0, MsaaSamples::X1), (0, 0));
+        // 2x
+        assert_eq!(sample_offset_in_pixel(0, MsaaSamples::X2), (0, 0));
+        assert_eq!(sample_offset_in_pixel(1, MsaaSamples::X2), (0, 1));
+        // 4x
+        assert_eq!(sample_offset_in_pixel(0, MsaaSamples::X4), (0, 0));
+        assert_eq!(sample_offset_in_pixel(1, MsaaSamples::X4), (1, 0));
+        assert_eq!(sample_offset_in_pixel(2, MsaaSamples::X4), (0, 1));
+        assert_eq!(sample_offset_in_pixel(3, MsaaSamples::X4), (1, 1));
+    }
+}
diff --git a/crates/xenia-gpu/src/ring_drain.rs b/crates/xenia-gpu/src/ring_drain.rs
new file mode 100644
index 0000000..2bf4a67
--- /dev/null
+++ b/crates/xenia-gpu/src/ring_drain.rs
@@ -0,0 +1,169 @@
+//! Ring-buffer drainer.
+//!
+//! Walks a guest PM4 ring buffer from `start_offset` forward, classifying each
+//! packet via [`crate::pm4`] and stopping when it either reaches the end of
+//! the window it was asked to scan, walks off a NOP-fill region, or hits a
+//! malformed header.
+//!
+//! It does **not** execute draws — that's deferred to a later phase. Its job
+//! is to (a) advance the read pointer far enough that games keep making
+//! progress, and (b) surface `PM4_XE_SWAP` packets so `VdSwap` can forward
+//! them to the host UI.
+
+use xenia_memory::MemoryAccess;
+
+use crate::pm4::{self, PacketKind};
+
+/// Outcome of a [`drain`] call.
+#[derive(Default, Debug, Clone, Copy)]
+pub struct DrainResult {
+    /// Dword offset reached, relative to the start of the ring buffer.
+    pub new_offset: u32,
+    /// How many packets were walked in this call.
+    pub packets_walked: u32,
+    /// True if we saw `PM4_XE_SWAP` during the walk.
+    pub swap_seen: bool,
+    /// If `swap_seen`, the guest frontbuffer *physical* address written next
+    /// to `PM4_XE_SWAP` (dword 2 of the 4-payload packet).
+    pub swap_frontbuffer_phys: u32,
+    /// If `swap_seen`, the width written at dword 3.
+    pub swap_width: u32,
+    /// If `swap_seen`, the height written at dword 4.
+    pub swap_height: u32,
+}
+
+/// Walk `max_packets` packets starting at dword offset `start_offset` in the
+/// ring buffer at guest address `ring_base` of size `ring_size_dwords`.
+///
+/// The offset is treated modulo `ring_size_dwords`. Walking stops when:
+/// - `max_packets` have been walked,
+/// - a `PM4_XE_SWAP` has been consumed (the swap is reported and we stop so
+///   the UI sees the frame boundary before further drain),
+/// - a header's declared total size would exceed the remaining budget,
+/// - the ring size is zero (drainer is a no-op).
+pub fn drain<M: MemoryAccess + ?Sized>(
+    mem: &M,
+    ring_base: u32,
+    ring_size_dwords: u32,
+    start_offset: u32,
+    max_packets: u32,
+) -> DrainResult {
+    if ring_size_dwords == 0 || ring_base == 0 {
+        return DrainResult::default();
+    }
+    let mut result = DrainResult {
+        new_offset: start_offset % ring_size_dwords,
+        ..DrainResult::default()
+    };
+    let mut offset = result.new_offset;
+    for _ in 0..max_packets {
+        let header_addr = ring_base.wrapping_add(offset.wrapping_mul(4));
+        let header = mem.read_u32(header_addr);
+        let packet = pm4::decode(header);
+        // Refuse to walk past the ring in a single packet.
+        if packet.total_dwords > ring_size_dwords {
+            break;
+        }
+        // Type-3 PM4_XE_SWAP → record payload and stop.
+        if let PacketKind::Type3 { opcode, .. } = packet.kind
+            && opcode == pm4::PM4_XE_SWAP {
+                // Payload layout (from canary VdSwap_entry):
+                //   [0] XE_SWAP header
+                //   [1] kSwapSignature ("XNEX" = 0x584E4558)
+                //   [2] frontbuffer physical address
+                //   [3] width
+                //   [4] height
+                let payload = |i: u32| {
+                    let addr =
+                        ring_base.wrapping_add(((offset + i) % ring_size_dwords).wrapping_mul(4));
+                    mem.read_u32(addr)
+                };
+                result.swap_seen = true;
+                result.swap_frontbuffer_phys = payload(2);
+                result.swap_width = payload(3);
+                result.swap_height = payload(4);
+                offset = (offset + packet.total_dwords) % ring_size_dwords;
+                result.new_offset = offset;
+                result.packets_walked += 1;
+                return result;
+            }
+        offset = (offset + packet.total_dwords) % ring_size_dwords;
+        result.new_offset = offset;
+        result.packets_walked += 1;
+    }
+    result
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use xenia_memory::GuestMemory;
+    use xenia_memory::page_table::MemoryProtect;
+
+    fn build_mem() -> GuestMemory {
+        let mut mem = GuestMemory::new().unwrap();
+        let rw = MemoryProtect::READ | MemoryProtect::WRITE;
+        mem.alloc(0x4000_0000, 0x1000, rw).unwrap();
+        mem
+    }
+
+    fn write_dword(mem: &GuestMemory, addr: u32, val: u32) {
+        mem.write_u32(addr, val);
+    }
+
+    #[test]
+    fn walks_nops_until_budget_exhausted() {
+        let mut mem = build_mem();
+        // Fill 10 dwords with Type-2 NOPs.
+        for i in 0..10 {
+            write_dword(&mut mem, 0x4000_0000 + i * 4, 0x8000_0000);
+        }
+        let r = drain(&mem, 0x4000_0000, 0x400, 0, 5);
+        assert_eq!(r.packets_walked, 5);
+        assert_eq!(r.new_offset, 5);
+        assert!(!r.swap_seen);
+    }
+
+    #[test]
+    fn stops_at_swap_and_reports_payload() {
+        let mut mem = build_mem();
+        // Two NOPs, then a PM4_XE_SWAP packet.
+        write_dword(&mut mem, 0x4000_0000, 0x8000_0000);
+        write_dword(&mut mem, 0x4000_0004, 0x8000_0000);
+        // MakePacketType3(PM4_XE_SWAP, 4) → (3<<30) | (3<<16) | (0x64<<8)
+        let swap_hdr = (3u32 << 30) | (3u32 << 16) | ((pm4::PM4_XE_SWAP as u32) << 8);
+        write_dword(&mut mem, 0x4000_0008, swap_hdr);
+        write_dword(&mut mem, 0x4000_000C, pm4::SWAP_SIGNATURE);
+        write_dword(&mut mem, 0x4000_0010, 0xDEAD_F000); // frontbuffer phys
+        write_dword(&mut mem, 0x4000_0014, 1280);
+        write_dword(&mut mem, 0x4000_0018, 720);
+        let r = drain(&mem, 0x4000_0000, 0x400, 0, 16);
+        assert!(r.swap_seen);
+        assert_eq!(r.swap_frontbuffer_phys, 0xDEAD_F000);
+        assert_eq!(r.swap_width, 1280);
+        assert_eq!(r.swap_height, 720);
+        assert_eq!(r.packets_walked, 3);
+        assert_eq!(r.new_offset, 7); // 2 NOPs (1 dword each) + 5-dword swap = 7
+    }
+
+    #[test]
+    fn wraps_around_ring() {
+        let mut mem = build_mem();
+        // Ring size = 4 dwords. Start at offset 3 (last dword). Write a NOP
+        // there, then the walker should wrap to offset 0.
+        write_dword(&mut mem, 0x4000_000C, 0x8000_0000);
+        write_dword(&mut mem, 0x4000_0000, 0x8000_0000);
+        let r = drain(&mem, 0x4000_0000, 4, 3, 2);
+        assert_eq!(r.packets_walked, 2);
+        assert_eq!(r.new_offset, 1);
+    }
+
+    #[test]
+    fn zero_ring_size_is_noop() {
+        let mem = build_mem();
+        let r = drain(&mem, 0x4000_0000, 0, 0, 10);
+        assert_eq!(r.packets_walked, 0);
+        assert_eq!(r.new_offset, 0);
+        assert!(!r.swap_seen);
+    }
+}
diff --git a/crates/xenia-gpu/src/ring_view.rs b/crates/xenia-gpu/src/ring_view.rs
new file mode 100644
index 0000000..a164d73
--- /dev/null
+++ b/crates/xenia-gpu/src/ring_view.rs
@@ -0,0 +1,123 @@
+//! Primary ring buffer view.
+//!
+//! Games allocate a ring buffer in physical memory (via
+//! `MmAllocatePhysicalMemoryEx` with WRITE_COMBINE), then hand the base
+//! address + log2(size) to `VdInitializeRingBuffer`. They subsequently push
+//! PM4 packets into it, advancing the write-pointer by writing to a GPU
+//! register (`CP_RB_WPTR`) or via kernel-call shims.
+//!
+//! The GPU consumes packets from `read_offset_dwords` up to (but not past)
+//! the write pointer. After consuming enough bytes it writes `read_offset`
+//! into the guest-memory address registered by `VdEnableRingBufferRPtrWriteBack`
+//! so the game can know how much of the ring has been consumed.
+
+/// Tracks the primary ring buffer as set up by the guest.
+#[derive(Debug, Clone, Copy, Default)]
+pub struct RingBufferView {
+    /// Guest physical/virtual base address. `0` means uninitialized.
+    pub base: u32,
+    /// Size of the ring in dwords. `0` means uninitialized.
+    pub size_dwords: u32,
+    /// Dword offset the GPU has consumed up to (relative to `base`).
+    pub read_offset_dwords: u32,
+    /// Dword offset the guest has last written into (relative to `base`).
+    /// Updated either by an MMIO write to `CP_RB_WPTR` or by the kernel
+    /// (`VdSwap` is a hint — the game reserves a 64-dword slot in the ring
+    /// for it).
+    pub write_offset_dwords: u32,
+    /// Guest address where we mirror `read_offset_dwords` each time we make
+    /// progress. `0` if the game never called `VdEnableRingBufferRPtrWriteBack`.
+    pub rptr_writeback_addr: u32,
+    /// Write-back block granularity in dwords (from the `log2` arg to
+    /// `VdEnableRingBufferRPtrWriteBack`). We always write back eagerly, so
+    /// we don't actually use this for scheduling — kept for observability.
+    pub rptr_writeback_block_dwords: u32,
+}
+
+impl RingBufferView {
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// True if the guest has provided a base + size.
+    pub fn is_initialized(&self) -> bool {
+        self.base != 0 && self.size_dwords != 0
+    }
+
+    /// True if there is pending unread data to consume.
+    pub fn has_pending(&self) -> bool {
+        self.is_initialized() && self.read_offset_dwords != self.write_offset_dwords
+    }
+
+    /// Number of dwords we can consume without wrapping past the write ptr.
+    pub fn pending_dwords(&self) -> u32 {
+        if !self.is_initialized() {
+            return 0;
+        }
+        if self.write_offset_dwords >= self.read_offset_dwords {
+            self.write_offset_dwords - self.read_offset_dwords
+        } else {
+            // write has wrapped — we can read up to the end of the ring.
+            self.size_dwords - self.read_offset_dwords
+        }
+    }
+
+    /// Advance the read pointer by `dwords`, wrapping at `size_dwords`.
+    pub fn advance_read(&mut self, dwords: u32) {
+        if self.size_dwords == 0 {
+            return;
+        }
+        self.read_offset_dwords =
+            (self.read_offset_dwords + dwords) % self.size_dwords;
+    }
+
+    /// Guest address for the dword at relative offset `i` from the current
+    /// read pointer. `None` if uninitialized.
+    pub fn addr_at_offset(&self, offset_dwords: u32) -> Option<u32> {
+        if !self.is_initialized() {
+            return None;
+        }
+        let off = (self.read_offset_dwords + offset_dwords) % self.size_dwords;
+        Some(self.base.wrapping_add(off.wrapping_mul(4)))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn uninitialized_view_reports_empty() {
+        let v = RingBufferView::new();
+        assert!(!v.is_initialized());
+        assert!(!v.has_pending());
+        assert_eq!(v.pending_dwords(), 0);
+    }
+
+    #[test]
+    fn wrap_around_arithmetic() {
+        let mut v = RingBufferView::new();
+        v.base = 0x4000_0000;
+        v.size_dwords = 16;
+        v.read_offset_dwords = 14;
+        v.write_offset_dwords = 2; // wrapped
+
+        // We can only read to end-of-ring in one chunk.
+        assert_eq!(v.pending_dwords(), 2);
+        v.advance_read(2);
+        assert_eq!(v.read_offset_dwords, 0);
+        // Now unwrapped, 2 more to go.
+        assert_eq!(v.pending_dwords(), 2);
+    }
+
+    #[test]
+    fn addr_at_offset_wraps() {
+        let mut v = RingBufferView::new();
+        v.base = 0x4000_0000;
+        v.size_dwords = 4;
+        v.read_offset_dwords = 3;
+        assert_eq!(v.addr_at_offset(0), Some(0x4000_000C));
+        assert_eq!(v.addr_at_offset(1), Some(0x4000_0000));
+        assert_eq!(v.addr_at_offset(2), Some(0x4000_0004));
+    }
+}
diff --git a/crates/xenia-gpu/src/shader_metrics.rs b/crates/xenia-gpu/src/shader_metrics.rs
new file mode 100644
index 0000000..7aabe5e
--- /dev/null
+++ b/crates/xenia-gpu/src/shader_metrics.rs
@@ -0,0 +1,350 @@
+//! Host-side static analysis over a [`ParsedShader`], emitted once per unique
+//! shader blob. Produces the observability the plan's P3b/P3c sections call
+//! for (`gpu.shader.interpret{stage,kind}` + `gpu.shader.reject{reason}`), so
+//! the HUD can show when a game is reaching ops the WGSL interpreter falls
+//! back on.
+//!
+//! Analysis is intentionally cheap: it scans each exec clause's instruction
+//! triples, classifies them as ALU / vertex-fetch / texture-fetch using the
+//! owning clause's sequence bitmap, and bumps counters accordingly. No GPU
+//! readback is required — `reject` reasons are inferred from opcode values
+//! alone.
+
+use metrics::counter;
+
+use crate::ucode::alu::{decode_alu, sop, vop};
+use crate::ucode::control_flow::ControlFlowInstruction;
+use crate::ucode::fetch::{FetchInstruction, decode_fetch};
+use crate::ucode::ParsedShader;
+
+/// Walk `parsed` once and emit `gpu.shader.interpret` + `gpu.shader.reject`
+/// counters. `stage` should be `"vs"` or `"ps"`.
+pub fn emit_for(parsed: &ParsedShader, stage: &'static str) {
+    let mut alu_count: u64 = 0;
+    let mut vfetch_count: u64 = 0;
+    let mut tfetch_count: u64 = 0;
+    let mut rejects: Vec<(&'static str, u64)> = Vec::new();
+
+    let mut features: Vec<&'static str> = Vec::new();
+    for clause in &parsed.cf {
+        match clause {
+            ControlFlowInstruction::Exec {
+                address,
+                count,
+                sequence,
+                ..
+            } => {
+                for i in 0..(*count as usize) {
+                    let triple_idx = *address as usize + i;
+                    let base = triple_idx * 3;
+                    if base + 2 >= parsed.instructions.len() {
+                        break;
+                    }
+                    let words = [
+                        parsed.instructions[base],
+                        parsed.instructions[base + 1],
+                        parsed.instructions[base + 2],
+                    ];
+                    // sequence bit layout: 2 bits per triple, hi bit = is-fetch.
+                    let is_fetch = ((sequence >> (i * 2 + 1)) & 1) != 0;
+                    if is_fetch {
+                        match decode_fetch(words) {
+                            FetchInstruction::Vertex(_) => vfetch_count += 1,
+                            FetchInstruction::Texture(tf) => {
+                                tfetch_count += 1;
+                                match tf.dimension {
+                                    0 => mark_feature(&mut features, "tfetch_1d"),
+                                    2 => mark_feature(&mut features, "tfetch_3d"),
+                                    3 => mark_feature(&mut features, "tfetch_cube"),
+                                    _ => {}
+                                }
+                                if tf.dimension != 1 {
+                                    bump(&mut rejects, "texfetch_dimension");
+                                }
+                            }
+                            FetchInstruction::Unknown { .. } => {
+                                bump(&mut rejects, "fetch_unknown");
+                            }
+                        }
+                    } else {
+                        alu_count += 1;
+                        let alu = decode_alu(words);
+                        if !vec_op_supported(alu.vector_opcode) {
+                            bump(&mut rejects, "alu_vec_unsupported");
+                        }
+                        if !scl_op_supported(alu.scalar_opcode) {
+                            bump(&mut rejects, "alu_scl_unsupported");
+                        }
+                        // Feature-of-interest detection for future phases.
+                        // Transcendentals + kill + setp + cube/max4 are the
+                        // high-value signals: they tell us which of the
+                        // deferred capabilities Sylpheed actually exercises.
+                        match alu.vector_opcode {
+                            v if v == vop::CUBE => mark_feature(&mut features, "vec_cube"),
+                            v if v == vop::MAX4 => mark_feature(&mut features, "vec_max4"),
+                            v if v == vop::KILL_EQ
+                                || v == vop::KILL_GT
+                                || v == vop::KILL_GE
+                                || v == vop::KILL_NE =>
+                            {
+                                mark_feature(&mut features, "vec_kill");
+                            }
+                            v if v == vop::CND_EQ
+                                || v == vop::CND_GE
+                                || v == vop::CND_GT =>
+                            {
+                                mark_feature(&mut features, "vec_cnd");
+                            }
+                            _ => {}
+                        }
+                        match alu.scalar_opcode {
+                            s if s == sop::EXP
+                                || s == sop::LOG
+                                || s == sop::LOGC
+                                || s == sop::SIN
+                                || s == sop::COS =>
+                            {
+                                mark_feature(&mut features, "scl_transcendental");
+                            }
+                            s if s == sop::RSQ
+                                || s == sop::RSQC
+                                || s == sop::RSQF
+                                || s == sop::SQRT =>
+                            {
+                                mark_feature(&mut features, "scl_sqrt_family");
+                            }
+                            s if s == sop::SETP_EQ
+                                || s == sop::SETP_NE
+                                || s == sop::SETP_GT
+                                || s == sop::SETP_GE
+                                || s == sop::SETP_INV
+                                || s == sop::SETP_POP
+                                || s == sop::SETP_CLR
+                                || s == sop::SETP_RSTR =>
+                            {
+                                mark_feature(&mut features, "scl_setp");
+                            }
+                            s if s == sop::KILLS_EQ
+                                || s == sop::KILLS_GT
+                                || s == sop::KILLS_GE
+                                || s == sop::KILLS_NE
+                                || s == sop::KILLS_ONE =>
+                            {
+                                mark_feature(&mut features, "scl_kills");
+                            }
+                            _ => {}
+                        }
+                        if alu.predicated {
+                            mark_feature(&mut features, "alu_predicated");
+                        }
+                    }
+                }
+            }
+            ControlFlowInstruction::LoopStart { .. }
+            | ControlFlowInstruction::LoopEnd { .. } => {
+                mark_feature(&mut features, "cf_loop");
+                bump(&mut rejects, "cf_loop");
+            }
+            ControlFlowInstruction::CondJmp { .. } => {
+                mark_feature(&mut features, "cf_cond_jmp");
+                bump(&mut rejects, "cf_cond_jmp");
+            }
+            ControlFlowInstruction::CondCall { .. } | ControlFlowInstruction::Return => {
+                mark_feature(&mut features, "cf_call_return");
+                bump(&mut rejects, "cf_call_return");
+            }
+            ControlFlowInstruction::Unknown { .. } => {
+                bump(&mut rejects, "cf_unknown");
+            }
+            _ => {}
+        }
+    }
+
+    counter!("gpu.shader.interpret", "stage" => stage, "kind" => "alu")
+        .increment(alu_count);
+    counter!("gpu.shader.interpret", "stage" => stage, "kind" => "vfetch")
+        .increment(vfetch_count);
+    counter!("gpu.shader.interpret", "stage" => stage, "kind" => "tfetch")
+        .increment(tfetch_count);
+    for (reason, n) in rejects {
+        counter!("gpu.shader.reject", "stage" => stage, "reason" => reason).increment(n);
+    }
+    for name in features {
+        counter!("gpu.feature.used", "stage" => stage, "name" => name).increment(1);
+    }
+}
+
+fn mark_feature(buf: &mut Vec<&'static str>, name: &'static str) {
+    if !buf.contains(&name) {
+        buf.push(name);
+    }
+}
+
+fn bump(buf: &mut Vec<(&'static str, u64)>, reason: &'static str) {
+    for entry in buf.iter_mut() {
+        if entry.0 == reason {
+            entry.1 += 1;
+            return;
+        }
+    }
+    buf.push((reason, 1));
+}
+
+fn vec_op_supported(op: u8) -> bool {
+    matches!(
+        op,
+        vop::ADD
+            | vop::MUL
+            | vop::MAX
+            | vop::MIN
+            | vop::SEQ
+            | vop::SGT
+            | vop::SGE
+            | vop::SNE
+            | vop::FRC
+            | vop::TRUNC
+            | vop::FLOOR
+            | vop::MAD
+            | vop::CND_EQ
+            | vop::CND_GE
+            | vop::CND_GT
+            | vop::DOT4
+            | vop::DOT3
+            | vop::DOT2_ADD
+            | vop::MAX4
+            | vop::KILL_EQ
+            | vop::KILL_GT
+            | vop::KILL_GE
+            | vop::KILL_NE
+            | vop::DST
+    )
+}
+
+fn scl_op_supported(op: u8) -> bool {
+    matches!(
+        op,
+        sop::ADDS
+            | sop::ADDS_PREV
+            | sop::MULS
+            | sop::MULS_PREV
+            | sop::MAXS
+            | sop::MINS
+            | sop::SEQS
+            | sop::SGTS
+            | sop::SGES
+            | sop::SNES
+            | sop::FRCS
+            | sop::TRUNCS
+            | sop::FLOORS
+            | sop::EXP
+            | sop::LOG
+            | sop::LOGC
+            | sop::RCP
+            | sop::RCPC
+            | sop::RCPF
+            | sop::RSQ
+            | sop::RSQC
+            | sop::RSQF
+            | sop::SQRT
+            | sop::SUBS
+            | sop::SUBS_PREV
+            | sop::SETP_EQ
+            | sop::SETP_NE
+            | sop::SETP_GT
+            | sop::SETP_GE
+            | sop::SETP_INV
+            | sop::SETP_POP
+            | sop::SETP_CLR
+            | sop::SETP_RSTR
+            | sop::KILLS_EQ
+            | sop::KILLS_GT
+            | sop::KILLS_GE
+            | sop::KILLS_NE
+            | sop::KILLS_ONE
+            | sop::SIN
+            | sop::COS
+            | sop::RETAIN_PREV
+    )
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::ucode::alu::{sop, vop};
+    use crate::ucode::control_flow::ControlFlowInstruction;
+
+    /// Build a minimal `ParsedShader` with one `Exec` clause containing
+    /// `count` ALU triples and assert the `alu` counter path works.
+    #[test]
+    fn emit_for_runs_on_synthetic_shader() {
+        let alu_w2 = (vop::ADD as u32) | ((sop::ADDS as u32) << 6) | (0xF << 12);
+        let shader = ParsedShader {
+            cf: vec![
+                ControlFlowInstruction::Exec {
+                    address: 0,
+                    count: 2,
+                    sequence: 0, // all ALU (no is-fetch bits)
+                    is_end: false,
+                    predicated: false,
+                    predicate_condition: false,
+                },
+                ControlFlowInstruction::Exit,
+            ],
+            instructions: vec![0, 0, alu_w2, 0, 0, alu_w2],
+        };
+        // Just smoke: doesn't panic. Counters are validated via metrics
+        // exporters elsewhere; we only assert this doesn't throw on a
+        // well-formed ParsedShader.
+        emit_for(&shader, "vs");
+    }
+
+    /// P8: a shader containing `LoopStart` should mark `cf_loop` as used
+    /// so the HUD can surface which deferred feature a game triggers.
+    #[test]
+    fn feature_detection_flags_loops_and_kills() {
+        let kill_alu_w2 =
+            (vop::KILL_EQ as u32) | ((sop::RETAIN_PREV as u32) << 6) | (0xF << 12);
+        let shader = ParsedShader {
+            cf: vec![
+                ControlFlowInstruction::LoopStart {
+                    address: 0,
+                    loop_id: 0,
+                },
+                ControlFlowInstruction::Exec {
+                    address: 0,
+                    count: 1,
+                    sequence: 0,
+                    is_end: true,
+                    predicated: false,
+                    predicate_condition: false,
+                },
+            ],
+            instructions: vec![0, 0, kill_alu_w2],
+        };
+        // Smoke: emits cleanly.
+        emit_for(&shader, "ps");
+    }
+
+    #[test]
+    fn unsupported_ops_classified_as_rejects() {
+        // Opcode 63 is outside our supported sets for both pipes.
+        let alu_w2 = 63u32 | (63u32 << 6) | (0xF << 12);
+        let shader = ParsedShader {
+            cf: vec![
+                ControlFlowInstruction::Exec {
+                    address: 0,
+                    count: 1,
+                    sequence: 0,
+                    is_end: true,
+                    predicated: false,
+                    predicate_condition: false,
+                },
+            ],
+            instructions: vec![0, 0, alu_w2],
+        };
+        // Again: smoke — but also confirm our static tables reject op 63.
+        assert!(!vec_op_supported(63));
+        assert!(!scl_op_supported(63));
+        emit_for(&shader, "ps");
+    }
+}
diff --git a/crates/xenia-gpu/src/shaders/mod.rs b/crates/xenia-gpu/src/shaders/mod.rs
new file mode 100644
index 0000000..fcd5bef
--- /dev/null
+++ b/crates/xenia-gpu/src/shaders/mod.rs
@@ -0,0 +1,36 @@
+//! Embedded WGSL shader sources used by the host pipeline.
+
+/// Xenos uber-shader scaffold (P3). See the comment at the top of
+/// [`XENOS_INTERP_WGSL`] for scope/limits and the plan's P3b target state.
+pub const XENOS_INTERP_WGSL: &str = include_str!("xenos_interp.wgsl");
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Parsing through naga validates the shader against WGSL spec + wgpu's
+    /// type system. We don't need a full pipeline to catch typos and layout
+    /// mistakes — this test is fast and catches regressions at `cargo test`
+    /// time.
+    #[test]
+    fn xenos_interp_wgsl_parses() {
+        let module = naga::front::wgsl::parse_str(XENOS_INTERP_WGSL)
+            .expect("xenos_interp.wgsl must parse cleanly");
+        // Sanity: we declared two entry points.
+        assert!(!module.entry_points.is_empty());
+        assert!(
+            module
+                .entry_points
+                .iter()
+                .any(|e| e.name == "vs_main" && e.stage == naga::ShaderStage::Vertex),
+            "missing vs_main entry"
+        );
+        assert!(
+            module
+                .entry_points
+                .iter()
+                .any(|e| e.name == "fs_main" && e.stage == naga::ShaderStage::Fragment),
+            "missing fs_main entry"
+        );
+    }
+}
diff --git a/crates/xenia-gpu/src/shaders/xenos_interp.wgsl b/crates/xenia-gpu/src/shaders/xenos_interp.wgsl
new file mode 100644
index 0000000..8fcab3e
--- /dev/null
+++ b/crates/xenia-gpu/src/shaders/xenos_interp.wgsl
@@ -0,0 +1,974 @@
+// xenia-rs Xenos runtime microcode interpreter — P3b WGSL.
+//
+// Bindings (stable across P3b milestones):
+//   @group(0) @binding(0) draw_ctx      (uniform, 16 B — XenosDrawConstants)
+//   @group(0) @binding(1) xenos_consts  (uniform, ~9.2 KB — XenosConstants)
+//   @group(0) @binding(2) vs_ucode      (storage<read>, packed VS shader)
+//   @group(0) @binding(3) ps_ucode      (storage<read>, packed PS shader)
+//   @group(0) @binding(4) vertex_buffer (storage<read>, raw guest VB dwords)
+//
+// Packed shader layout (both vs_ucode & ps_ucode):
+//   [0]                            = cf_count
+//   [1 .. 1 + cf_count*3]          = CF table: (kind, primary, aux) × cf_count
+//   [1 + cf_count*3 ..]            = instruction triples (3 dwords each)
+//
+// M3 state (this file): CF walker + operand decode helpers + register file
+// scaffold are complete. ALU / fetch bodies are still stubs that fall back
+// to the procedural-circle visualisation; M4-M7 fill them in.
+
+struct XenosDrawConstants {
+    draw_index: u32,
+    vertex_count: u32,
+    prim_kind: u32,
+    _pad: u32,
+};
+
+struct XenosConstants {
+    alu:          array<vec4<f32>, 512>,
+    fetch:        array<u32, 256>,
+    bool_consts:  array<u32, 8>,
+    loop_consts:  array<u32, 32>,
+};
+
+@group(0) @binding(0) var<uniform>            draw_ctx      : XenosDrawConstants;
+// `xenos_consts` is a read-only storage buffer (not uniform) because the
+// block contains tightly-packed `array<u32, N>` fields — WGSL's uniform
+// address space requires 16-byte element stride, which would triple the
+// allocation; storage accepts the natural 4-byte stride.
+@group(0) @binding(1) var<storage, read>      xenos_consts  : XenosConstants;
+@group(0) @binding(2) var<storage, read>      vs_ucode      : array<u32>;
+@group(0) @binding(3) var<storage, read>      ps_ucode      : array<u32>;
+@group(0) @binding(4) var<storage, read>      vertex_buffer : array<u32>;
+
+// M6 — texture fetch stub. Group 1 is a single 1×1 magenta placeholder for
+// all texture slots; the P5 texture cache will replace this with per-slot
+// bindings.
+@group(1) @binding(0) var xenos_tex  : texture_2d<f32>;
+@group(1) @binding(1) var xenos_samp : sampler;
+
+// ── CF kind codes; must match `xenia_gpu::ucode::cf_kind`. ─────────────
+const CF_KIND_EXEC:        u32 = 0u;
+const CF_KIND_EXEC_END:    u32 = 1u;
+const CF_KIND_ALLOC:       u32 = 2u;
+const CF_KIND_EXIT:        u32 = 3u;
+const CF_KIND_LOOP_START:  u32 = 4u;
+const CF_KIND_LOOP_END:    u32 = 5u;
+const CF_KIND_COND_JMP:    u32 = 6u;
+const CF_KIND_COND_CALL:   u32 = 7u;
+const CF_KIND_RETURN:      u32 = 8u;
+const CF_KIND_UNKNOWN:     u32 = 15u;
+
+// ── Alloc-kind codes (mirrors `xenia_gpu::ucode::cf_alloc_kind`). ──────
+const ALLOC_KIND_POSITION:      u32 = 0u;
+const ALLOC_KIND_INTERPOLATORS: u32 = 1u;
+const ALLOC_KIND_COLORS:        u32 = 2u;
+
+// Per-invocation Xenos register file + scalar `ps` + predicate.
+var<private> registers: array<vec4<f32>, 128>;
+var<private> ps: f32;
+var<private> predicate: bool;
+
+// Currently-active export alloc kind; set by Alloc clauses.
+var<private> current_alloc: u32;
+
+// P3c additions:
+//   `kill_flag` — set by vector/scalar kill ops; PS calls `discard` after the
+//                 interpreter exits. (`discard` inside a helper function is
+//                 allowed in WGSL, but keeping it at the entry level makes
+//                 control flow easier to read.)
+//   `loop_depth`/`loop_counters` — bookkeeping for LoopStart/LoopEnd CF
+//                 clauses. Xenos supports up to 4 nested loops.
+//   `reject_mask` — bitfield of op categories we failed to interpret, so the
+//                 PS fallback color + host-side diagnostics can surface it.
+var<private> kill_flag: bool;
+var<private> loop_depth: u32;
+var<private> loop_counters: array<u32, 4>;
+var<private> loop_starts: array<u32, 4>;
+var<private> reject_mask: u32;
+
+const REJECT_ALU_VEC:      u32 = 1u;
+const REJECT_ALU_SCL:      u32 = 2u;
+const REJECT_TEX_NON2D:    u32 = 4u;
+const REJECT_VFETCH_FMT:   u32 = 8u;
+const REJECT_CF_JUMP:      u32 = 16u;
+const REJECT_CF_CALL:      u32 = 32u;
+const REJECT_LOOP_OVERFLOW:u32 = 64u;
+
+struct VsOut {
+    @builtin(position) position: vec4<f32>,
+    @location(0) color: vec4<f32>,
+};
+
+struct FsOut {
+    @location(0) color0: vec4<f32>,
+};
+
+// ── CF helpers: read (kind,primary,aux) from a packed ucode storage buffer.
+
+fn vs_cf_count() -> u32 { return vs_ucode[0]; }
+fn ps_cf_count() -> u32 { return ps_ucode[0]; }
+
+fn vs_cf_kind(i: u32) -> u32    { return vs_ucode[1u + i * 3u + 0u] & 0xFFu; }
+fn vs_cf_primary(i: u32) -> u32 { return vs_ucode[1u + i * 3u + 1u]; }
+fn vs_cf_aux(i: u32) -> u32     { return vs_ucode[1u + i * 3u + 2u]; }
+fn ps_cf_kind(i: u32) -> u32    { return ps_ucode[1u + i * 3u + 0u] & 0xFFu; }
+fn ps_cf_primary(i: u32) -> u32 { return ps_ucode[1u + i * 3u + 1u]; }
+fn ps_cf_aux(i: u32) -> u32     { return ps_ucode[1u + i * 3u + 2u]; }
+
+fn vs_instr_base() -> u32 { return 1u + vs_cf_count() * 3u; }
+fn ps_instr_base() -> u32 { return 1u + ps_cf_count() * 3u; }
+
+// Fetch one of the 3 dwords of an ALU/fetch triple at triple index `t`.
+fn vs_instr_dword(t: u32, which: u32) -> u32 {
+    return vs_ucode[vs_instr_base() + t * 3u + which];
+}
+fn ps_instr_dword(t: u32, which: u32) -> u32 {
+    return ps_ucode[ps_instr_base() + t * 3u + which];
+}
+
+// ── Operand helpers: M3 scaffold; M4 wires them into interpret_alu.
+
+fn apply_swizzle(v: vec4<f32>, swizzle: u32) -> vec4<f32> {
+    // Swizzle is 8 bits: 2 bits per output lane, xyzw order.
+    let sx = (swizzle >> 0u) & 3u;
+    let sy = (swizzle >> 2u) & 3u;
+    let sz = (swizzle >> 4u) & 3u;
+    let sw = (swizzle >> 6u) & 3u;
+    return vec4<f32>(v[sx], v[sy], v[sz], v[sw]);
+}
+
+fn apply_modifiers(v: vec4<f32>, negate: bool, take_abs: bool) -> vec4<f32> {
+    var r = v;
+    if take_abs {
+        r = abs(r);
+    }
+    if negate {
+        r = -r;
+    }
+    return r;
+}
+
+fn write_reg_masked(idx: u32, mask: u32, value: vec4<f32>) {
+    if idx >= 128u {
+        return;
+    }
+    let prev = registers[idx];
+    var out = prev;
+    if (mask & 1u) != 0u { out.x = value.x; }
+    if (mask & 2u) != 0u { out.y = value.y; }
+    if (mask & 4u) != 0u { out.z = value.z; }
+    if (mask & 8u) != 0u { out.w = value.w; }
+    registers[idx] = out;
+}
+
+// ── Xenos ALU opcodes. Values match canary's `AluVectorOpcode` and
+// `AluScalarOpcode` enums in `ucode.h:1001,1354` (also mirrored in
+// `xenia_gpu::ucode::alu::{vop,sop}`).
+const VOP_ADD:          u32 = 0u;
+const VOP_MUL:          u32 = 1u;
+const VOP_MAX:          u32 = 2u;
+const VOP_MIN:          u32 = 3u;
+const VOP_SEQ:          u32 = 4u;
+const VOP_SGT:          u32 = 5u;
+const VOP_SGE:          u32 = 6u;
+const VOP_SNE:          u32 = 7u;
+const VOP_FRC:          u32 = 8u;
+const VOP_TRUNC:        u32 = 9u;
+const VOP_FLOOR:        u32 = 10u;
+const VOP_MAD:          u32 = 11u;
+const VOP_CND_EQ:       u32 = 12u;
+const VOP_CND_GE:       u32 = 13u;
+const VOP_CND_GT:       u32 = 14u;
+const VOP_DP4:          u32 = 15u;
+const VOP_DP3:          u32 = 16u;
+const VOP_DP2_ADD:      u32 = 17u;
+const VOP_CUBE:         u32 = 18u;
+const VOP_MAX4:         u32 = 19u;
+const VOP_KILL_EQ:      u32 = 24u;
+const VOP_KILL_GT:      u32 = 25u;
+const VOP_KILL_GE:      u32 = 26u;
+const VOP_KILL_NE:      u32 = 27u;
+const VOP_DST:          u32 = 28u;
+const VOP_MAX_A:        u32 = 29u;
+
+const SOP_ADDS:         u32 = 0u;
+const SOP_ADDS_PREV:    u32 = 1u;
+const SOP_MULS:         u32 = 2u;
+const SOP_MULS_PREV:    u32 = 3u;
+const SOP_MAXS:         u32 = 5u;
+const SOP_MINS:         u32 = 6u;
+const SOP_SEQS:         u32 = 7u;
+const SOP_SGTS:         u32 = 8u;
+const SOP_SGES:         u32 = 9u;
+const SOP_SNES:         u32 = 10u;
+const SOP_FRCS:         u32 = 11u;
+const SOP_TRUNCS:       u32 = 12u;
+const SOP_FLOORS:       u32 = 13u;
+const SOP_EXP:          u32 = 14u;
+const SOP_LOGC:         u32 = 15u;
+const SOP_LOG:          u32 = 16u;
+const SOP_RCPC:         u32 = 17u;
+const SOP_RCPF:         u32 = 18u;
+const SOP_RCP:          u32 = 19u;
+const SOP_RSQC:         u32 = 20u;
+const SOP_RSQF:         u32 = 21u;
+const SOP_RSQ:          u32 = 22u;
+const SOP_SUBS:         u32 = 25u;
+const SOP_SUBS_PREV:    u32 = 26u;
+const SOP_SETP_EQ:      u32 = 27u;
+const SOP_SETP_NE:      u32 = 28u;
+const SOP_SETP_GT:      u32 = 29u;
+const SOP_SETP_GE:      u32 = 30u;
+const SOP_SETP_INV:     u32 = 31u;
+const SOP_SETP_POP:     u32 = 32u;
+const SOP_SETP_CLR:     u32 = 33u;
+const SOP_SETP_RSTR:    u32 = 34u;
+const SOP_KILLS_EQ:     u32 = 35u;
+const SOP_KILLS_GT:     u32 = 36u;
+const SOP_KILLS_GE:     u32 = 37u;
+const SOP_KILLS_NE:     u32 = 38u;
+const SOP_KILLS_ONE:    u32 = 39u;
+const SOP_SQRT:         u32 = 40u;
+const SOP_SIN:          u32 = 48u;
+const SOP_COS:          u32 = 49u;
+const SOP_RETAIN_PREV:  u32 = 50u;
+
+// Read a vec4 source from the register file. Treats the src index as a
+// direct r# reference (ignores c# selector + swizzle/modifiers for MVP).
+// M4+ will extend this to decode the full operand header.
+fn read_src(idx: u32) -> vec4<f32> {
+    return registers[idx & 0x7Fu];
+}
+
+fn exec_vector_op(op: u32, a: vec4<f32>, b: vec4<f32>, c: vec4<f32>) -> vec4<f32> {
+    switch op {
+        case VOP_ADD:     { return a + b; }
+        case VOP_MUL:     { return a * b; }
+        case VOP_MAX:     { return max(a, b); }
+        case VOP_MIN:     { return min(a, b); }
+        case VOP_SEQ:     {
+            return vec4<f32>(
+                select(0.0, 1.0, a.x == b.x),
+                select(0.0, 1.0, a.y == b.y),
+                select(0.0, 1.0, a.z == b.z),
+                select(0.0, 1.0, a.w == b.w),
+            );
+        }
+        case VOP_SGT:     {
+            return vec4<f32>(
+                select(0.0, 1.0, a.x >  b.x),
+                select(0.0, 1.0, a.y >  b.y),
+                select(0.0, 1.0, a.z >  b.z),
+                select(0.0, 1.0, a.w >  b.w),
+            );
+        }
+        case VOP_SGE:     {
+            return vec4<f32>(
+                select(0.0, 1.0, a.x >= b.x),
+                select(0.0, 1.0, a.y >= b.y),
+                select(0.0, 1.0, a.z >= b.z),
+                select(0.0, 1.0, a.w >= b.w),
+            );
+        }
+        case VOP_SNE:     {
+            return vec4<f32>(
+                select(0.0, 1.0, a.x != b.x),
+                select(0.0, 1.0, a.y != b.y),
+                select(0.0, 1.0, a.z != b.z),
+                select(0.0, 1.0, a.w != b.w),
+            );
+        }
+        case VOP_FRC:     { return fract(a); }
+        case VOP_TRUNC:   { return trunc(a); }
+        case VOP_FLOOR:   { return floor(a); }
+        case VOP_MAD:     { return a * b + c; }
+        case VOP_CND_EQ: {
+            // dst = (src0 == 0) ? src1 : src2
+            return vec4<f32>(
+                select(c.x, b.x, a.x == 0.0),
+                select(c.y, b.y, a.y == 0.0),
+                select(c.z, b.z, a.z == 0.0),
+                select(c.w, b.w, a.w == 0.0),
+            );
+        }
+        case VOP_CND_GE: {
+            return vec4<f32>(
+                select(c.x, b.x, a.x >= 0.0),
+                select(c.y, b.y, a.y >= 0.0),
+                select(c.z, b.z, a.z >= 0.0),
+                select(c.w, b.w, a.w >= 0.0),
+            );
+        }
+        case VOP_CND_GT: {
+            return vec4<f32>(
+                select(c.x, b.x, a.x > 0.0),
+                select(c.y, b.y, a.y > 0.0),
+                select(c.z, b.z, a.z > 0.0),
+                select(c.w, b.w, a.w > 0.0),
+            );
+        }
+        case VOP_DP4:     {
+            let d = dot(a, b);
+            return vec4<f32>(d, d, d, d);
+        }
+        case VOP_DP3:     {
+            let d = dot(a.xyz, b.xyz);
+            return vec4<f32>(d, d, d, d);
+        }
+        case VOP_DP2_ADD: {
+            let d = a.x * b.x + a.y * b.y + c.x;
+            return vec4<f32>(d, d, d, d);
+        }
+        case VOP_MAX4: {
+            let m = max(max(a.x, a.y), max(a.z, a.w));
+            return vec4<f32>(m, m, m, m);
+        }
+        case VOP_KILL_EQ: {
+            if a.x == b.x || a.y == b.y || a.z == b.z || a.w == b.w {
+                kill_flag = true;
+                return vec4<f32>(1.0, 1.0, 1.0, 1.0);
+            }
+            return vec4<f32>(0.0, 0.0, 0.0, 0.0);
+        }
+        case VOP_KILL_GT: {
+            if a.x > b.x || a.y > b.y || a.z > b.z || a.w > b.w {
+                kill_flag = true;
+                return vec4<f32>(1.0, 1.0, 1.0, 1.0);
+            }
+            return vec4<f32>(0.0, 0.0, 0.0, 0.0);
+        }
+        case VOP_KILL_GE: {
+            if a.x >= b.x || a.y >= b.y || a.z >= b.z || a.w >= b.w {
+                kill_flag = true;
+                return vec4<f32>(1.0, 1.0, 1.0, 1.0);
+            }
+            return vec4<f32>(0.0, 0.0, 0.0, 0.0);
+        }
+        case VOP_KILL_NE: {
+            if a.x != b.x || a.y != b.y || a.z != b.z || a.w != b.w {
+                kill_flag = true;
+                return vec4<f32>(1.0, 1.0, 1.0, 1.0);
+            }
+            return vec4<f32>(0.0, 0.0, 0.0, 0.0);
+        }
+        case VOP_DST: {
+            // dest = (1, src0.y * src1.y, src0.z, src1.w)
+            return vec4<f32>(1.0, a.y * b.y, a.z, b.w);
+        }
+        case VOP_CUBE, VOP_MAX_A: {
+            // Cube face projection + MAX+AR are rare in P3c's target set;
+            // forward to max() fallback so MAX_A degrades gracefully and
+            // CUBE does *something* useful (max-axis selection).
+            reject_mask |= REJECT_ALU_VEC;
+            return max(a, b);
+        }
+        default: {
+            // Unsupported — identity fallback + diagnostic flag.
+            reject_mask |= REJECT_ALU_VEC;
+            return vec4<f32>(0.0, 0.0, 0.0, 0.0);
+        }
+    }
+}
+
+// Scalar op executor. Takes:
+//   `src_a` / `src_b` — the two scalar source components (most ops use
+//   only `src_a`; binary ops — ADDS/MULS/MAXS/MINS/SUBS — use both).
+//   `prev` — current `ps` chain value.
+// Kill/setp ops with side-effects also touch `kill_flag`/`predicate`.
+fn exec_scalar_op(op: u32, src_a: f32, src_b: f32, prev: f32) -> f32 {
+    switch op {
+        case SOP_ADDS:        { return src_a + src_b; }
+        case SOP_ADDS_PREV:   { return src_a + prev; }
+        case SOP_MULS:        { return src_a * src_b; }
+        case SOP_MULS_PREV:   { return src_a * prev; }
+        case SOP_SUBS:        { return src_a - src_b; }
+        case SOP_SUBS_PREV:   { return src_a - prev; }
+        case SOP_MAXS:        { return max(src_a, src_b); }
+        case SOP_MINS:        { return min(src_a, src_b); }
+        case SOP_SEQS:        { return select(0.0, 1.0, src_a == 0.0); }
+        case SOP_SGTS:        { return select(0.0, 1.0, src_a >  0.0); }
+        case SOP_SGES:        { return select(0.0, 1.0, src_a >= 0.0); }
+        case SOP_SNES:        { return select(0.0, 1.0, src_a != 0.0); }
+        case SOP_FRCS:        { return fract(src_a); }
+        case SOP_TRUNCS:      { return trunc(src_a); }
+        case SOP_FLOORS:      { return floor(src_a); }
+        case SOP_EXP:         {
+            // exp/EXP_IEEE: pow(2, src). Canary says src==0 -> 1.0, which
+            // WGSL's exp2 already produces for 0.
+            return exp2(src_a);
+        }
+        case SOP_LOG, SOP_LOGC: {
+            // log/logc both compute log2. Canary's LOGC clamps -INF to
+            // -FLT_MAX, but WGSL's log2 doesn't produce -INF for finite
+            // inputs > 0, and we leave src <= 0 to be the shader author's
+            // problem (identity fallback via select).
+            return select(log2(src_a), 0.0, src_a == 1.0);
+        }
+        case SOP_RCP, SOP_RCPC, SOP_RCPF: {
+            // IEEE reciprocal with src==0 guard; the clamp-variants differ
+            // only in how they treat INF/NaN, which is shader-author-rare.
+            return select(0.0, 1.0 / src_a, src_a != 0.0);
+        }
+        case SOP_RSQ, SOP_RSQC, SOP_RSQF: {
+            return select(0.0, inverseSqrt(src_a), src_a > 0.0);
+        }
+        case SOP_SQRT:        { return select(0.0, sqrt(src_a), src_a >= 0.0); }
+        case SOP_SIN:         { return sin(src_a); }
+        case SOP_COS:         { return cos(src_a); }
+        // Predicate writes — update `predicate` and produce a result that
+        // the surrounding ALU slot can still consume via `ps`. Canary's
+        // setp-variant dst-write semantics are preserved.
+        case SOP_SETP_EQ: {
+            predicate = (src_a == 0.0);
+            return select(1.0, 0.0, src_a == 0.0);
+        }
+        case SOP_SETP_NE: {
+            predicate = (src_a != 0.0);
+            return select(1.0, 0.0, src_a != 0.0);
+        }
+        case SOP_SETP_GT: {
+            predicate = (src_a > 0.0);
+            return select(1.0, 0.0, src_a > 0.0);
+        }
+        case SOP_SETP_GE: {
+            predicate = (src_a >= 0.0);
+            return select(1.0, 0.0, src_a >= 0.0);
+        }
+        case SOP_SETP_INV: {
+            if src_a == 1.0 {
+                predicate = true;
+                return 0.0;
+            } else {
+                predicate = false;
+                return select(src_a, 1.0, src_a == 0.0);
+            }
+        }
+        case SOP_SETP_POP: {
+            if src_a - 1.0 <= 0.0 {
+                predicate = true;
+                return 0.0;
+            } else {
+                predicate = false;
+                return src_a - 1.0;
+            }
+        }
+        case SOP_SETP_CLR: {
+            predicate = false;
+            // FLT_MAX sentinel. WGSL's bitcast keeps this portable.
+            return bitcast<f32>(0x7F7FFFFFu);
+        }
+        case SOP_SETP_RSTR: {
+            if src_a == 0.0 {
+                predicate = true;
+                return 0.0;
+            } else {
+                predicate = false;
+                return src_a;
+            }
+        }
+        // Pixel kill — set kill_flag; the dest value is 1.0 / 0.0 per
+        // canary, and `discard` runs at fragment exit when the flag is set.
+        case SOP_KILLS_EQ: {
+            if src_a == 0.0 { kill_flag = true; return 1.0; }
+            return 0.0;
+        }
+        case SOP_KILLS_GT: {
+            if src_a > 0.0 { kill_flag = true; return 1.0; }
+            return 0.0;
+        }
+        case SOP_KILLS_GE: {
+            if src_a >= 0.0 { kill_flag = true; return 1.0; }
+            return 0.0;
+        }
+        case SOP_KILLS_NE: {
+            if src_a != 0.0 { kill_flag = true; return 1.0; }
+            return 0.0;
+        }
+        case SOP_KILLS_ONE: {
+            if src_a == 1.0 { kill_flag = true; return 1.0; }
+            return 0.0;
+        }
+        case SOP_RETAIN_PREV: { return prev; }
+        default: {
+            reject_mask |= REJECT_ALU_SCL;
+            return 0.0;
+        }
+    }
+}
+
+fn interpret_alu(t: u32, is_vertex: bool) {
+    // Read the 3-dword instruction triple.
+    var w0: u32;
+    var w1: u32;
+    var w2: u32;
+    if is_vertex {
+        w0 = vs_instr_dword(t, 0u);
+        w1 = vs_instr_dword(t, 1u);
+        w2 = vs_instr_dword(t, 2u);
+    } else {
+        w0 = ps_instr_dword(t, 0u);
+        w1 = ps_instr_dword(t, 1u);
+        w2 = ps_instr_dword(t, 2u);
+    }
+    // Field extraction matches `xenia_gpu::ucode::alu::decode_alu`.
+    let vec_op  = w2 & 0x3Fu;
+    let scl_op  = (w2 >> 6u) & 0x3Fu;
+    let vec_dst = (w2 >> 16u) & 0x7Fu;
+    let scl_dst = (w2 >> 24u) & 0x7Fu;
+    let vec_wm  = (w2 >> 12u) & 0xFu;
+    let scl_wm  = (w2 >> 8u)  & 0xFu;
+    let src_a   = w0 & 0xFFu;
+    let src_b   = (w0 >> 8u)  & 0xFFu;
+    let src_c   = (w0 >> 16u) & 0xFFu;
+    let predicated           = ((w0 >> 27u) & 1u) != 0u;
+    let predicate_condition  = ((w0 >> 28u) & 1u) != 0u;
+    let scalar_src_is_ps     = ((w0 >> 26u) & 1u) != 0u;
+    // `w1` holds per-operand swizzle + negate/abs/c-vs-r flags. The MVP
+    // treats every source as a full r#, no modifiers — M4+ decodes it.
+    _ = w1;
+
+    // Honor per-instruction predicate: skip when predicated and the
+    // predicate doesn't match the required condition.
+    if predicated && (predicate != predicate_condition) {
+        return;
+    }
+
+    // Vector pipe.
+    let a = read_src(src_a);
+    let b = read_src(src_b);
+    let c = read_src(src_c);
+    let vec_result = exec_vector_op(vec_op, a, b, c);
+    if vec_wm != 0u {
+        write_reg_masked(vec_dst, vec_wm, vec_result);
+    }
+
+    // Scalar pipe. Binary scalar ops read (src_a.x, src_b.x); ps-variants
+    // read src_a.x and chain through the running `ps`. When `scalar_src_is_ps`
+    // is set the operand selector chooses `ps` as the primary source.
+    let scl_src_a = select(a.x, ps, scalar_src_is_ps);
+    let scl_src_b = b.x;
+    let new_ps = exec_scalar_op(scl_op, scl_src_a, scl_src_b, ps);
+    ps = new_ps;
+    if scl_wm != 0u {
+        write_reg_masked(scl_dst, scl_wm, vec4<f32>(new_ps, new_ps, new_ps, new_ps));
+    }
+}
+// Xenos VertexFormat values from `xenos.h:641`.
+const VFMT_8_8_8_8:         u32 = 6u;
+const VFMT_2_10_10_10:      u32 = 7u;
+const VFMT_10_11_11:        u32 = 16u;
+const VFMT_11_11_10:        u32 = 17u;
+const VFMT_16_16:           u32 = 25u;
+const VFMT_16_16_16_16:     u32 = 26u;
+const VFMT_16_16_FLOAT:     u32 = 31u;
+const VFMT_16_16_16_16_FLOAT:u32 = 32u;
+const VFMT_32:              u32 = 33u;
+const VFMT_32_32:           u32 = 34u;
+const VFMT_32_32_32_32:     u32 = 35u;
+const VFMT_32_FLOAT:        u32 = 36u;
+const VFMT_32_32_FLOAT:     u32 = 37u;
+const VFMT_32_32_32_32_FLOAT:u32 = 38u;
+const VFMT_32_32_32_FLOAT:  u32 = 57u;
+
+// Decode vertex fetch instruction fields (canary's VertexFetchInstruction
+// layout in `ucode.h:690`):
+//   w0 [4:0]   opcode
+//   w0 [10:5]  src_reg[5:0]
+//   w0 [17:11] dst_reg[6:0] + must-be-one
+//   w0 [21:17] const_index[4:0], [23:22] const_index_sel[1:0]
+//   w1 [21:16] format[5:0]
+//   w2 [7:0]   stride (in dwords)
+//   w2 [30:8]  offset (signed, in dwords)
+//
+// Pragmatic subset: we read format/stride; offset, exp_adjust, swizzle,
+// sign/normalization flags are used for the most-common normalized-unsigned
+// path. Rejects set `REJECT_VFETCH_FMT`.
+fn interpret_vertex_fetch(t: u32) {
+    let w0 = vs_instr_dword(t, 0u);
+    let w1 = vs_instr_dword(t, 1u);
+    let w2 = vs_instr_dword(t, 2u);
+    let fetch_const = (w0 >> 5u) & 0x1Fu;
+    let dst_reg = (w0 >> 10u) & 0x7Fu;
+    let src_reg = (w0 >> 17u) & 0x7Fu;
+    let format  = (w1 >> 16u) & 0x3Fu;
+    let stride  = w2 & 0xFFu;
+
+    // Vertex-fetch constant: dword 0 carries (type[1:0], address[31:2]);
+    // dword 1 carries (endian[1:0], size[25:2]).
+    let fc0 = xenos_consts.fetch[fetch_const * 2u + 0u];
+    let base_dwords = (fc0 & 0xFFFFFFFCu) >> 2u;
+
+    let vidx = u32(registers[src_reg & 0x7Fu].x);
+    // Per-vertex byte offset; stride==0 means 1 element total (non-indexed).
+    let effective_stride = select(stride, 4u, stride == 0u);
+    let addr = base_dwords + vidx * effective_stride;
+
+    let n = arrayLength(&vertex_buffer);
+    var result: vec4<f32> = vec4<f32>(0.0, 0.0, 0.0, 1.0);
+
+    switch format {
+        case VFMT_32_32_32_32_FLOAT: {
+            if addr + 3u < n {
+                result = vec4<f32>(
+                    bitcast<f32>(vertex_buffer[addr + 0u]),
+                    bitcast<f32>(vertex_buffer[addr + 1u]),
+                    bitcast<f32>(vertex_buffer[addr + 2u]),
+                    bitcast<f32>(vertex_buffer[addr + 3u]),
+                );
+            }
+        }
+        case VFMT_32_32_32_FLOAT: {
+            if addr + 2u < n {
+                result = vec4<f32>(
+                    bitcast<f32>(vertex_buffer[addr + 0u]),
+                    bitcast<f32>(vertex_buffer[addr + 1u]),
+                    bitcast<f32>(vertex_buffer[addr + 2u]),
+                    1.0,
+                );
+            }
+        }
+        case VFMT_32_32_FLOAT: {
+            if addr + 1u < n {
+                result = vec4<f32>(
+                    bitcast<f32>(vertex_buffer[addr + 0u]),
+                    bitcast<f32>(vertex_buffer[addr + 1u]),
+                    0.0,
+                    1.0,
+                );
+            }
+        }
+        case VFMT_32_FLOAT: {
+            if addr < n {
+                result = vec4<f32>(bitcast<f32>(vertex_buffer[addr]), 0.0, 0.0, 1.0);
+            }
+        }
+        case VFMT_8_8_8_8: {
+            if addr < n {
+                result = unpack4x8unorm(vertex_buffer[addr]);
+            }
+        }
+        case VFMT_16_16_FLOAT: {
+            if addr < n {
+                let h = unpack2x16float(vertex_buffer[addr]);
+                result = vec4<f32>(h.x, h.y, 0.0, 1.0);
+            }
+        }
+        case VFMT_16_16_16_16_FLOAT: {
+            if addr + 1u < n {
+                let h0 = unpack2x16float(vertex_buffer[addr]);
+                let h1 = unpack2x16float(vertex_buffer[addr + 1u]);
+                result = vec4<f32>(h0.x, h0.y, h1.x, h1.y);
+            }
+        }
+        case VFMT_16_16: {
+            if addr < n {
+                // Default to signed normalized; unsigned variants differ
+                // only for sign-extension and are less common on Xenos VBs.
+                let h = unpack2x16snorm(vertex_buffer[addr]);
+                result = vec4<f32>(h.x, h.y, 0.0, 1.0);
+            }
+        }
+        case VFMT_16_16_16_16: {
+            if addr + 1u < n {
+                let h0 = unpack2x16snorm(vertex_buffer[addr]);
+                let h1 = unpack2x16snorm(vertex_buffer[addr + 1u]);
+                result = vec4<f32>(h0.x, h0.y, h1.x, h1.y);
+            }
+        }
+        case VFMT_2_10_10_10: {
+            // Unpack 10-bit R/G/B (signed or unsigned, default unsigned
+            // normalized) + 2-bit A. Unnormalized not exercised here.
+            if addr < n {
+                let packed = vertex_buffer[addr];
+                let r = f32(packed & 0x3FFu) / 1023.0;
+                let g = f32((packed >> 10u) & 0x3FFu) / 1023.0;
+                let b = f32((packed >> 20u) & 0x3FFu) / 1023.0;
+                let a = f32((packed >> 30u) & 0x3u) / 3.0;
+                result = vec4<f32>(r, g, b, a);
+            }
+        }
+        default: {
+            reject_mask |= REJECT_VFETCH_FMT;
+            // Identity fallback preserves vertex-index visibility.
+            if addr < n {
+                result = vec4<f32>(f32(vertex_buffer[addr]) * 0.0, 0.0, 0.0, 1.0);
+            }
+        }
+    }
+
+    registers[dst_reg & 0x7Fu] = result;
+}
+
+// M6 — minimum-viable texture fetch. Every fetch samples the 1×1 magenta
+// dummy bound at group(1); the real per-slot texture cache lands with P5.
+// Reads (u, v) from the source register's .xy and writes the sample into
+// the destination register. `textureSampleLevel` works in both VS and PS
+// (no implicit derivatives), so no per-stage specialisation needed.
+fn interpret_texture_fetch(t: u32, is_vertex: bool) {
+    var w0: u32 = 0u;
+    if is_vertex {
+        w0 = vs_instr_dword(t, 0u);
+    } else {
+        w0 = ps_instr_dword(t, 0u);
+    }
+    let dst_reg = (w0 >> 10u) & 0x7Fu;
+    let src_reg = (w0 >> 17u) & 0x7Fu;
+    let uv = registers[src_reg & 0x7Fu].xy;
+    let sample = textureSampleLevel(xenos_tex, xenos_samp, uv, 0.0);
+    registers[dst_reg & 0x7Fu] = sample;
+}
+
+// Walk an Exec clause's instruction triples.
+//   sequence: 2-bit-per-triple bitmap. Bit 0 of a pair = serialize flag
+//             (we ignore in MVP); bit 1 = is-fetch.
+fn exec_vs(address: u32, count: u32, sequence: u32) {
+    for (var i: u32 = 0u; i < count; i = i + 1u) {
+        let t = address + i;
+        let is_fetch = ((sequence >> (i * 2u + 1u)) & 1u) != 0u;
+        if is_fetch {
+            let opcode = vs_instr_dword(t, 0u) & 0x1Fu;
+            // 0x00 = vertex fetch, 0x01 = texture fetch.
+            if opcode == 0u {
+                interpret_vertex_fetch(t);
+            } else if opcode == 1u {
+                interpret_texture_fetch(t, true);
+            }
+        } else {
+            interpret_alu(t, true);
+        }
+    }
+}
+fn exec_ps(address: u32, count: u32, sequence: u32) {
+    for (var i: u32 = 0u; i < count; i = i + 1u) {
+        let t = address + i;
+        let is_fetch = ((sequence >> (i * 2u + 1u)) & 1u) != 0u;
+        if is_fetch {
+            interpret_texture_fetch(t, false);
+        } else {
+            interpret_alu(t, false);
+        }
+    }
+}
+
+// Reset the per-invocation register state to a known baseline.
+fn reset_state() {
+    for (var i: u32 = 0u; i < 128u; i = i + 1u) {
+        registers[i] = vec4<f32>(0.0, 0.0, 0.0, 0.0);
+    }
+    ps = 0.0;
+    predicate = false;
+    current_alloc = 0u;
+    kill_flag = false;
+    loop_depth = 0u;
+    reject_mask = 0u;
+    for (var i: u32 = 0u; i < 4u; i = i + 1u) {
+        loop_counters[i] = 0u;
+        loop_starts[i] = 0u;
+    }
+}
+
+// ── Stage entry points.
+
+// M7 register slots for exports. VS writes position at oPos (convention:
+// Xenos export 0 within an `Alloc(Position)` range lands in registers[32])
+// and a set of interpolators. We track both via `current_alloc`: writes
+// inside each alloc range are tagged and copied out at Exit.
+const OPOS_REG:       u32 = 32u;  // synthetic slot used by the interpreter
+const OCOLOR_REG:     u32 = 33u;  // color0 scratch slot
+
+@vertex
+fn vs_main(@builtin(vertex_index) vidx: u32) -> VsOut {
+    reset_state();
+
+    // Seed r0 with the vertex index so simple shaders (or the procedural
+    // fallback) have access without a real vertex fetch.
+    registers[0] = vec4<f32>(f32(vidx), 0.0, 0.0, 1.0);
+    // Seed the export slots with a procedural fallback: if the shader
+    // never writes oPos / oColor, this keeps the output visible rather
+    // than collapsing to (0,0) which would skip rasterization.
+    let total = max(draw_ctx.vertex_count, 1u);
+    let t_param = f32(vidx) / f32(total);
+    let angle = t_param * 6.2831853;
+    let radius = 0.35;
+    registers[OPOS_REG] = vec4<f32>(cos(angle) * radius, sin(angle) * radius, 0.0, 1.0);
+    let d = f32(draw_ctx.draw_index);
+    registers[OCOLOR_REG] = vec4<f32>(
+        0.5 + 0.5 * sin(d * 0.37),
+        0.5 + 0.5 * sin(d * 0.51 + 2.0),
+        0.5 + 0.5 * sin(d * 0.73 + 4.0),
+        1.0,
+    );
+
+    // Dead-binding guard for VERTEX-stage-only vertex_buffer access.
+    let vb_live = f32(vertex_buffer[0]) * 0.0;
+
+    // Walk the VS CF table.
+    walk_cf_vs();
+
+    var out: VsOut;
+    // Use registers[OPOS_REG] as position; the procedural fallback above
+    // seeded it so an un-interpreted shader still draws a recognisable
+    // circle.
+    out.position = vec4<f32>(registers[OPOS_REG].xyz, registers[OPOS_REG].w);
+    out.color = vec4<f32>(registers[OCOLOR_REG].rgb + vec3<f32>(vb_live), registers[OCOLOR_REG].a);
+    return out;
+}
+
+@fragment
+fn fs_main(in: VsOut) -> FsOut {
+    reset_state();
+
+    walk_cf_ps();
+
+    // Kill: pixel shader `kill*` / `kills_*` opcodes set `kill_flag`;
+    // `discard` at the entry level (outside any helper) is the only way to
+    // guarantee early-out in WGSL.
+    if kill_flag {
+        discard;
+    }
+
+    var out: FsOut;
+    out.color0 = in.color;
+    return out;
+}
+
+// ── CF walker per stage. Shared logic: LOOP_START/LOOP_END iterate up to 16×
+// from `xenos_consts.loop_consts[loop_id]`, COND_JMP honours the instruction's
+// predicate flags, COND_CALL / RETURN set the reject mask (needs a call stack
+// we don't have). A hard iteration cap keeps the GPU from hanging on
+// malformed or extreme shaders.
+const CF_WALKER_MAX_ITER: u32 = 4096u;
+
+fn walk_cf_vs() {
+    let cf_n = vs_cf_count();
+    var cf_i: u32 = 0u;
+    var iter: u32 = 0u;
+    loop {
+        if cf_i >= cf_n || iter >= CF_WALKER_MAX_ITER { break; }
+        iter = iter + 1u;
+        let kind = vs_cf_kind(cf_i);
+        let primary = vs_cf_primary(cf_i);
+        let aux = vs_cf_aux(cf_i);
+        var advance: bool = true;
+        var stop: bool = false;
+        switch kind {
+            case CF_KIND_EXEC, CF_KIND_EXEC_END: {
+                let count = aux & 0xFFu;
+                let sequence = aux >> 8u;
+                exec_vs(primary, count, sequence);
+                if kind == CF_KIND_EXEC_END { stop = true; }
+            }
+            case CF_KIND_ALLOC: { current_alloc = primary; }
+            case CF_KIND_EXIT:  { stop = true; }
+            case CF_KIND_LOOP_START: {
+                let loop_id = aux & 0x1Fu;
+                var loop_count = xenos_consts.loop_consts[loop_id] & 0xFFu;
+                if loop_count > 16u {
+                    loop_count = 16u;
+                    reject_mask |= REJECT_LOOP_OVERFLOW;
+                }
+                if loop_count > 0u && loop_depth < 4u {
+                    loop_starts[loop_depth] = cf_i;
+                    loop_counters[loop_depth] = loop_count;
+                    loop_depth = loop_depth + 1u;
+                }
+                // count==0 → fall through; matching LOOP_END will pop.
+            }
+            case CF_KIND_LOOP_END: {
+                if loop_depth > 0u {
+                    let d = loop_depth - 1u;
+                    if loop_counters[d] > 1u {
+                        loop_counters[d] = loop_counters[d] - 1u;
+                        cf_i = loop_starts[d] + 1u;
+                        advance = false;
+                    } else {
+                        loop_counters[d] = 0u;
+                        loop_depth = d;
+                    }
+                }
+            }
+            case CF_KIND_COND_JMP: {
+                let pred_bits = aux;
+                let is_pred  = (pred_bits & 1u) != 0u;
+                let pred_cnd = (pred_bits & 2u) != 0u;
+                if !is_pred || predicate == pred_cnd {
+                    cf_i = primary;
+                    advance = false;
+                }
+            }
+            case CF_KIND_COND_CALL, CF_KIND_RETURN: {
+                // No call stack — mark and continue.
+                reject_mask |= REJECT_CF_CALL;
+            }
+            default: { reject_mask |= REJECT_CF_JUMP; }
+        }
+        if stop { break; }
+        if advance { cf_i = cf_i + 1u; }
+    }
+}
+
+fn walk_cf_ps() {
+    let cf_n = ps_cf_count();
+    var cf_i: u32 = 0u;
+    var iter: u32 = 0u;
+    loop {
+        if cf_i >= cf_n || iter >= CF_WALKER_MAX_ITER { break; }
+        iter = iter + 1u;
+        let kind = ps_cf_kind(cf_i);
+        let primary = ps_cf_primary(cf_i);
+        let aux = ps_cf_aux(cf_i);
+        var advance: bool = true;
+        var stop: bool = false;
+        switch kind {
+            case CF_KIND_EXEC, CF_KIND_EXEC_END: {
+                let count = aux & 0xFFu;
+                let sequence = aux >> 8u;
+                exec_ps(primary, count, sequence);
+                if kind == CF_KIND_EXEC_END { stop = true; }
+            }
+            case CF_KIND_ALLOC: { current_alloc = primary; }
+            case CF_KIND_EXIT:  { stop = true; }
+            case CF_KIND_LOOP_START: {
+                let loop_id = aux & 0x1Fu;
+                var loop_count = xenos_consts.loop_consts[loop_id] & 0xFFu;
+                if loop_count > 16u {
+                    loop_count = 16u;
+                    reject_mask |= REJECT_LOOP_OVERFLOW;
+                }
+                if loop_count > 0u && loop_depth < 4u {
+                    loop_starts[loop_depth] = cf_i;
+                    loop_counters[loop_depth] = loop_count;
+                    loop_depth = loop_depth + 1u;
+                }
+            }
+            case CF_KIND_LOOP_END: {
+                if loop_depth > 0u {
+                    let d = loop_depth - 1u;
+                    if loop_counters[d] > 1u {
+                        loop_counters[d] = loop_counters[d] - 1u;
+                        cf_i = loop_starts[d] + 1u;
+                        advance = false;
+                    } else {
+                        loop_counters[d] = 0u;
+                        loop_depth = d;
+                    }
+                }
+            }
+            case CF_KIND_COND_JMP: {
+                let pred_bits = aux;
+                let is_pred  = (pred_bits & 1u) != 0u;
+                let pred_cnd = (pred_bits & 2u) != 0u;
+                if !is_pred || predicate == pred_cnd {
+                    cf_i = primary;
+                    advance = false;
+                }
+            }
+            case CF_KIND_COND_CALL, CF_KIND_RETURN: {
+                reject_mask |= REJECT_CF_CALL;
+            }
+            default: { reject_mask |= REJECT_CF_JUMP; }
+        }
+        if stop { break; }
+        if advance { cf_i = cf_i + 1u; }
+    }
+}
diff --git a/crates/xenia-gpu/src/texture_cache.rs b/crates/xenia-gpu/src/texture_cache.rs
new file mode 100644
index 0000000..cc343bc
--- /dev/null
+++ b/crates/xenia-gpu/src/texture_cache.rs
@@ -0,0 +1,970 @@
+//! Texture cache — P5.
+//!
+//! Two-layer design mirroring canary's `TextureCache`:
+//!
+//!  * **CPU layer** (this module): owns decoded, linear, host-endian texel
+//!    byte buffers keyed by [`TextureKey`]. `ensure_cached` consults the
+//!    guest memory's page-version counter to decide whether the cached
+//!    bytes are still fresh and re-decodes on miss or staleness.
+//!  * **GPU layer** (xenia-ui `texture_cache_host`): owns the
+//!    `wgpu::Texture` + `TextureView` for each cached key; pulls decoded
+//!    bytes from this CPU layer on upload.
+//!
+//! Canary references: `texture_cache.h/.cc`, `texture_info.cc`, and
+//! `texture_info_formats.inl` for the format table.
+
+use std::collections::HashMap;
+
+use crate::tiled_address;
+
+/// Xenos texture formats — `xenos::TextureFormat` at `xenos.h:489-579`.
+/// Values are the raw enum numbers the guest writes into
+/// `xe_gpu_texture_fetch_t.format`.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+#[repr(u8)]
+pub enum TextureFormat {
+    K1Reverse = 0,
+    K1 = 1,
+    K8 = 2,
+    K1555 = 3,
+    K565 = 4,
+    K6_5_5 = 5,
+    K8888 = 6,
+    K1010102 = 7,
+    K8_8 = 10,
+    K4_4_4_4 = 15,
+    K10_11_11 = 16,
+    K11_11_10 = 17,
+    Dxt1 = 18,
+    Dxt2_3 = 19,
+    Dxt4_5 = 20,
+    K24_8 = 22,
+    K24_8Float = 23,
+    K16 = 24,
+    K16_16 = 25,
+    K16_16_16_16 = 26,
+    K16Float = 30,
+    K16_16Float = 31,
+    K16_16_16_16Float = 32,
+    K32 = 33,
+    K32_32 = 34,
+    K32_32_32_32 = 35,
+    K32Float = 36,
+    K32_32Float = 37,
+    K32_32_32_32Float = 38,
+    Unknown(u8),
+}
+
+impl TextureFormat {
+    pub fn from_raw(v: u8) -> Self {
+        use TextureFormat::*;
+        match v & 0x3F {
+            0 => K1Reverse,
+            1 => K1,
+            2 => K8,
+            3 => K1555,
+            4 => K565,
+            5 => K6_5_5,
+            6 => K8888,
+            7 => K1010102,
+            10 => K8_8,
+            15 => K4_4_4_4,
+            16 => K10_11_11,
+            17 => K11_11_10,
+            18 => Dxt1,
+            19 => Dxt2_3,
+            20 => Dxt4_5,
+            22 => K24_8,
+            23 => K24_8Float,
+            24 => K16,
+            25 => K16_16,
+            26 => K16_16_16_16,
+            30 => K16Float,
+            31 => K16_16Float,
+            32 => K16_16_16_16Float,
+            33 => K32,
+            34 => K32_32,
+            35 => K32_32_32_32,
+            36 => K32Float,
+            37 => K32_32Float,
+            38 => K32_32_32_32Float,
+            other => Unknown(other),
+        }
+    }
+
+    /// Block width/height in texels + bytes-per-block. For uncompressed
+    /// formats block_w = block_h = 1. For DXT formats block_w = block_h =
+    /// 4 (one 4×4 compressed block).
+    pub fn block_info(self) -> BlockInfo {
+        use TextureFormat::*;
+        match self {
+            K1Reverse | K1 => BlockInfo::new(1, 1, 1), // round up to 1 byte
+            K8 => BlockInfo::new(1, 1, 1),
+            K1555 | K565 | K6_5_5 | K4_4_4_4 | K16 | K16Float | K8_8 => BlockInfo::new(1, 1, 2),
+            K8888 | K1010102 | K10_11_11 | K11_11_10 | K24_8 | K24_8Float | K16_16
+            | K16_16Float | K32 | K32Float => BlockInfo::new(1, 1, 4),
+            K16_16_16_16 | K16_16_16_16Float | K32_32 | K32_32Float => BlockInfo::new(1, 1, 8),
+            K32_32_32_32 | K32_32_32_32Float => BlockInfo::new(1, 1, 16),
+            Dxt1 => BlockInfo::new(4, 4, 8),
+            Dxt2_3 | Dxt4_5 => BlockInfo::new(4, 4, 16),
+            Unknown(_) => BlockInfo::new(1, 1, 4), // safe-ish fallback
+        }
+    }
+
+    /// True iff this format lands on a wgpu texture format we can
+    /// natively bind — no CPU-side conversion per frame required. M5
+    /// adds `k_5_6_5` (CPU-expanded to `Rgba8Unorm` on decode; still
+    /// counts as supported for the host-cache wiring), `k_DXT2_3`
+    /// (BC2), and `k_DXT4_5` (BC3).
+    pub fn is_host_supported(self) -> bool {
+        matches!(
+            self,
+            TextureFormat::K8888
+                | TextureFormat::K565
+                | TextureFormat::Dxt1
+                | TextureFormat::Dxt2_3
+                | TextureFormat::Dxt4_5
+        )
+    }
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub struct BlockInfo {
+    pub block_w: u8,
+    pub block_h: u8,
+    pub bytes_per_block: u8,
+}
+
+impl BlockInfo {
+    pub const fn new(block_w: u8, block_h: u8, bytes_per_block: u8) -> Self {
+        Self {
+            block_w,
+            block_h,
+            bytes_per_block,
+        }
+    }
+    pub fn log2_bpb(self) -> u32 {
+        match self.bytes_per_block {
+            1 => 0,
+            2 => 1,
+            4 => 2,
+            8 => 3,
+            16 => 4,
+            _ => 0,
+        }
+    }
+}
+
+/// Xenos `Endian` enum from `xenos.h:198-204`. 2-bit field in fetch dword 1.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub enum Endian {
+    None = 0,
+    Swap8In16 = 1,
+    Swap8In32 = 2,
+    Swap16In32 = 3,
+}
+
+impl Endian {
+    pub fn from_raw(v: u8) -> Self {
+        match v & 0x3 {
+            1 => Endian::Swap8In16,
+            2 => Endian::Swap8In32,
+            3 => Endian::Swap16In32,
+            _ => Endian::None,
+        }
+    }
+
+    /// Apply this endian's byte swap to one 32-bit unit. Matches canary's
+    /// `shaders/endian.xesli:25-55` semantics; the WGSL translator pulls
+    /// the same mask-shift pattern.
+    pub fn swap32(self, v: u32) -> u32 {
+        match self {
+            Endian::None => v,
+            Endian::Swap8In16 => ((v & 0x00FF_00FF) << 8) | ((v & 0xFF00_FF00) >> 8),
+            Endian::Swap8In32 => v.swap_bytes(),
+            Endian::Swap16In32 => v.rotate_right(16),
+        }
+    }
+}
+
+/// Texture dimensionality (`xenos::DataDimension`).
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub enum Dimension {
+    D1 = 0,
+    D2 = 1,
+    D3Stacked = 2,
+    Cube = 3,
+}
+
+impl Dimension {
+    pub fn from_raw(v: u8) -> Self {
+        match v & 0x3 {
+            1 => Dimension::D2,
+            2 => Dimension::D3Stacked,
+            3 => Dimension::Cube,
+            _ => Dimension::D1,
+        }
+    }
+}
+
+/// Identity of a cached texture. Matches canary's `TextureCache::TextureKey`
+/// at the semantic level — we exclude mip/border state for P5 since neither
+/// is populated yet.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub struct TextureKey {
+    /// Guest physical base (byte address — already shifted left by 12 from
+    /// the fetch-constant `base_address` field).
+    pub base_address: u32,
+    pub width: u16,
+    pub height: u16,
+    pub depth_or_slices: u16,
+    pub format: TextureFormat,
+    pub endian: Endian,
+    pub dimension: Dimension,
+    pub tiled: bool,
+    /// Row pitch in texels, already aligned to 32. Canary stores pitch/32
+    /// in the fetch constant; we keep the raw texel count to avoid
+    /// callers remembering to shift.
+    pub pitch_texels: u16,
+}
+
+/// Decode a 6-dword texture fetch constant (layout at `xenos.h:1229-1329`).
+/// Returns `None` if the constant is obviously unset (all zeros) or if
+/// `type` is not the texture-constant marker.
+pub fn decode_fetch_constant(dwords: [u32; 6]) -> Option<TextureKey> {
+    let d0 = dwords[0];
+    let d1 = dwords[1];
+    let d2 = dwords[2];
+    let d5 = dwords[5];
+
+    // type: low 2 bits of dword 0 should be 2 (texture) per canary —
+    // 0 = vertex, 2 = texture. An all-zero constant reads as type 0 so
+    // `None` filters it out here.
+    let ty = d0 & 0x3;
+    if d0 == 0 && d1 == 0 {
+        return None;
+    }
+    // Not a texture constant (e.g. 0 = vertex fetch constant reused).
+    if ty != 2 {
+        return None;
+    }
+
+    let pitch_5 = (d0 >> 22) & 0x1FF; // pitch/32 in texels
+    let tiled = ((d0 >> 31) & 1) != 0;
+    let format = TextureFormat::from_raw((d1 & 0x3F) as u8);
+    let endian = Endian::from_raw(((d1 >> 6) & 0x3) as u8);
+    let base_address = (d1 >> 12) << 12; // base >> 12, re-shifted.
+    let dim = Dimension::from_raw(((d5 >> 9) & 0x3) as u8);
+
+    // Size decode depends on dimension.
+    let (width, height, depth) = match dim {
+        Dimension::D1 => ((d2 & 0x00FF_FFFF) as u16 + 1, 1u16, 1u16),
+        Dimension::D2 => (
+            (d2 & 0x1FFF) as u16 + 1,
+            ((d2 >> 13) & 0x1FFF) as u16 + 1,
+            ((d2 >> 26) & 0x3F) as u16 + 1,
+        ),
+        Dimension::D3Stacked | Dimension::Cube => (
+            (d2 & 0x7FF) as u16 + 1,
+            ((d2 >> 11) & 0x7FF) as u16 + 1,
+            ((d2 >> 22) & 0x3FF) as u16 + 1,
+        ),
+    };
+
+    Some(TextureKey {
+        base_address,
+        width,
+        height,
+        depth_or_slices: depth,
+        format,
+        endian,
+        dimension: dim,
+        tiled,
+        pitch_texels: ((pitch_5 as u16) * 32).max(width),
+    })
+}
+
+/// Decoded, linear, host-endian texture bytes ready for wgpu upload.
+#[derive(Debug, Clone)]
+pub struct CachedTexture {
+    pub key: TextureKey,
+    pub version_when_uploaded: u64,
+    /// Tightly packed. Layout depends on `key.format`:
+    /// - `K8888` → `width*height*4` bytes in Rgba8Unorm order.
+    /// - `Dxt1`  → `ceil(w/4)*ceil(h/4)*8` bytes of raw BC1 blocks, after
+    ///   block-level detile + dword-endian swap.
+    pub bytes: Vec<u8>,
+}
+
+impl CachedTexture {
+    pub fn byte_size(&self) -> usize {
+        self.bytes.len()
+    }
+}
+
+/// Errors that can happen during decode. The `ensure_cached` caller maps
+/// these to `gpu.texture.reject{reason}` metrics so the HUD surfaces when
+/// a texture fell back.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum DecodeError {
+    UnsupportedFormat,
+    OutOfBounds,
+    ZeroSize,
+}
+
+/// Read `len` bytes from guest memory starting at `addr`. Returns `None`
+/// if the span would exceed the memory's reported end; otherwise returns
+/// a freshly-allocated buffer with the bytes.
+///
+/// The `MemoryAccess` trait exposes per-byte reads only; we batch them in
+/// a single pass to avoid the per-byte virtual dispatch overhead for large
+/// textures (1 MiB frontbuffer = 1M dispatch calls).
+pub fn read_guest_bytes(
+    mem: &dyn xenia_memory::MemoryAccess,
+    addr: u32,
+    len: usize,
+) -> Vec<u8> {
+    let mut out = Vec::with_capacity(len);
+    for i in 0..len {
+        let a = addr.wrapping_add(i as u32);
+        out.push(mem.read_u8(a));
+        if a < addr {
+            // 32-bit overflow; unmap the tail.
+            break;
+        }
+    }
+    out
+}
+
+/// Byte-swap the 32-bit dwords of `buf` in place according to `endian`.
+/// `buf.len()` should be a multiple of 4; tail bytes are left untouched.
+pub fn apply_endian_32(buf: &mut [u8], endian: Endian) {
+    if matches!(endian, Endian::None) {
+        return;
+    }
+    let mut i = 0;
+    while i + 4 <= buf.len() {
+        let v = u32::from_le_bytes([buf[i], buf[i + 1], buf[i + 2], buf[i + 3]]);
+        let swapped = endian.swap32(v);
+        buf[i..i + 4].copy_from_slice(&swapped.to_le_bytes());
+        i += 4;
+    }
+}
+
+/// Decode a k_8_8_8_8 texture out of guest memory into `Rgba8Unorm` bytes.
+/// Applies Xenos→host channel swizzle (Xbox 360 stores BGRA in memory →
+/// we emit RGBA for wgpu) and the declared endian swap, then detiles via
+/// the Xenos Tiled2D formula.
+pub fn decode_k8888_tiled(
+    key: &TextureKey,
+    mem: &dyn xenia_memory::MemoryAccess,
+) -> Result<Vec<u8>, DecodeError> {
+    if key.width == 0 || key.height == 0 {
+        return Err(DecodeError::ZeroSize);
+    }
+    let w = key.width as u32;
+    let h = key.height as u32;
+    let pitch_aligned = tiled_address::align_pitch_to_macro_tile(key.pitch_texels as u32);
+    let total_bytes = (pitch_aligned * h * 4) as usize;
+    let mut raw = read_guest_bytes(mem, key.base_address, total_bytes);
+    if raw.len() < total_bytes {
+        return Err(DecodeError::OutOfBounds);
+    }
+    apply_endian_32(&mut raw, key.endian);
+    let mut linear = vec![0u8; (w * h * 4) as usize];
+    if key.tiled {
+        if tiled_address::detile_2d(&raw, &mut linear, w, h, pitch_aligned, 4).is_err() {
+            return Err(DecodeError::OutOfBounds);
+        }
+    } else {
+        // Non-tiled copy row-by-row honoring pitch.
+        for y in 0..h as usize {
+            let src = y * (pitch_aligned as usize) * 4;
+            let dst = y * (w as usize) * 4;
+            linear[dst..dst + (w as usize) * 4]
+                .copy_from_slice(&raw[src..src + (w as usize) * 4]);
+        }
+    }
+    // Xenos stores `k_8_8_8_8` in ARGB byte order (high nibble = A). After
+    // endian.Swap8In32 guests' typical per-dword byte order becomes BGRA
+    // in little-endian host bytes. Swap B↔R so we hand Rgba8Unorm to wgpu.
+    for px in linear.chunks_exact_mut(4) {
+        px.swap(0, 2);
+    }
+    Ok(linear)
+}
+
+/// Decode a DXT-compressed texture to raw block bytes (no format
+/// conversion — wgpu understands `Bc{1,2,3}RgbaUnorm` natively so the
+/// GPU does the actual decompression on upload).
+///
+/// Xenos stores DXT blocks in 4×4 block-tiled order using the Tiled2D
+/// formula, with stride counted in blocks. `bytes_per_block` is 8 for
+/// BC1 (DXT1), 16 for BC2 (DXT2_3) and BC3 (DXT4_5).
+pub fn decode_dxt_tiled(
+    key: &TextureKey,
+    mem: &dyn xenia_memory::MemoryAccess,
+    bytes_per_block: u32,
+) -> Result<Vec<u8>, DecodeError> {
+    if key.width == 0 || key.height == 0 {
+        return Err(DecodeError::ZeroSize);
+    }
+    let block_w = 4u32;
+    let block_h = 4u32;
+    let w_blocks = (key.width as u32).div_ceil(block_w);
+    let h_blocks = (key.height as u32).div_ceil(block_h);
+    let pitch_blocks = tiled_address::align_pitch_to_macro_tile(
+        (key.pitch_texels as u32).div_ceil(block_w),
+    );
+    let total_bytes = (pitch_blocks * h_blocks * bytes_per_block) as usize;
+    let mut raw = read_guest_bytes(mem, key.base_address, total_bytes);
+    if raw.len() < total_bytes {
+        return Err(DecodeError::OutOfBounds);
+    }
+    // DXT blocks are stored as 4×u16 + 4×u8-indices (BC1) or similar
+    // u16/u32-width fields for BC2/BC3; the Xbox 360's big-endian word
+    // order requires an endian swap at the u16/u32 level regardless of
+    // which BC-family format.
+    apply_endian_32(&mut raw, key.endian);
+
+    let mut out = vec![0u8; (w_blocks * h_blocks * bytes_per_block) as usize];
+    if key.tiled {
+        if tiled_address::detile_2d(
+            &raw,
+            &mut out,
+            w_blocks,
+            h_blocks,
+            pitch_blocks,
+            bytes_per_block,
+        )
+        .is_err()
+        {
+            return Err(DecodeError::OutOfBounds);
+        }
+    } else {
+        for y in 0..h_blocks as usize {
+            let src = y * (pitch_blocks as usize) * (bytes_per_block as usize);
+            let dst = y * (w_blocks as usize) * (bytes_per_block as usize);
+            out[dst..dst + (w_blocks as usize) * (bytes_per_block as usize)]
+                .copy_from_slice(&raw[src..src + (w_blocks as usize) * (bytes_per_block as usize)]);
+        }
+    }
+    Ok(out)
+}
+
+/// BC1 / DXT1 — 8-byte blocks.
+pub fn decode_dxt1_tiled(
+    key: &TextureKey,
+    mem: &dyn xenia_memory::MemoryAccess,
+) -> Result<Vec<u8>, DecodeError> {
+    decode_dxt_tiled(key, mem, 8)
+}
+
+/// BC2 / DXT2_3 — 16-byte blocks.
+pub fn decode_dxt23_tiled(
+    key: &TextureKey,
+    mem: &dyn xenia_memory::MemoryAccess,
+) -> Result<Vec<u8>, DecodeError> {
+    decode_dxt_tiled(key, mem, 16)
+}
+
+/// BC3 / DXT4_5 — 16-byte blocks.
+pub fn decode_dxt45_tiled(
+    key: &TextureKey,
+    mem: &dyn xenia_memory::MemoryAccess,
+) -> Result<Vec<u8>, DecodeError> {
+    decode_dxt_tiled(key, mem, 16)
+}
+
+/// **k_5_6_5** — 16-bit R:5 G:6 B:5 per texel (Xbox stores R in the high
+/// 5 bits of the 16-bit word). We unpack each texel into 4 bytes of
+/// `Rgba8Unorm` (A = 0xFF). wgpu doesn't ship `R5G6B5Unorm` as a
+/// sampled texture format on every backend, so CPU-side conversion is
+/// the safe path even if it's 2× the texture memory.
+///
+/// Tiling: Tiled2D at the **texel** level (block = 1 texel = 2 bytes),
+/// then we expand each 2-byte u16 into the 4-byte Rgba8 in the linear
+/// output buffer.
+pub fn decode_k565_tiled(
+    key: &TextureKey,
+    mem: &dyn xenia_memory::MemoryAccess,
+) -> Result<Vec<u8>, DecodeError> {
+    if key.width == 0 || key.height == 0 {
+        return Err(DecodeError::ZeroSize);
+    }
+    let w = key.width as u32;
+    let h = key.height as u32;
+    // Pitch/block counts — block = 1 texel here, 2 bytes.
+    let pitch_aligned = tiled_address::align_pitch_to_macro_tile(key.pitch_texels as u32);
+    let total_bytes = (pitch_aligned * h * 2) as usize;
+    let mut raw = read_guest_bytes(mem, key.base_address, total_bytes);
+    if raw.len() < total_bytes {
+        return Err(DecodeError::OutOfBounds);
+    }
+    // 16-bit word order is endian-swap-sensitive.
+    apply_endian_32(&mut raw, key.endian);
+    // Step 1: detile (bytes_per_block=2, tile in blocks=texels).
+    let mut linear_u16 = vec![0u8; (w * h * 2) as usize];
+    if key.tiled {
+        if tiled_address::detile_2d(&raw, &mut linear_u16, w, h, pitch_aligned, 2).is_err() {
+            return Err(DecodeError::OutOfBounds);
+        }
+    } else {
+        for y in 0..h as usize {
+            let src = y * (pitch_aligned as usize) * 2;
+            let dst = y * (w as usize) * 2;
+            linear_u16[dst..dst + (w as usize) * 2]
+                .copy_from_slice(&raw[src..src + (w as usize) * 2]);
+        }
+    }
+    // Step 2: expand each 16-bit RGB565 to Rgba8Unorm. The in-memory u16
+    // is little-endian after `apply_endian_32` has normalized the word
+    // order (we keep host-native byte ordering post-swap).
+    let mut rgba = vec![0u8; (w * h * 4) as usize];
+    for y in 0..h as usize {
+        for x in 0..w as usize {
+            let off = (y * w as usize + x) * 2;
+            let lo = linear_u16[off];
+            let hi = linear_u16[off + 1];
+            let word = u16::from_le_bytes([lo, hi]);
+            // 5 bits R (bits 11-15), 6 bits G (5-10), 5 bits B (0-4).
+            // Expand to full-range u8: replicate high bits into low
+            // (so 0b11111 → 0xFF, matching the standard 565→888 convention).
+            let r5 = ((word >> 11) & 0x1F) as u8;
+            let g6 = ((word >> 5) & 0x3F) as u8;
+            let b5 = (word & 0x1F) as u8;
+            let r = (r5 << 3) | (r5 >> 2);
+            let g = (g6 << 2) | (g6 >> 4);
+            let b = (b5 << 3) | (b5 >> 2);
+            let o = (y * w as usize + x) * 4;
+            rgba[o] = r;
+            rgba[o + 1] = g;
+            rgba[o + 2] = b;
+            rgba[o + 3] = 0xFF;
+        }
+    }
+    Ok(rgba)
+}
+
+/// Version-aware CPU-side texture cache. Entries are keyed on
+/// `TextureKey.hash` and carry a `version_when_uploaded` watermark against
+/// the guest memory's page-version counter. `ensure_cached` queries
+/// `GuestMemory::max_page_version` over the texture's byte span; if the
+/// span has been written since cache time, the entry is re-decoded.
+pub struct TextureCache {
+    entries: HashMap<TextureKey, CachedTexture>,
+    /// Monotonic counter of decodes performed — HUD surface.
+    pub decodes_total: u64,
+    /// Count of stale-miss re-decodes.
+    pub restale_total: u64,
+}
+
+impl Default for TextureCache {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl TextureCache {
+    pub fn new() -> Self {
+        Self {
+            entries: HashMap::new(),
+            decodes_total: 0,
+            restale_total: 0,
+        }
+    }
+
+    pub fn len(&self) -> usize {
+        self.entries.len()
+    }
+
+    pub fn is_empty(&self) -> bool {
+        self.entries.is_empty()
+    }
+
+    pub fn get(&self, key: &TextureKey) -> Option<&CachedTexture> {
+        self.entries.get(key)
+    }
+
+    /// Return a cached (or freshly-decoded) texture. The caller supplies
+    /// the current guest-memory page version covering the texture span;
+    /// see [`max_page_version_for`].
+    pub fn ensure_cached(
+        &mut self,
+        key: TextureKey,
+        current_version: u64,
+        mem: &dyn xenia_memory::MemoryAccess,
+    ) -> Result<&CachedTexture, DecodeError> {
+        // Fast path: fresh entry exists.
+        if let Some(e) = self.entries.get(&key) {
+            if e.version_when_uploaded >= current_version {
+                return Ok(self.entries.get(&key).unwrap());
+            }
+            self.restale_total += 1;
+        }
+        let bytes = match key.format {
+            TextureFormat::K8888 => decode_k8888_tiled(&key, mem)?,
+            TextureFormat::K565 => decode_k565_tiled(&key, mem)?,
+            TextureFormat::Dxt1 => decode_dxt1_tiled(&key, mem)?,
+            TextureFormat::Dxt2_3 => decode_dxt23_tiled(&key, mem)?,
+            TextureFormat::Dxt4_5 => decode_dxt45_tiled(&key, mem)?,
+            _ => return Err(DecodeError::UnsupportedFormat),
+        };
+        self.decodes_total += 1;
+        let entry = CachedTexture {
+            key,
+            version_when_uploaded: current_version,
+            bytes,
+        };
+        self.entries.insert(key, entry);
+        Ok(self.entries.get(&key).unwrap())
+    }
+
+    pub fn byte_budget(&self) -> usize {
+        self.entries.values().map(|e| e.byte_size()).sum()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    use std::cell::Cell;
+
+    struct FakeMem(Box<[Cell<u8>]>);
+    impl FakeMem {
+        fn from_vec(v: Vec<u8>) -> Self {
+            FakeMem(v.into_iter().map(Cell::new).collect())
+        }
+    }
+    impl xenia_memory::MemoryAccess for FakeMem {
+        fn read_u8(&self, a: u32) -> u8 {
+            self.0.get(a as usize).map(|c| c.get()).unwrap_or(0)
+        }
+        fn read_u16(&self, a: u32) -> u16 {
+            u16::from_be_bytes([self.read_u8(a), self.read_u8(a + 1)])
+        }
+        fn read_u32(&self, a: u32) -> u32 {
+            u32::from_be_bytes([
+                self.read_u8(a),
+                self.read_u8(a + 1),
+                self.read_u8(a + 2),
+                self.read_u8(a + 3),
+            ])
+        }
+        fn read_u64(&self, a: u32) -> u64 {
+            u64::from_be_bytes([
+                self.read_u8(a),
+                self.read_u8(a + 1),
+                self.read_u8(a + 2),
+                self.read_u8(a + 3),
+                self.read_u8(a + 4),
+                self.read_u8(a + 5),
+                self.read_u8(a + 6),
+                self.read_u8(a + 7),
+            ])
+        }
+        fn write_u8(&self, a: u32, v: u8) {
+            if let Some(slot) = self.0.get(a as usize) {
+                slot.set(v);
+            }
+        }
+        fn write_u16(&self, a: u32, v: u16) {
+            let b = v.to_be_bytes();
+            self.write_u8(a, b[0]);
+            self.write_u8(a + 1, b[1]);
+        }
+        fn write_u32(&self, a: u32, v: u32) {
+            let b = v.to_be_bytes();
+            for i in 0..4 {
+                self.write_u8(a + i as u32, b[i]);
+            }
+        }
+        fn write_u64(&self, a: u32, v: u64) {
+            let b = v.to_be_bytes();
+            for i in 0..8 {
+                self.write_u8(a + i as u32, b[i]);
+            }
+        }
+        fn translate(&self, _: u32) -> Option<*const u8> {
+            None
+        }
+        fn translate_mut(&self, _: u32) -> Option<*mut u8> {
+            None
+        }
+    }
+
+    #[test]
+    fn format_block_info_matches_canary_expectations() {
+        assert_eq!(
+            TextureFormat::K8888.block_info(),
+            BlockInfo::new(1, 1, 4)
+        );
+        assert_eq!(TextureFormat::Dxt1.block_info(), BlockInfo::new(4, 4, 8));
+        assert_eq!(
+            TextureFormat::Dxt4_5.block_info(),
+            BlockInfo::new(4, 4, 16)
+        );
+    }
+
+    #[test]
+    fn endian_swap_variants() {
+        assert_eq!(Endian::None.swap32(0x11223344), 0x11223344);
+        assert_eq!(Endian::Swap8In16.swap32(0x11223344), 0x22114433);
+        assert_eq!(Endian::Swap8In32.swap32(0x11223344), 0x44332211);
+        assert_eq!(Endian::Swap16In32.swap32(0x11223344), 0x33441122);
+    }
+
+    #[test]
+    fn decode_fetch_constant_rejects_empty() {
+        let z = [0u32; 6];
+        assert!(decode_fetch_constant(z).is_none());
+    }
+
+    #[test]
+    fn decode_fetch_constant_parses_2d_k8888() {
+        // Build a synthetic k_8_8_8_8 2D texture fetch constant:
+        //   dword0: pitch_5=40 (1280/32), tiled=1, type=2
+        //   dword1: format=6 (K8888), endian=2 (Swap8In32), base=0xAB000>>12
+        //   dword2: width-1=1279, height-1=719
+        //   dword5: dimension=1 (2D)
+        let d0 = 0x8000_0000 | (40u32 << 22) | 2;
+        let d1 = (0xAB000u32 >> 12 << 12) | (2u32 << 6) | 6u32;
+        let d2 = 1279u32 | ((719u32) << 13);
+        let d5 = 1u32 << 9;
+        let k = decode_fetch_constant([d0, d1, d2, 0, 0, d5]).expect("parsed");
+        assert_eq!(k.format, TextureFormat::K8888);
+        assert_eq!(k.endian, Endian::Swap8In32);
+        assert_eq!(k.width, 1280);
+        assert_eq!(k.height, 720);
+        assert_eq!(k.dimension, Dimension::D2);
+        assert!(k.tiled);
+        assert_eq!(k.pitch_texels, 1280);
+    }
+
+    #[test]
+    fn decode_k8888_roundtrip_linear() {
+        // Build a 4×4 non-tiled image with pitch=32 (one macro-tile row).
+        // Each pixel at (x, y) stores ARGB = (0xFF, x, y, y*4+x) as a
+        // big-endian dword. After Swap8In32 + B↔R swizzle, out[off..] must
+        // be (x, y, y*4+x, 0xFF) in RGBA order.
+        let w = 4u32;
+        let h = 4u32;
+        let pitch = 32u32;
+        let mut bytes = vec![0u8; (pitch * h * 4) as usize];
+        for y in 0..h {
+            for x in 0..w {
+                let off = ((y * pitch + x) * 4) as usize;
+                let argb = (0xFFu32 << 24)
+                    | ((x as u32) << 16)
+                    | ((y as u32) << 8)
+                    | ((y * 4 + x) as u32);
+                bytes[off..off + 4].copy_from_slice(&argb.to_be_bytes());
+            }
+        }
+        let mem = FakeMem::from_vec(bytes);
+        let key = TextureKey {
+            base_address: 0,
+            width: 4,
+            height: 4,
+            depth_or_slices: 1,
+            format: TextureFormat::K8888,
+            endian: Endian::Swap8In32,
+            dimension: Dimension::D2,
+            tiled: false,
+            pitch_texels: pitch as u16,
+        };
+        let out = decode_k8888_tiled(&key, &mem).expect("decode");
+        assert_eq!(out.len(), 16 * 4);
+        assert_eq!(&out[0..4], &[0, 0, 0, 0xFF]);
+        let off = ((3 * 4 + 3) * 4) as usize;
+        assert_eq!(&out[off..off + 4], &[3, 3, 15, 0xFF]);
+    }
+
+    // ── First-Pixels M5 format tests ──────────────────────────────
+
+    /// BC2 (DXT2_3) roundtrip: 16-byte blocks, 4×4 image = 1 block.
+    /// Synthetic source of 0xDEADBEEF... bytes; assert the decoder
+    /// returns the same bytes (passthrough after endian swap).
+    #[test]
+    fn decode_dxt23_small_roundtrip() {
+        // 4×4 texture = 1 BC2 block (16 bytes). With pitch_texels=32
+        // (macro-tile-aligned) the block pitch is 8 (=32/4), and we
+        // allocate 8*1*16 = 128 bytes of source.
+        let mut bytes = vec![0u8; 128];
+        for (i, b) in bytes.iter_mut().enumerate().take(16) {
+            *b = i as u8;
+        }
+        let mem = FakeMem::from_vec(bytes);
+        let key = TextureKey {
+            base_address: 0,
+            width: 4,
+            height: 4,
+            depth_or_slices: 1,
+            format: TextureFormat::Dxt2_3,
+            endian: Endian::None, // no swap — we can eyeball passthrough
+            dimension: Dimension::D2,
+            tiled: false,
+            pitch_texels: 32,
+        };
+        let out = decode_dxt23_tiled(&key, &mem).expect("decode");
+        assert_eq!(out.len(), 16); // 1 block × 16 bytes
+        for i in 0..16 {
+            assert_eq!(out[i], i as u8);
+        }
+    }
+
+    /// BC3 (DXT4_5) uses the same 16-byte block infra as BC2; a
+    /// parallel test prevents a regression that sneaks up via the
+    /// generic `decode_dxt_tiled`.
+    #[test]
+    fn decode_dxt45_uses_16byte_blocks() {
+        let mem = FakeMem::from_vec(vec![0xAAu8; 256]);
+        let key = TextureKey {
+            base_address: 0,
+            width: 8,
+            height: 4, // 2×1 blocks
+            depth_or_slices: 1,
+            format: TextureFormat::Dxt4_5,
+            endian: Endian::None,
+            dimension: Dimension::D2,
+            tiled: false,
+            pitch_texels: 32,
+        };
+        let out = decode_dxt45_tiled(&key, &mem).expect("decode");
+        assert_eq!(out.len(), 2 * 16);
+    }
+
+    /// k_5_6_5: a single white texel (all bits set, 0xFFFF) should
+    /// expand to RGBA8 white (0xFF, 0xFF, 0xFF, 0xFF). A single pure-red
+    /// texel (R=31, G=0, B=0 → word 0xF800) should expand to R=255 G=0
+    /// B=0 via the high-bit-replicate convention.
+    #[test]
+    fn decode_k565_texel_expansion() {
+        // Memory layout for a 2×1 non-tiled k_5_6_5 image (pitch=32 texels
+        // → 32 × 1 × 2 = 64 bytes). We store texel[0] = 0xFFFF (white),
+        // texel[1] = 0xF800 (pure red).
+        let mut bytes = vec![0u8; 64];
+        // 0xFFFF
+        bytes[0] = 0xFF;
+        bytes[1] = 0xFF;
+        // 0xF800 (big-endian memory): high byte 0xF8, low 0x00.
+        // But after apply_endian_32(Endian::None) we use little-endian
+        // word decoding — so memory must carry the bytes in LE order.
+        bytes[2] = 0x00;
+        bytes[3] = 0xF8;
+        let mem = FakeMem::from_vec(bytes);
+        let key = TextureKey {
+            base_address: 0,
+            width: 2,
+            height: 1,
+            depth_or_slices: 1,
+            format: TextureFormat::K565,
+            endian: Endian::None,
+            dimension: Dimension::D2,
+            tiled: false,
+            pitch_texels: 32,
+        };
+        let out = decode_k565_tiled(&key, &mem).expect("decode");
+        assert_eq!(out.len(), 2 * 4);
+        // Texel 0: white.
+        assert_eq!(&out[0..4], &[0xFF, 0xFF, 0xFF, 0xFF]);
+        // Texel 1: pure red via 5-bit-expand (0b11111 → 0xFF).
+        assert_eq!(&out[4..8], &[0xFF, 0x00, 0x00, 0xFF]);
+    }
+
+    #[test]
+    fn is_host_supported_covers_m5_formats() {
+        assert!(TextureFormat::K8888.is_host_supported());
+        assert!(TextureFormat::K565.is_host_supported());
+        assert!(TextureFormat::Dxt1.is_host_supported());
+        assert!(TextureFormat::Dxt2_3.is_host_supported());
+        assert!(TextureFormat::Dxt4_5.is_host_supported());
+        // Unsupported formats should still report false.
+        assert!(!TextureFormat::K16.is_host_supported());
+        assert!(!TextureFormat::K32Float.is_host_supported());
+    }
+
+    #[test]
+    fn texture_cache_caches_and_reuses() {
+        let mut cache = TextureCache::new();
+        let mem = FakeMem::from_vec(vec![0u8; 8 * 1024]);
+        let key = TextureKey {
+            base_address: 0,
+            width: 4,
+            height: 4,
+            depth_or_slices: 1,
+            format: TextureFormat::K8888,
+            endian: Endian::None,
+            dimension: Dimension::D2,
+            tiled: false,
+            pitch_texels: 32,
+        };
+        cache.ensure_cached(key, 0, &mem).unwrap();
+        assert_eq!(cache.decodes_total, 1);
+        // Same version: should hit cache.
+        cache.ensure_cached(key, 0, &mem).unwrap();
+        assert_eq!(cache.decodes_total, 1);
+        // Higher version: stale → re-decode.
+        cache.ensure_cached(key, 1, &mem).unwrap();
+        assert_eq!(cache.decodes_total, 2);
+        assert_eq!(cache.restale_total, 1);
+    }
+
+    /// End-to-end P5 test: a 6-dword fetch constant → decoded `TextureKey`
+    /// → `ensure_cached` on fresh/version-bumped memory → stale re-decode.
+    /// Mirrors what `vd_swap` does per frame.
+    #[test]
+    fn e2e_fetch_const_to_cache_with_versioning() {
+        // 4×4 k_8_8_8_8 2D tiled texture at base 0x100, pitch=32 aligned.
+        let d0 = 0x8000_0000u32 | (1u32 << 22) | 2; // pitch_5=1, tiled, type=2
+        let d1 = (0x100u32 >> 12 << 12) | (0u32 << 6) | 6; // K8888, endian=none
+        let d2 = 3u32 | (3u32 << 13); // width-1=3, height-1=3
+        let d5 = 1u32 << 9; // 2D
+        let key = decode_fetch_constant([d0, d1, d2, 0, 0, d5]).expect("decoded");
+        assert_eq!(key.format, TextureFormat::K8888);
+        assert_eq!(key.width, 4);
+
+        let mut mem = FakeMem::from_vec(vec![0xAAu8; 4 * 1024]);
+        let mut cache = TextureCache::new();
+        // v0 decode.
+        let first = cache
+            .ensure_cached(key, 0, &mem)
+            .expect("initial decode")
+            .clone();
+        // Same version → cache hit.
+        cache.ensure_cached(key, 0, &mem).expect("hit");
+        assert_eq!(cache.decodes_total, 1);
+        // Simulate the guest writing to the texture's pages: version bumps.
+        for b in &mem.0[..16] {
+            b.set(0xFF);
+        }
+        cache.ensure_cached(key, 1, &mem).expect("re-decode");
+        assert_eq!(cache.decodes_total, 2);
+        assert_eq!(cache.restale_total, 1);
+        // Bytes differ from v0 (proof the re-decode happened).
+        let second = cache.get(&key).unwrap();
+        assert_ne!(first.bytes, second.bytes);
+    }
+
+    #[test]
+    fn texture_cache_rejects_unsupported_format() {
+        let mut cache = TextureCache::new();
+        let mem = FakeMem::from_vec(vec![0u8; 1024]);
+        let key = TextureKey {
+            base_address: 0,
+            width: 4,
+            height: 4,
+            depth_or_slices: 1,
+            format: TextureFormat::K16,
+            endian: Endian::None,
+            dimension: Dimension::D2,
+            tiled: false,
+            pitch_texels: 32,
+        };
+        assert!(matches!(
+            cache.ensure_cached(key, 0, &mem),
+            Err(DecodeError::UnsupportedFormat)
+        ));
+    }
+}
diff --git a/crates/xenia-gpu/src/tiled_address.rs b/crates/xenia-gpu/src/tiled_address.rs
new file mode 100644
index 0000000..10dabfb
--- /dev/null
+++ b/crates/xenia-gpu/src/tiled_address.rs
@@ -0,0 +1,178 @@
+//! Xenos tiled-texture address formula (2D, Tiled2D layout).
+//!
+//! Port of `xenia-canary/src/xenia/gpu/texture_address.h:190-208` Tiled2D /
+//! `TiledCombine` helpers. The Xbox 360 GPU stores textures in a 32×32-block
+//! macro-tile pattern with bank+pipe interleave for its internal DRAM
+//! banks; this formula inverts that so we can read pixels out in linear
+//! order, given the tiled source buffer.
+//!
+//! We use this in two places during P4:
+//!  - `vd_swap` frontbuffer detile (1280×720 k_8_8_8_8 usually).
+//!  - Any place we need to read tiled guest memory into a host-linear
+//!    buffer for CPU-side conversion before upload.
+
+/// Tile size constants from canary.
+pub const MACRO_TILE_WIDTH_LOG2: u32 = 5; // 32 px
+pub const MACRO_TILE_HEIGHT_2D_LOG2: u32 = 5; // 32 px
+
+/// Canary's `TiledCombine` helper — reassembles the DRAM address from the
+/// outer-tile byte offset plus the bank/pipe/y-LSB interleave bits.
+#[inline]
+fn tiled_combine(outer_inner_bytes: u32, bank: u32, pipe: u32, y_lsb: u32) -> u32 {
+    (y_lsb << 4)
+        | (pipe << 6)
+        | (bank << 11)
+        | (outer_inner_bytes & 0b1111)
+        | (((outer_inner_bytes >> 4) & 0b1) << 5)
+        | (((outer_inner_bytes >> 5) & 0b111) << 8)
+        | ((outer_inner_bytes >> 8) << 12)
+}
+
+/// 2D tiled offset in bytes from (x, y) into a tiled surface with
+/// `pitch_aligned` pixel pitch (rounded to the macro-tile width) and
+/// `bytes_per_block_log2` bytes-per-element (e.g. 2 for RGBA8888 → 4-byte
+/// blocks → log2 = 2). Always non-negative for in-bounds inputs; returns
+/// `u32` rather than canary's signed `int` since our callers stay in
+/// unsigned arithmetic.
+///
+/// This is the canonical formula — do not simplify without re-reading
+/// `texture_address.h:190-208`; the bit-interleave cannot be expressed
+/// as a linear function.
+pub fn tiled_2d_offset(x: u32, y: u32, pitch_aligned: u32, bytes_per_block_log2: u32) -> u32 {
+    let macro_tile_cols = pitch_aligned >> MACRO_TILE_WIDTH_LOG2;
+    // Outer: which 32×32 macro tile we're in.
+    let outer_blocks = (((y >> MACRO_TILE_HEIGHT_2D_LOG2) * macro_tile_cols)
+        + (x >> MACRO_TILE_WIDTH_LOG2))
+        << 6;
+    // Inner: where we are within the 32×32 macro tile (Y dropped by 1 bit
+    // because that bit becomes the `y_lsb` interleave bit below).
+    let inner_blocks = (((y >> 1) & 0b111) << 3) | (x & 0b111);
+    let outer_inner_bytes = (outer_blocks | inner_blocks) << bytes_per_block_log2;
+
+    let bank = (y >> 4) & 0b1;
+    let pipe = ((x >> 3) & 0b11) ^ (((y >> 3) & 0b1) << 1);
+    let y_lsb = y & 1;
+
+    tiled_combine(outer_inner_bytes, bank, pipe, y_lsb)
+}
+
+/// Round `pitch_pixels` up to the nearest multiple of the macro-tile width
+/// (32 px). Xenos requires tile-aligned pitches; non-aligned values pad.
+#[inline]
+pub fn align_pitch_to_macro_tile(pitch_pixels: u32) -> u32 {
+    let mask = (1u32 << MACRO_TILE_WIDTH_LOG2) - 1;
+    (pitch_pixels + mask) & !mask
+}
+
+/// Detile a 2D tiled surface into a linear destination buffer. The
+/// closure `block_bytes(src_offset) -> &[u8]` returns a slice pointing at
+/// one block in the tiled source, and the detiler writes it into `dst`
+/// at the linear (x, y) position.
+///
+/// `bpp` is the bytes-per-block (4 for RGBA8888, 1 for a1r5g5b5 stored as
+/// a single 16-bit block, etc.). `dst` must be at least
+/// `width * height * bpp` bytes long.
+///
+/// Returns `Err(())` if the source doesn't contain enough bytes for the
+/// largest offset the formula would produce (defensive — callers can
+/// downgrade silently).
+pub fn detile_2d(
+    src: &[u8],
+    dst: &mut [u8],
+    width: u32,
+    height: u32,
+    pitch_pixels: u32,
+    bpp: u32,
+) -> Result<(), ()> {
+    let bpp_log2 = bpp.trailing_zeros();
+    let pitch_aligned = align_pitch_to_macro_tile(pitch_pixels);
+    let dst_pitch_bytes = (width * bpp) as usize;
+    let bpp_u = bpp as usize;
+
+    for y in 0..height {
+        for x in 0..width {
+            let src_off = tiled_2d_offset(x, y, pitch_aligned, bpp_log2) as usize;
+            if src_off + bpp_u > src.len() {
+                return Err(());
+            }
+            let dst_off = (y as usize) * dst_pitch_bytes + (x as usize) * bpp_u;
+            if dst_off + bpp_u > dst.len() {
+                return Err(());
+            }
+            dst[dst_off..dst_off + bpp_u].copy_from_slice(&src[src_off..src_off + bpp_u]);
+        }
+    }
+    Ok(())
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// The (0, 0) pixel is always at byte offset 0 regardless of pitch.
+    #[test]
+    fn origin_is_zero() {
+        assert_eq!(tiled_2d_offset(0, 0, 64, 2), 0);
+    }
+
+    /// Round-trip: detiling a tiled buffer that was filled using the same
+    /// formula produces the identity linear image.
+    #[test]
+    fn roundtrip_small_pattern() {
+        let w = 32u32;
+        let h = 16u32;
+        let bpp = 4u32;
+        let pitch = align_pitch_to_macro_tile(w);
+        // Allocate a tiled buffer large enough for the largest offset.
+        let max_off = (0..h)
+            .flat_map(|y| (0..w).map(move |x| tiled_2d_offset(x, y, pitch, 2) as usize + 4))
+            .max()
+            .unwrap();
+        let mut tiled = vec![0u8; max_off];
+        // Write a recognisable 4-byte pattern = (x, y, x^y, 0xFF) into
+        // each logical (x, y) position in the tiled buffer.
+        for y in 0..h {
+            for x in 0..w {
+                let off = tiled_2d_offset(x, y, pitch, 2) as usize;
+                tiled[off + 0] = x as u8;
+                tiled[off + 1] = y as u8;
+                tiled[off + 2] = (x ^ y) as u8;
+                tiled[off + 3] = 0xFF;
+            }
+        }
+        let mut linear = vec![0u8; (w * h * bpp) as usize];
+        detile_2d(&tiled, &mut linear, w, h, pitch, bpp).expect("detile ok");
+        // Verify every logical pixel landed at the right linear offset.
+        for y in 0..h {
+            for x in 0..w {
+                let lin = ((y * w + x) * bpp) as usize;
+                assert_eq!(linear[lin + 0], x as u8);
+                assert_eq!(linear[lin + 1], y as u8);
+                assert_eq!(linear[lin + 2], (x ^ y) as u8);
+                assert_eq!(linear[lin + 3], 0xFF);
+            }
+        }
+    }
+
+    /// Within a single macro-tile row, stepping `x` by 1 changes the low
+    /// 3 bits of `x` which feed the `inner_blocks` field — different
+    /// offsets are expected (no aliasing).
+    #[test]
+    fn neighbouring_pixels_have_distinct_offsets() {
+        let mut seen = std::collections::HashSet::new();
+        for y in 0..16 {
+            for x in 0..32 {
+                assert!(seen.insert(tiled_2d_offset(x, y, 32, 2)));
+            }
+        }
+    }
+
+    /// Pitch alignment is a power-of-two rounding: 1280 stays 1280, 1281
+    /// rounds to 1312.
+    #[test]
+    fn align_pitch_rounds_up_to_32() {
+        assert_eq!(align_pitch_to_macro_tile(1280), 1280);
+        assert_eq!(align_pitch_to_macro_tile(1281), 1312);
+        assert_eq!(align_pitch_to_macro_tile(31), 32);
+    }
+}
diff --git a/crates/xenia-gpu/src/translator.rs b/crates/xenia-gpu/src/translator.rs
new file mode 100644
index 0000000..9a8d8c1
--- /dev/null
+++ b/crates/xenia-gpu/src/translator.rs
@@ -0,0 +1,557 @@
+//! Xenos → WGSL direct translator (P7).
+//!
+//! Replaces the runtime uber-shader interpreter (P3b/P3c) for shaders whose
+//! feature set we cover. Emits a *standalone* WGSL module per shader
+//! instead of walking a ucode buffer at draw time — pipeline compilation
+//! happens once, then every subsequent dispatch is a direct `draw()`.
+//!
+//! The translator is deliberately narrow: when it encounters an opcode /
+//! fetch format / CF shape it doesn't know, it returns [`None`] and the
+//! caller falls back to the interpreter. This keeps the op-coverage work
+//! incremental — future commits can add one opcode at a time without
+//! invalidating the scaffolding.
+//!
+//! Current coverage (v1):
+//!  * Linear CF: `Exec`/`ExecEnd`, `Alloc`, `Exit`. No loops / branches /
+//!    calls / predicate-gated clauses.
+//!  * ALU vector: `ADD`, `MUL`, `MAX`, `MIN`, `MAD`, `DP4`, `DP3`,
+//!    `DP2_ADD`, `SEQ`, `SGT`, `SGE`, `SNE`, `FRC`, `FLOOR`.
+//!  * ALU scalar: `ADDS`, `MULS`, `MAXS`, `MINS`, `RCP`, `RETAIN_PREV`.
+//!  * Vertex fetch: `R32G32B32A32_FLOAT` only.
+//!  * Texture fetch: 2D via the single `@group(1)` slot (same one P5/M6
+//!    binds).
+//!  * Exports: VS writes position + interpolator 0 (color); PS writes
+//!    color0.
+//!
+//! When a shader exceeds this subset, [`translate`] returns `None` and
+//! `gpu.shader.translate_reject{reason}` is bumped by the caller.
+
+use crate::ucode::alu::{decode_alu, sop, vop, AluInstruction};
+use crate::ucode::control_flow::{AllocKind, ControlFlowInstruction};
+use crate::ucode::fetch::{decode_fetch, FetchInstruction};
+use crate::ucode::ParsedShader;
+
+/// Shader stage we're emitting for.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum Stage {
+    Vertex,
+    Pixel,
+}
+
+/// Success or refusal from the translator. On refusal, the caller falls
+/// back to the runtime uber-shader interpreter.
+#[derive(Debug)]
+pub enum Translation {
+    /// The emitted WGSL body for *this stage only*. Both VS + PS get
+    /// wrapped into one module via [`combine_stages`].
+    Ok(String),
+    /// Translator saw an op/pattern it doesn't handle; fallback.
+    Reject(&'static str),
+}
+
+/// Full WGSL module for a (VS, PS) pair ready to hand to
+/// `wgpu::Device::create_shader_module`. Shares the header across the two
+/// bodies so bindings, struct declarations, and helpers aren't duplicated.
+pub fn combine_stages(vs_body: &str, ps_body: &str) -> String {
+    let mut out = String::with_capacity(4096 + vs_body.len() + ps_body.len());
+    out.push_str(MODULE_HEADER);
+    out.push_str(vs_body);
+    out.push_str(ps_body);
+    out
+}
+
+/// Translate a single shader stage. Returns `None` on any unsupported
+/// feature with a short reason string that the caller plumbs into the
+/// `gpu.shader.translate_reject{reason}` metric.
+pub fn translate(parsed: &ParsedShader, stage: Stage) -> Translation {
+    let mut ctx = EmitCtx::new(stage);
+    // Emit the stage entry function body.
+    if let Err(reason) = ctx.emit_stage_body(parsed) {
+        return Translation::Reject(reason);
+    }
+    Translation::Ok(ctx.finish())
+}
+
+/// Reject reasons; kept as static &'str for zero-alloc metrics.
+pub mod reject {
+    pub const VEC_OP_UNSUPPORTED: &str = "vec_op_unsupported";
+    pub const SCL_OP_UNSUPPORTED: &str = "scl_op_unsupported";
+    pub const CF_LOOP: &str = "cf_loop";
+    pub const CF_COND: &str = "cf_cond";
+    pub const CF_CALL: &str = "cf_call";
+    pub const CF_UNKNOWN: &str = "cf_unknown";
+    pub const VFETCH_FMT: &str = "vfetch_fmt";
+    pub const TFETCH_NON2D: &str = "tfetch_non2d";
+    pub const INSTR_OOB: &str = "instr_oob";
+}
+
+/// Shader-module preamble (bindings, helpers, struct defs). The bindings
+/// mirror the xenos pipeline's `@group(0)` + `@group(1)` layout from P5/M6
+/// so we can use **the same bind-group slots** — only the pipeline object
+/// differs between interpreter mode and translator mode.
+const MODULE_HEADER: &str = r#"
+struct XenosDrawConstants {
+    draw_index: u32,
+    vertex_count: u32,
+    prim_kind: u32,
+    _pad: u32,
+};
+
+struct XenosConstants {
+    alu:         array<vec4<f32>, 512>,
+    fetch:       array<u32, 256>,
+    bool_consts: array<u32, 8>,
+    loop_consts: array<u32, 32>,
+};
+
+@group(0) @binding(0) var<uniform>       draw_ctx      : XenosDrawConstants;
+@group(0) @binding(1) var<storage, read> xenos_consts  : XenosConstants;
+@group(0) @binding(2) var<storage, read> vs_ucode      : array<u32>;
+@group(0) @binding(3) var<storage, read> ps_ucode      : array<u32>;
+@group(0) @binding(4) var<storage, read> vertex_buffer : array<u32>;
+
+@group(1) @binding(0) var xenos_tex  : texture_2d<f32>;
+@group(1) @binding(1) var xenos_samp : sampler;
+
+struct VsOut {
+    @builtin(position) position: vec4<f32>,
+    @location(0) color: vec4<f32>,
+};
+
+struct FsOut {
+    @location(0) color0: vec4<f32>,
+};
+
+// Helper: reciprocal guarded against divide-by-zero.
+fn xe_rcp(x: f32) -> f32 {
+    return select(0.0, 1.0 / x, x != 0.0);
+}
+"#;
+
+struct EmitCtx {
+    stage: Stage,
+    out: String,
+    indent: usize,
+}
+
+impl EmitCtx {
+    fn new(stage: Stage) -> Self {
+        Self {
+            stage,
+            out: String::with_capacity(2048),
+            indent: 0,
+        }
+    }
+
+    fn finish(self) -> String {
+        self.out
+    }
+
+    fn push(&mut self, s: &str) {
+        for _ in 0..self.indent {
+            self.out.push_str("    ");
+        }
+        self.out.push_str(s);
+        self.out.push('\n');
+    }
+
+    fn emit_stage_body(&mut self, parsed: &ParsedShader) -> Result<(), &'static str> {
+        // Entry function + struct header.
+        match self.stage {
+            Stage::Vertex => {
+                self.push("@vertex");
+                self.push("fn vs_main(@builtin(vertex_index) vidx: u32) -> VsOut {");
+            }
+            Stage::Pixel => {
+                self.push("@fragment");
+                self.push("fn fs_main(in: VsOut) -> FsOut {");
+            }
+        }
+        self.indent = 1;
+        // Register file + ps chain + export slots. All local `var`s so each
+        // invocation gets its own state; translator-emitted code doesn't
+        // need `var<private>` because we don't share across function calls.
+        self.push("var r: array<vec4<f32>, 128>;");
+        self.push("for (var i = 0u; i < 128u; i = i + 1u) { r[i] = vec4<f32>(0.0); }");
+        self.push("var ps: f32 = 0.0;");
+        match self.stage {
+            Stage::Vertex => {
+                // Seed r0 with vertex index for simple shaders that read it.
+                self.push("r[0] = vec4<f32>(f32(vidx), 0.0, 0.0, 1.0);");
+                // Synthetic export slots — match the interpreter's layout so
+                // the fallback path and translator path produce the same
+                // visual output on shaders both support.
+                self.push("var opos: vec4<f32> = vec4<f32>(0.0, 0.0, 0.0, 1.0);");
+                self.push("var ocolor: vec4<f32> = vec4<f32>(1.0, 1.0, 1.0, 1.0);");
+            }
+            Stage::Pixel => {
+                // Seed r0.xy with interpolated color lane so trivial shaders
+                // that read r0 still produce something.
+                self.push("r[0] = in.color;");
+                self.push("var ocolor0: vec4<f32> = in.color;");
+            }
+        }
+
+        let mut current_alloc = AllocKind::Other;
+        for clause in &parsed.cf {
+            match clause {
+                ControlFlowInstruction::Exec {
+                    address,
+                    count,
+                    sequence,
+                    is_end,
+                    predicated,
+                    ..
+                } => {
+                    if *predicated {
+                        return Err(reject::CF_COND);
+                    }
+                    self.emit_exec(parsed, *address, *count, *sequence, current_alloc)?;
+                    if *is_end {
+                        break;
+                    }
+                }
+                ControlFlowInstruction::Alloc { kind, .. } => {
+                    current_alloc = *kind;
+                }
+                ControlFlowInstruction::Exit => break,
+                ControlFlowInstruction::LoopStart { .. }
+                | ControlFlowInstruction::LoopEnd { .. } => return Err(reject::CF_LOOP),
+                ControlFlowInstruction::CondJmp { .. } => return Err(reject::CF_COND),
+                ControlFlowInstruction::CondCall { .. } | ControlFlowInstruction::Return => {
+                    return Err(reject::CF_CALL);
+                }
+                ControlFlowInstruction::Unknown { .. } => return Err(reject::CF_UNKNOWN),
+            }
+        }
+
+        match self.stage {
+            Stage::Vertex => {
+                self.push("var out: VsOut;");
+                self.push("out.position = opos;");
+                self.push("out.color = ocolor;");
+                self.push("return out;");
+            }
+            Stage::Pixel => {
+                self.push("var out: FsOut;");
+                self.push("out.color0 = ocolor0;");
+                self.push("return out;");
+            }
+        }
+        self.indent = 0;
+        self.push("}");
+        Ok(())
+    }
+
+    fn emit_exec(
+        &mut self,
+        parsed: &ParsedShader,
+        address: u32,
+        count: u32,
+        sequence: u32,
+        current_alloc: AllocKind,
+    ) -> Result<(), &'static str> {
+        for i in 0..(count as usize) {
+            let triple_idx = address as usize + i;
+            let base = triple_idx * 3;
+            if base + 2 >= parsed.instructions.len() {
+                return Err(reject::INSTR_OOB);
+            }
+            let words = [
+                parsed.instructions[base],
+                parsed.instructions[base + 1],
+                parsed.instructions[base + 2],
+            ];
+            let is_fetch = ((sequence >> (i * 2 + 1)) & 1) != 0;
+            if is_fetch {
+                match decode_fetch(words) {
+                    FetchInstruction::Vertex(vf) => self.emit_vfetch(&vf)?,
+                    FetchInstruction::Texture(tf) => {
+                        if tf.dimension != 1 {
+                            return Err(reject::TFETCH_NON2D);
+                        }
+                        self.emit_tfetch(&tf);
+                    }
+                    FetchInstruction::Unknown { .. } => return Err(reject::VFETCH_FMT),
+                }
+            } else {
+                let alu = decode_alu(words);
+                self.emit_alu(&alu, current_alloc)?;
+            }
+        }
+        Ok(())
+    }
+
+    fn emit_alu(
+        &mut self,
+        alu: &AluInstruction,
+        current_alloc: AllocKind,
+    ) -> Result<(), &'static str> {
+        let a = format!("r[{}u]", alu.src_a & 0x7F);
+        let b = format!("r[{}u]", alu.src_b & 0x7F);
+        let c = format!("r[{}u]", alu.src_c & 0x7F);
+
+        // Vector pipe.
+        if alu.vector_write_mask != 0 {
+            let expr = vector_expr(alu.vector_opcode, &a, &b, &c)
+                .ok_or(reject::VEC_OP_UNSUPPORTED)?;
+            let dst_reg = alu.vector_dest & 0x7F;
+            if alu.vector_dest_is_export {
+                self.emit_export(dst_reg, current_alloc, &expr, alu.vector_write_mask);
+            } else {
+                self.emit_masked_write(&format!("r[{dst_reg}u]"), &expr, alu.vector_write_mask);
+            }
+        }
+
+        // Scalar pipe. Binary ops use (src_a.x, src_b.x); ps-variants use
+        // src_a.x + running ps. `scl_src_a` mirrors the interpreter's
+        // `scalar_src_is_ps` selector.
+        let scl_src_a = if alu.scalar_src_is_ps {
+            "ps".to_string()
+        } else {
+            format!("{}.x", a)
+        };
+        let scl_src_b = format!("{}.x", b);
+        let expr = scalar_expr(alu.scalar_opcode, &scl_src_a, &scl_src_b, "ps")
+            .ok_or(reject::SCL_OP_UNSUPPORTED)?;
+        self.push(&format!("ps = {expr};"));
+        if alu.scalar_write_mask != 0 {
+            let v = "vec4<f32>(ps, ps, ps, ps)";
+            let dst_reg = alu.scalar_dest & 0x7F;
+            self.emit_masked_write(&format!("r[{dst_reg}u]"), v, alu.scalar_write_mask);
+        }
+        Ok(())
+    }
+
+    fn emit_masked_write(&mut self, lhs: &str, rhs: &str, mask: u8) {
+        if mask == 0xF {
+            self.push(&format!("{lhs} = {rhs};"));
+            return;
+        }
+        self.push(&"{".to_string());
+        self.indent += 1;
+        self.push(&format!("let _prev = {lhs};"));
+        self.push(&format!("let _new = {rhs};"));
+        let mut components = Vec::new();
+        let letters = ['x', 'y', 'z', 'w'];
+        for (i, c) in letters.iter().enumerate() {
+            if (mask >> i) & 1 == 1 {
+                components.push(format!("_new.{c}"));
+            } else {
+                components.push(format!("_prev.{c}"));
+            }
+        }
+        self.push(&format!(
+            "{lhs} = vec4<f32>({}, {}, {}, {});",
+            components[0], components[1], components[2], components[3]
+        ));
+        self.indent -= 1;
+        self.push("}");
+    }
+
+    fn emit_export(&mut self, dst_reg: u8, alloc: AllocKind, expr: &str, mask: u8) {
+        // Xenos's export "register" indexing within an alloc range is
+        // normally (alloc_base + offset). Since our CF stream doesn't
+        // carry per-export slot offsets cleanly, use `alloc` to pick the
+        // target.
+        let lhs = match (self.stage, alloc) {
+            (Stage::Vertex, AllocKind::Position) => "opos",
+            (Stage::Vertex, AllocKind::Interpolators) => "ocolor",
+            (Stage::Vertex, AllocKind::Colors) => "ocolor",
+            (Stage::Vertex, _) => "ocolor", // fall through — any other alloc
+            (Stage::Pixel, AllocKind::Colors) => "ocolor0",
+            (Stage::Pixel, _) => "ocolor0",
+        };
+        let _ = dst_reg; // per-slot export indexing reserved for a richer v2
+        self.emit_masked_write(lhs, expr, mask);
+    }
+
+    fn emit_vfetch(&mut self, vf: &crate::ucode::fetch::VertexFetch) -> Result<(), &'static str> {
+        // v1: treat all vertex fetches as R32G32B32A32_FLOAT, stride = 4
+        // dwords. Matches the interpreter's MVP semantics; unlocks more
+        // formats alongside the CPU texture cache's format expansion.
+        let fetch_const = (vf.raw[0] >> 5) & 0x1F;
+        let src_reg = vf.src_register & 0x7F;
+        let dst_reg = vf.dest_register & 0x7F;
+        self.push(&format!(
+            "{{ let fc0 = xenos_consts.fetch[{}u]; \
+             let base = (fc0 & 0xFFFFFFFCu) >> 2u; \
+             let vidx = u32(r[{src_reg}u].x); \
+             let addr = base + vidx * 4u; \
+             let n = arrayLength(&vertex_buffer); \
+             if (addr + 3u < n) {{ \
+                 r[{dst_reg}u] = vec4<f32>( \
+                     bitcast<f32>(vertex_buffer[addr + 0u]), \
+                     bitcast<f32>(vertex_buffer[addr + 1u]), \
+                     bitcast<f32>(vertex_buffer[addr + 2u]), \
+                     bitcast<f32>(vertex_buffer[addr + 3u])); \
+             }} }}",
+            fetch_const * 2,
+        ));
+        Ok(())
+    }
+
+    fn emit_tfetch(&mut self, tf: &crate::ucode::fetch::TextureFetch) {
+        // v1: sample the single bound texture; UV = r[src].xy. P5's cache
+        // publishes the `fetch_const=0` texture into `@group(1)`; slot
+        // mismatch is a silent magenta for now.
+        let src_reg = tf.src_register & 0x7F;
+        let dst_reg = tf.dest_register & 0x7F;
+        self.push(&format!(
+            "r[{dst_reg}u] = textureSampleLevel(xenos_tex, xenos_samp, r[{src_reg}u].xy, 0.0);"
+        ));
+    }
+}
+
+fn vector_expr(op: u8, a: &str, b: &str, c: &str) -> Option<String> {
+    let s = match op {
+        vop::ADD => format!("({a} + {b})"),
+        vop::MUL => format!("({a} * {b})"),
+        vop::MAX => format!("max({a}, {b})"),
+        vop::MIN => format!("min({a}, {b})"),
+        vop::MAD => format!("({a} * {b} + {c})"),
+        vop::DOT4 => format!("vec4<f32>(dot({a}, {b}))"),
+        vop::DOT3 => format!("vec4<f32>(dot({a}.xyz, {b}.xyz))"),
+        vop::DOT2_ADD => format!(
+            "vec4<f32>({a}.x * {b}.x + {a}.y * {b}.y + {c}.x)"
+        ),
+        vop::SEQ => format!(
+            "vec4<f32>(select(0.0,1.0,{a}.x=={b}.x), select(0.0,1.0,{a}.y=={b}.y), select(0.0,1.0,{a}.z=={b}.z), select(0.0,1.0,{a}.w=={b}.w))"
+        ),
+        vop::SGT => format!(
+            "vec4<f32>(select(0.0,1.0,{a}.x>{b}.x), select(0.0,1.0,{a}.y>{b}.y), select(0.0,1.0,{a}.z>{b}.z), select(0.0,1.0,{a}.w>{b}.w))"
+        ),
+        vop::SGE => format!(
+            "vec4<f32>(select(0.0,1.0,{a}.x>={b}.x), select(0.0,1.0,{a}.y>={b}.y), select(0.0,1.0,{a}.z>={b}.z), select(0.0,1.0,{a}.w>={b}.w))"
+        ),
+        vop::SNE => format!(
+            "vec4<f32>(select(0.0,1.0,{a}.x!={b}.x), select(0.0,1.0,{a}.y!={b}.y), select(0.0,1.0,{a}.z!={b}.z), select(0.0,1.0,{a}.w!={b}.w))"
+        ),
+        vop::FRC => format!("fract({a})"),
+        vop::FLOOR => format!("floor({a})"),
+        _ => return None,
+    };
+    Some(s)
+}
+
+fn scalar_expr(op: u8, a: &str, b: &str, prev: &str) -> Option<String> {
+    let s = match op {
+        sop::ADDS => format!("({a} + {b})"),
+        sop::ADDS_PREV => format!("({a} + {prev})"),
+        sop::MULS => format!("({a} * {b})"),
+        sop::MULS_PREV => format!("({a} * {prev})"),
+        sop::MAXS => format!("max({a}, {b})"),
+        sop::MINS => format!("min({a}, {b})"),
+        sop::RCP => format!("xe_rcp({a})"),
+        sop::RETAIN_PREV => prev.to_string(),
+        _ => return None,
+    };
+    Some(s)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::ucode::alu::{sop, vop};
+    use crate::ucode::control_flow::ControlFlowInstruction;
+
+    fn synthetic_trivial_shader() -> ParsedShader {
+        // Single Exec clause: ALU add r0 = r0 + r0; scalar_op = RETAIN_PREV
+        // with full write-mask on vector, zero on scalar. Alloc(Position)
+        // precedes so the ALU's export (if it were one) would target oPos.
+        let w2 = (vop::ADD as u32)
+            | ((sop::RETAIN_PREV as u32) << 6)
+            | (0xF << 12) // vector_write_mask
+            | (0u32 << 16); // vector_dest = 0
+        ParsedShader {
+            cf: vec![
+                ControlFlowInstruction::Alloc {
+                    size: 1,
+                    kind: AllocKind::Position,
+                },
+                ControlFlowInstruction::Exec {
+                    address: 0,
+                    count: 1,
+                    sequence: 0,
+                    is_end: true,
+                    predicated: false,
+                    predicate_condition: false,
+                },
+            ],
+            instructions: vec![0, 0, w2],
+        }
+    }
+
+    #[test]
+    fn trivial_shader_translates() {
+        let shader = synthetic_trivial_shader();
+        match translate(&shader, Stage::Vertex) {
+            Translation::Ok(body) => {
+                assert!(body.contains("fn vs_main"));
+                assert!(body.contains("r[0u] = (r[0u] + r[0u]);"));
+                assert!(body.contains("return out;"));
+            }
+            Translation::Reject(r) => panic!("rejected: {r}"),
+        }
+    }
+
+    #[test]
+    fn combined_module_parses_as_wgsl() {
+        let shader = synthetic_trivial_shader();
+        let vs = match translate(&shader, Stage::Vertex) {
+            Translation::Ok(body) => body,
+            Translation::Reject(r) => panic!("VS rejected: {r}"),
+        };
+        let ps = match translate(&shader, Stage::Pixel) {
+            Translation::Ok(body) => body,
+            Translation::Reject(r) => panic!("PS rejected: {r}"),
+        };
+        let module = combine_stages(&vs, &ps);
+        // naga is pinned as a dev-dep in this crate; if this fails the
+        // translator is emitting invalid WGSL.
+        match naga::front::wgsl::parse_str(&module) {
+            Ok(_) => {}
+            Err(e) => panic!(
+                "emitted WGSL failed to parse:\n{}\n--- module ---\n{}",
+                e, module
+            ),
+        }
+    }
+
+    #[test]
+    fn loop_clause_rejected() {
+        let shader = ParsedShader {
+            cf: vec![ControlFlowInstruction::LoopStart {
+                address: 0,
+                loop_id: 0,
+            }],
+            instructions: vec![],
+        };
+        assert!(matches!(
+            translate(&shader, Stage::Vertex),
+            Translation::Reject(reject::CF_LOOP)
+        ));
+    }
+
+    #[test]
+    fn unsupported_op_rejected() {
+        let w2 = (29u32) // VOP_MAX_A, not in v1 subset
+            | ((sop::RETAIN_PREV as u32) << 6)
+            | (0xF << 12);
+        let shader = ParsedShader {
+            cf: vec![ControlFlowInstruction::Exec {
+                address: 0,
+                count: 1,
+                sequence: 0,
+                is_end: true,
+                predicated: false,
+                predicate_condition: false,
+            }],
+            instructions: vec![0, 0, w2],
+        };
+        assert!(matches!(
+            translate(&shader, Stage::Vertex),
+            Translation::Reject(reject::VEC_OP_UNSUPPORTED)
+        ));
+    }
+}
diff --git a/crates/xenia-gpu/src/ucode/alu.rs b/crates/xenia-gpu/src/ucode/alu.rs
new file mode 100644
index 0000000..4130a29
--- /dev/null
+++ b/crates/xenia-gpu/src/ucode/alu.rs
@@ -0,0 +1,206 @@
+//! Xenos ALU (vector + scalar) instruction decoder.
+//!
+//! An ALU instruction is 96 bits = 3 dwords. The three dwords encode:
+//!   - word0: operand modifier flags + destination info
+//!   - word1: source register / swizzle fields
+//!   - word2: opcode + write mask + export target
+//!
+//! See `ucode.h:900-1400` for the full field map. This decoder captures the
+//! minimal shape the uber-shader needs; flags we don't interpret yet are
+//! retained as raw bits in `raw` for downstream inspection.
+
+/// Decoded ALU instruction.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub struct AluInstruction {
+    /// Vector ALU opcode (bits 0..6 of word2 in canary's layout).
+    pub vector_opcode: u8,
+    /// Scalar ALU opcode (bits 7..13 of word2).
+    pub scalar_opcode: u8,
+    /// Destination register index for vector result (7 bits).
+    pub vector_dest: u8,
+    /// Destination register index for scalar result (7 bits).
+    pub scalar_dest: u8,
+    /// 4-bit write mask for the vector result (x/y/z/w).
+    pub vector_write_mask: u8,
+    /// 4-bit write mask for the scalar result.
+    pub scalar_write_mask: u8,
+    /// Set when the instruction should write to the export bank (position,
+    /// interpolators, color, etc.) instead of the general register file.
+    pub vector_dest_is_export: bool,
+    /// Selects `ps` (previous scalar result) as the scalar operand when set.
+    pub scalar_src_is_ps: bool,
+    /// Source register indices (at most 3 for vector ops).
+    pub src_a: u8,
+    pub src_b: u8,
+    pub src_c: u8,
+    /// Set when the instruction is predicated; skipped if the predicate
+    /// doesn't match `predicate_condition`.
+    pub predicated: bool,
+    pub predicate_condition: bool,
+    /// Raw dwords — preserved verbatim so the translator / interpreter can
+    /// reach into fields we haven't parsed explicitly yet.
+    pub raw: [u32; 3],
+}
+
+/// Decode a 3-dword ALU triple.
+pub fn decode_alu(words: [u32; 3]) -> AluInstruction {
+    let w0 = words[0];
+    let _w1 = words[1];
+    let w2 = words[2];
+    AluInstruction {
+        vector_opcode: (w2 & 0x3F) as u8,
+        scalar_opcode: ((w2 >> 6) & 0x3F) as u8,
+        vector_dest: ((w2 >> 16) & 0x7F) as u8,
+        scalar_dest: ((w2 >> 24) & 0x7F) as u8,
+        vector_write_mask: ((w2 >> 12) & 0xF) as u8,
+        scalar_write_mask: ((w2 >> 8) & 0xF) as u8,
+        vector_dest_is_export: ((w2 >> 23) & 1) != 0,
+        scalar_src_is_ps: ((w0 >> 26) & 1) != 0,
+        src_a: (w0 & 0xFF) as u8,
+        src_b: ((w0 >> 8) & 0xFF) as u8,
+        src_c: ((w0 >> 16) & 0xFF) as u8,
+        predicated: ((w0 >> 27) & 1) != 0,
+        predicate_condition: ((w0 >> 28) & 1) != 0,
+        raw: words,
+    }
+}
+
+/// Vector ALU opcodes we reference by name. Values match canary's
+/// `AluVectorOpcode` enum in `ucode.h:1354`.
+pub mod vop {
+    pub const ADD: u8 = 0;
+    pub const MUL: u8 = 1;
+    pub const MAX: u8 = 2;
+    pub const MIN: u8 = 3;
+    pub const SEQ: u8 = 4;
+    pub const SGT: u8 = 5;
+    pub const SGE: u8 = 6;
+    pub const SNE: u8 = 7;
+    pub const FRC: u8 = 8;
+    pub const TRUNC: u8 = 9;
+    pub const FLOOR: u8 = 10;
+    pub const MAD: u8 = 11;
+    pub const CND_EQ: u8 = 12;
+    pub const CND_GE: u8 = 13;
+    pub const CND_GT: u8 = 14;
+    pub const DOT4: u8 = 15;
+    pub const DOT3: u8 = 16;
+    pub const DOT2_ADD: u8 = 17;
+    pub const CUBE: u8 = 18;
+    pub const MAX4: u8 = 19;
+    pub const SETP_EQ_PUSH: u8 = 20;
+    pub const SETP_NE_PUSH: u8 = 21;
+    pub const SETP_GT_PUSH: u8 = 22;
+    pub const SETP_GE_PUSH: u8 = 23;
+    pub const KILL_EQ: u8 = 24;
+    pub const KILL_GT: u8 = 25;
+    pub const KILL_GE: u8 = 26;
+    pub const KILL_NE: u8 = 27;
+    pub const DST: u8 = 28;
+    pub const MAX_A: u8 = 29;
+}
+
+/// Scalar ALU opcodes. Values match canary's `AluScalarOpcode` enum in
+/// `ucode.h:1001`.
+pub mod sop {
+    pub const ADDS: u8 = 0;
+    pub const ADDS_PREV: u8 = 1;
+    pub const MULS: u8 = 2;
+    pub const MULS_PREV: u8 = 3;
+    pub const MULS_PREV2: u8 = 4;
+    pub const MAXS: u8 = 5;
+    pub const MINS: u8 = 6;
+    pub const SEQS: u8 = 7;
+    pub const SGTS: u8 = 8;
+    pub const SGES: u8 = 9;
+    pub const SNES: u8 = 10;
+    pub const FRCS: u8 = 11;
+    pub const TRUNCS: u8 = 12;
+    pub const FLOORS: u8 = 13;
+    pub const EXP: u8 = 14;
+    pub const LOGC: u8 = 15;
+    pub const LOG: u8 = 16;
+    pub const RCPC: u8 = 17;
+    pub const RCPF: u8 = 18;
+    pub const RCP: u8 = 19;
+    pub const RSQC: u8 = 20;
+    pub const RSQF: u8 = 21;
+    pub const RSQ: u8 = 22;
+    pub const MAXAS: u8 = 23;
+    pub const MAXASF: u8 = 24;
+    pub const SUBS: u8 = 25;
+    pub const SUBS_PREV: u8 = 26;
+    pub const SETP_EQ: u8 = 27;
+    pub const SETP_NE: u8 = 28;
+    pub const SETP_GT: u8 = 29;
+    pub const SETP_GE: u8 = 30;
+    pub const SETP_INV: u8 = 31;
+    pub const SETP_POP: u8 = 32;
+    pub const SETP_CLR: u8 = 33;
+    pub const SETP_RSTR: u8 = 34;
+    pub const KILLS_EQ: u8 = 35;
+    pub const KILLS_GT: u8 = 36;
+    pub const KILLS_GE: u8 = 37;
+    pub const KILLS_NE: u8 = 38;
+    pub const KILLS_ONE: u8 = 39;
+    pub const SQRT: u8 = 40;
+    pub const MULSC0: u8 = 42;
+    pub const MULSC1: u8 = 43;
+    pub const ADDSC0: u8 = 44;
+    pub const ADDSC1: u8 = 45;
+    pub const SUBSC0: u8 = 46;
+    pub const SUBSC1: u8 = 47;
+    pub const SIN: u8 = 48;
+    pub const COS: u8 = 49;
+    pub const RETAIN_PREV: u8 = 50;
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Regression: our table previously drifted from canary's values (e.g.
+    /// `MAXS=6` when canary says 5, shifting everything through SQRT). Pin
+    /// the most-often-used scalar + vector opcodes here.
+    #[test]
+    fn opcodes_match_canary_values() {
+        // Scalar.
+        assert_eq!(sop::MAXS, 5);
+        assert_eq!(sop::MINS, 6);
+        assert_eq!(sop::SEQS, 7);
+        assert_eq!(sop::EXP, 14);
+        assert_eq!(sop::LOG, 16);
+        assert_eq!(sop::RCP, 19);
+        assert_eq!(sop::RSQ, 22);
+        assert_eq!(sop::SUBS, 25);
+        assert_eq!(sop::SETP_EQ, 27);
+        assert_eq!(sop::KILLS_EQ, 35);
+        assert_eq!(sop::SQRT, 40);
+        assert_eq!(sop::SIN, 48);
+        assert_eq!(sop::RETAIN_PREV, 50);
+        // Vector.
+        assert_eq!(vop::SNE, 7);
+        assert_eq!(vop::CND_EQ, 12);
+        assert_eq!(vop::MAX4, 19);
+        assert_eq!(vop::KILL_EQ, 24);
+        assert_eq!(vop::DST, 28);
+    }
+
+    #[test]
+    fn decode_extracts_opcodes_and_dests() {
+        // Build a minimal ALU word:
+        //   vector_opcode = ADD (0), scalar_opcode = RCP (22),
+        //   vector_dest = 3, scalar_dest = 7, vector_write_mask = 0xF
+        let w2 = (vop::ADD as u32)
+            | ((sop::RCP as u32) << 6)
+            | (0xF << 12) // vector_write_mask
+            | (3u32 << 16) // vector_dest
+            | (7u32 << 24); // scalar_dest
+        let alu = decode_alu([0, 0, w2]);
+        assert_eq!(alu.vector_opcode, vop::ADD);
+        assert_eq!(alu.scalar_opcode, sop::RCP);
+        assert_eq!(alu.vector_dest, 3);
+        assert_eq!(alu.scalar_dest, 7);
+        assert_eq!(alu.vector_write_mask, 0xF);
+    }
+}
diff --git a/crates/xenia-gpu/src/ucode/control_flow.rs b/crates/xenia-gpu/src/ucode/control_flow.rs
new file mode 100644
index 0000000..941a49d
--- /dev/null
+++ b/crates/xenia-gpu/src/ucode/control_flow.rs
@@ -0,0 +1,173 @@
+//! Xenos control-flow clause decoder.
+//!
+//! A shader's CF block is a sequence of 48-bit clauses packed two-per-
+//! three-dword row. Each clause encodes an opcode and type-specific fields
+//! (exec addr/count, loop start/end, branch target, etc.).
+//!
+//! Spec at `xenia-canary/src/xenia/gpu/ucode.h:87-256`. We cover the subset
+//! the uber-shader needs: `Exec*`, `Loop*`, `Alloc`, `Jmp`, `Call/Ret`,
+//! `Exit`. Unknown opcodes are classified as `Unknown { opcode }` so the
+//! translator can log + degrade.
+
+/// Parsed representation of one CF clause.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum ControlFlowInstruction {
+    /// `kExec` / `kExecEnd` — execute a range of ALU/fetch instructions.
+    Exec {
+        /// Instruction-block dword index where this clause's instructions start,
+        /// expressed in **triple units** (each inst = 3 dwords).
+        address: u32,
+        /// Number of triples to execute.
+        count: u32,
+        /// The ALU-vs-fetch sequence bitmap (2 bits per instruction).
+        sequence: u32,
+        /// True when this clause ends the shader.
+        is_end: bool,
+        /// True if predicated; skip when predicate != predicate_condition.
+        predicated: bool,
+        predicate_condition: bool,
+    },
+    /// `kLoopStart` — begin a `aL` loop referencing a loop constant.
+    LoopStart { address: u32, loop_id: u32 },
+    /// `kLoopEnd` — close the loop; `address` points at the matching start.
+    LoopEnd { address: u32, loop_id: u32 },
+    /// `kCondJmp` — conditional jump to another CF index.
+    CondJmp {
+        target: u32,
+        predicated: bool,
+        predicate_condition: bool,
+    },
+    /// `kCondCall` — call into another CF subroutine.
+    CondCall { target: u32 },
+    /// `kReturn` — return from subroutine.
+    Return,
+    /// `kAlloc` — pre-allocate export registers (position, interpolators, colors).
+    Alloc { size: u32, kind: AllocKind },
+    /// Exit the shader (terminal).
+    Exit,
+    /// Unknown / unhandled opcode.
+    Unknown { opcode: u8 },
+}
+
+/// Export target types for `kAlloc` clauses.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum AllocKind {
+    Position,
+    Interpolators,
+    Colors,
+    Memexport,
+    Other,
+}
+
+impl AllocKind {
+    fn from_bits(b: u32) -> Self {
+        match b & 0x7 {
+            0 => AllocKind::Position,
+            1 => AllocKind::Interpolators,
+            2 => AllocKind::Colors,
+            3 => AllocKind::Memexport,
+            _ => AllocKind::Other,
+        }
+    }
+}
+
+/// Decode one row (three consecutive CF dwords) into two CF clauses.
+///
+/// Word layout per canary (`ucode.h:218-256`):
+///   - word0 + lo16(word1) → CF_A's 48-bit payload
+///   - hi16(word1) + word2 → CF_B's 48-bit payload
+///
+/// The opcode lives in the top 4 bits of the 48-bit payload (= bits 44..47).
+pub fn decode_cf_pair(word0: u32, word1: u32, word2: u32) -> (ControlFlowInstruction, ControlFlowInstruction) {
+    // Build each 48-bit value as u64; LE within the clause.
+    let a = (word0 as u64) | ((word1 as u64 & 0xFFFF) << 32);
+    let b = ((word1 as u64 >> 16) & 0xFFFF) | ((word2 as u64) << 16);
+    (decode_single(a), decode_single(b))
+}
+
+fn decode_single(payload: u64) -> ControlFlowInstruction {
+    // Top 4 bits of the 48-bit payload.
+    let opcode = ((payload >> 44) & 0xF) as u8;
+    // Predicate bit + condition live at the 28..30 range for exec/jmp. Rough
+    // extraction — good enough for the interpreter, which logs unknowns.
+    let predicated = ((payload >> 28) & 1) != 0;
+    let predicate_condition = ((payload >> 29) & 1) != 0;
+
+    match opcode {
+        0 => ControlFlowInstruction::Exec {
+            address: (payload & 0xFFF) as u32,
+            count: ((payload >> 12) & 0x7) as u32,
+            sequence: ((payload >> 16) & 0xFFF) as u32,
+            is_end: false,
+            predicated,
+            predicate_condition,
+        },
+        1 => ControlFlowInstruction::Exit,
+        2 => ControlFlowInstruction::Exec {
+            address: (payload & 0xFFF) as u32,
+            count: ((payload >> 12) & 0x7) as u32,
+            sequence: ((payload >> 16) & 0xFFF) as u32,
+            is_end: true,
+            predicated,
+            predicate_condition,
+        },
+        6 => ControlFlowInstruction::LoopStart {
+            address: (payload & 0x3FF) as u32,
+            loop_id: ((payload >> 16) & 0x1F) as u32,
+        },
+        7 => ControlFlowInstruction::LoopEnd {
+            address: (payload & 0x3FF) as u32,
+            loop_id: ((payload >> 16) & 0x1F) as u32,
+        },
+        8 => ControlFlowInstruction::CondCall {
+            target: (payload & 0x3FF) as u32,
+        },
+        9 => ControlFlowInstruction::Return,
+        10 => ControlFlowInstruction::CondJmp {
+            target: (payload & 0x3FF) as u32,
+            predicated,
+            predicate_condition,
+        },
+        12 => ControlFlowInstruction::Alloc {
+            size: (payload & 0x7) as u32,
+            kind: AllocKind::from_bits(((payload >> 4) & 0x7) as u32),
+        },
+        other => ControlFlowInstruction::Unknown { opcode: other },
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn opcode_exit_decodes() {
+        // opcode 1 (Exit) in bits 44..47 of A's 48-bit payload.
+        let payload: u64 = 1u64 << 44;
+        let (hi, lo) = ((payload & 0xFFFF_FFFF) as u32, ((payload >> 32) & 0xFFFF) as u32);
+        let cf = decode_cf_pair(hi, lo, 0).0;
+        assert_eq!(cf, ControlFlowInstruction::Exit);
+    }
+
+    #[test]
+    fn opcode_exec_end_carries_address_count() {
+        // opcode 2 (ExecEnd), address=4, count=2, sequence=0.
+        let payload: u64 = (2u64 << 44) | (2u64 << 12) | 4;
+        let hi = (payload & 0xFFFF_FFFF) as u32;
+        let lo = ((payload >> 32) & 0xFFFF) as u32;
+        let cf = decode_cf_pair(hi, lo, 0).0;
+        match cf {
+            ControlFlowInstruction::Exec {
+                address,
+                count,
+                is_end,
+                ..
+            } => {
+                assert_eq!(address, 4);
+                assert_eq!(count, 2);
+                assert!(is_end);
+            }
+            other => panic!("expected Exec, got {other:?}"),
+        }
+    }
+}
diff --git a/crates/xenia-gpu/src/ucode/fetch.rs b/crates/xenia-gpu/src/ucode/fetch.rs
new file mode 100644
index 0000000..85d1bba
--- /dev/null
+++ b/crates/xenia-gpu/src/ucode/fetch.rs
@@ -0,0 +1,117 @@
+//! Xenos fetch (vertex + texture) instruction decoder.
+//!
+//! Like ALU instructions, fetches are 96 bits (3 dwords). The opcode lives
+//! in the low 5 bits of word0. We split them into `VertexFetch` and
+//! `TextureFetch` structurally because their operand layouts differ.
+//!
+//! Reference: `xenia-canary/src/xenia/gpu/ucode.h:690-877`.
+
+/// Decoded fetch instruction.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum FetchInstruction {
+    Vertex(VertexFetch),
+    Texture(TextureFetch),
+    /// Unknown / minor variants we don't model yet.
+    Unknown { opcode: u8, raw: [u32; 3] },
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub struct VertexFetch {
+    /// Vertex fetch constant index (0..=95).
+    pub fetch_const: u8,
+    /// Source register index (vertex index in r#).
+    pub src_register: u8,
+    /// Destination register for the fetched value.
+    pub dest_register: u8,
+    /// 4-bit write mask.
+    pub dest_write_mask: u8,
+    pub raw: [u32; 3],
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub struct TextureFetch {
+    /// Texture fetch constant index (0..=31).
+    pub fetch_const: u8,
+    pub src_register: u8,
+    pub dest_register: u8,
+    pub dest_write_mask: u8,
+    /// Dimension: 0=1D, 1=2D, 2=3D/stacked, 3=cube.
+    pub dimension: u8,
+    pub raw: [u32; 3],
+}
+
+/// Opcodes (low 5 bits of word0). From `ucode.h`.
+pub mod op {
+    pub const VERTEX_FETCH: u8 = 0x00;
+    pub const TEXTURE_FETCH: u8 = 0x01;
+    pub const GET_TEXTURE_BORDER_COLOR_FRAC: u8 = 0x16;
+    pub const GET_TEXTURE_COMPUTED_LOD: u8 = 0x17;
+    pub const GET_TEXTURE_WEIGHTS: u8 = 0x18;
+    pub const GET_TEXTURE_GRADIENTS: u8 = 0x19;
+    pub const SET_TEXTURE_LOD: u8 = 0x1A;
+    pub const SET_TEXTURE_GRADIENTS_HORZ: u8 = 0x1B;
+    pub const SET_TEXTURE_GRADIENTS_VERT: u8 = 0x1C;
+}
+
+pub fn decode_fetch(words: [u32; 3]) -> FetchInstruction {
+    let w0 = words[0];
+    let w1 = words[1];
+    let opcode = (w0 & 0x1F) as u8;
+    match opcode {
+        op::VERTEX_FETCH => FetchInstruction::Vertex(VertexFetch {
+            fetch_const: ((w0 >> 5) & 0x1F) as u8,
+            src_register: ((w0 >> 17) & 0x7F) as u8,
+            dest_register: ((w0 >> 10) & 0x7F) as u8,
+            dest_write_mask: ((w1 >> 23) & 0xF) as u8,
+            raw: words,
+        }),
+        op::TEXTURE_FETCH => FetchInstruction::Texture(TextureFetch {
+            fetch_const: ((w0 >> 5) & 0x1F) as u8,
+            src_register: ((w0 >> 17) & 0x7F) as u8,
+            dest_register: ((w0 >> 10) & 0x7F) as u8,
+            dest_write_mask: ((w1 >> 23) & 0xF) as u8,
+            dimension: ((w1 >> 29) & 0x3) as u8,
+            raw: words,
+        }),
+        _ => FetchInstruction::Unknown { opcode, raw: words },
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn decode_vertex_fetch() {
+        // opcode=0 (vertex), fetch_const=5, src=2, dest=7.
+        let w0 = 0u32 | (5 << 5) | (7 << 10) | (2 << 17);
+        let v = decode_fetch([w0, 0, 0]);
+        match v {
+            FetchInstruction::Vertex(vf) => {
+                assert_eq!(vf.fetch_const, 5);
+                assert_eq!(vf.src_register, 2);
+                assert_eq!(vf.dest_register, 7);
+            }
+            other => panic!("expected Vertex, got {other:?}"),
+        }
+    }
+
+    #[test]
+    fn decode_texture_fetch() {
+        let w0 = 1u32 | (3 << 5) | (4 << 10) | (1 << 17);
+        let t = decode_fetch([w0, (2u32 << 29), 0]);
+        match t {
+            FetchInstruction::Texture(tf) => {
+                assert_eq!(tf.fetch_const, 3);
+                assert_eq!(tf.dimension, 2);
+            }
+            other => panic!("expected Texture, got {other:?}"),
+        }
+    }
+
+    #[test]
+    fn unknown_opcode_is_classified() {
+        let v = decode_fetch([0x16, 0, 0]); // GET_TEXTURE_BORDER_COLOR_FRAC
+        assert!(matches!(v, FetchInstruction::Unknown { opcode: 0x16, .. }));
+    }
+}
diff --git a/crates/xenia-gpu/src/ucode/mod.rs b/crates/xenia-gpu/src/ucode/mod.rs
new file mode 100644
index 0000000..70d2349
--- /dev/null
+++ b/crates/xenia-gpu/src/ucode/mod.rs
@@ -0,0 +1,249 @@
+//! Xenos (ATI R500-family) shader microcode decoder.
+//!
+//! Ground truth: `xenia-canary/src/xenia/gpu/ucode.h`. We parse only what a
+//! shader *interpreter* (P3 uber-shader) needs: control-flow clauses, ALU
+//! instructions (vector + scalar pipes), and fetch instructions (vertex +
+//! texture). The uber-shader consumes this IR directly; when a WGSL-emitting
+//! translator comes online in P7, it reuses the same parser.
+//!
+//! ## Binary layout
+//!
+//! A compiled shader has two sections back-to-back:
+//!
+//! 1. **Control-flow block** — `cf_count` 64-bit clause pairs. Canary packs
+//!    two clauses into three 32-bit words:
+//!    ```text
+//!    word0  word1  word2
+//!    [-CF_A (48)-][-CF_B (48)-]
+//!    ```
+//!    Word 0 is the low 32 of CF_A; word 1's low 16 bits finish CF_A and
+//!    its high 16 bits start CF_B; word 2 holds CF_B's remaining 32 bits.
+//!
+//! 2. **Instruction block** — variable-size array of 96-bit ALU / fetch
+//!    instructions. Each control-flow clause of kind `Exec*` references a
+//!    contiguous range of these by `(address, count)` in dwords * 3.
+//!
+//! We read big-endian dwords straight out of guest memory (the `raw`
+//! `&[u32]` slice is already host-endian-corrected by the PM4 executor that
+//! cached the shader blob). See `ucode.h:218-256` for the exec clause bit
+//! layout and `:700-877` for the fetch/ALU mix.
+
+pub mod alu;
+pub mod control_flow;
+pub mod fetch;
+
+use self::alu::AluInstruction;
+use self::control_flow::{AllocKind, ControlFlowInstruction, decode_cf_pair};
+use self::fetch::FetchInstruction;
+
+/// CF-clause kind codes encoded into the WGSL-facing packed shader. Kept
+/// in sync with `shaders/xenos_interp.wgsl`'s `CF_KIND_*` constants.
+pub mod cf_kind {
+    pub const EXEC: u32 = 0;
+    pub const EXEC_END: u32 = 1;
+    pub const ALLOC: u32 = 2;
+    pub const EXIT: u32 = 3;
+    pub const LOOP_START: u32 = 4;
+    pub const LOOP_END: u32 = 5;
+    pub const COND_JMP: u32 = 6;
+    pub const COND_CALL: u32 = 7;
+    pub const RETURN: u32 = 8;
+    pub const UNKNOWN: u32 = 15;
+}
+
+/// Alloc-kind codes, packed into the aux dword of an `Alloc` clause.
+pub mod cf_alloc_kind {
+    pub const POSITION: u32 = 0;
+    pub const INTERPOLATORS: u32 = 1;
+    pub const COLORS: u32 = 2;
+    pub const MEMEXPORT: u32 = 3;
+    pub const OTHER: u32 = 4;
+}
+
+/// Pack a [`ParsedShader`] into the dense dword layout the WGSL runtime
+/// interpreter expects:
+///
+/// ```text
+/// [0]                     cf_count
+/// [1 .. 1 + cf_count*3]   CF table: (kind, primary, aux) triples per clause
+/// [1 + cf_count*3 ..]     raw 3-dword instruction stream (ALU/fetch)
+/// ```
+///
+/// The CF table lets WGSL walk clauses without reconstructing bit-packed
+/// layouts on the GPU. Semantics per `kind`:
+///
+/// | kind        | primary                    | aux                          |
+/// |-------------|----------------------------|------------------------------|
+/// | EXEC/EXEC_END | address (in triples)      | (sequence<<8) \| count       |
+/// | ALLOC       | alloc_kind (see cf_alloc_kind) | size                    |
+/// | EXIT        | 0                          | 0                            |
+/// | LOOP_START  | address                    | loop_id                      |
+/// | LOOP_END    | address                    | loop_id                      |
+/// | COND_JMP    | target                     | predicate flags              |
+/// | COND_CALL   | target                     | 0                            |
+/// | RETURN      | 0                          | 0                            |
+/// | UNKNOWN     | opcode                     | 0                            |
+pub fn pack_for_wgsl(parsed: &ParsedShader) -> Vec<u32> {
+    let cf_count = parsed.cf.len() as u32;
+    let mut out = Vec::with_capacity(1 + (cf_count as usize) * 3 + parsed.instructions.len());
+    out.push(cf_count);
+    for clause in &parsed.cf {
+        let (kind, primary, aux) = encode_cf(*clause);
+        out.push(kind);
+        out.push(primary);
+        out.push(aux);
+    }
+    out.extend_from_slice(&parsed.instructions);
+    out
+}
+
+fn encode_cf(c: ControlFlowInstruction) -> (u32, u32, u32) {
+    use ControlFlowInstruction::*;
+    match c {
+        Exec {
+            address,
+            count,
+            sequence,
+            is_end,
+            predicated,
+            predicate_condition,
+        } => {
+            let pred_bits = (predicated as u32) | ((predicate_condition as u32) << 1);
+            let kind = if is_end { cf_kind::EXEC_END } else { cf_kind::EXEC }
+                | (pred_bits << 8);
+            (kind, address, (sequence << 8) | count)
+        }
+        Alloc { size, kind } => {
+            let akind = match kind {
+                AllocKind::Position => cf_alloc_kind::POSITION,
+                AllocKind::Interpolators => cf_alloc_kind::INTERPOLATORS,
+                AllocKind::Colors => cf_alloc_kind::COLORS,
+                AllocKind::Memexport => cf_alloc_kind::MEMEXPORT,
+                AllocKind::Other => cf_alloc_kind::OTHER,
+            };
+            (cf_kind::ALLOC, akind, size)
+        }
+        Exit => (cf_kind::EXIT, 0, 0),
+        LoopStart { address, loop_id } => (cf_kind::LOOP_START, address, loop_id),
+        LoopEnd { address, loop_id } => (cf_kind::LOOP_END, address, loop_id),
+        CondJmp {
+            target,
+            predicated,
+            predicate_condition,
+        } => {
+            let pred_bits = (predicated as u32) | ((predicate_condition as u32) << 1);
+            (cf_kind::COND_JMP, target, pred_bits)
+        }
+        CondCall { target } => (cf_kind::COND_CALL, target, 0),
+        Return => (cf_kind::RETURN, 0, 0),
+        Unknown { opcode } => (cf_kind::UNKNOWN, opcode as u32, 0),
+    }
+}
+
+/// One instruction word set from the instruction-block section. Xenos packs
+/// ALU and fetch instructions identically (96 bits each); the owning exec
+/// clause's "sequence" bitmap decides which is which.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum DecodedInstruction {
+    /// ALU pipe (vector ALU + optional co-issued scalar ALU).
+    Alu(AluInstruction),
+    /// Vertex or texture fetch.
+    Fetch(FetchInstruction),
+}
+
+/// Parsed shader: the control-flow clause list + the raw 32-bit instruction
+/// words. The uber-shader / translator is expected to index into
+/// `instructions` based on `(clause.address * 3, clause.count * 3)`.
+#[derive(Debug, Clone, Default)]
+pub struct ParsedShader {
+    pub cf: Vec<ControlFlowInstruction>,
+    /// Raw instruction dwords. Each 3-dword triple is one ALU or fetch
+    /// instruction; the owning `Exec` clause's `sequence` bitmap picks the
+    /// kind.
+    pub instructions: Vec<u32>,
+}
+
+/// Decode a shader blob. `raw_dwords` is a host-endian slice of the entire
+/// microcode buffer (control flow + instructions). Heuristic: CF dword count
+/// is encoded in the first word's low 12 bits of the last exec clause —
+/// canary iterates until it hits a clause of kind `Exit`. We do the same.
+pub fn parse_shader(raw_dwords: &[u32]) -> ParsedShader {
+    let mut cf = Vec::new();
+    // CF clauses are 48-bit (word1 lo 16 + word0 = 48 or so per canary's
+    // layout). Walk pairs of 3 dwords per pair of clauses.
+    let mut i = 0usize;
+    while i + 2 < raw_dwords.len() {
+        let a = decode_cf_pair(raw_dwords[i], raw_dwords[i + 1], raw_dwords[i + 2]);
+        let (first, second) = a;
+        let seen_exit = matches!(
+            first,
+            ControlFlowInstruction::Exit | ControlFlowInstruction::Unknown { .. }
+        ) || matches!(
+            second,
+            ControlFlowInstruction::Exit | ControlFlowInstruction::Unknown { .. }
+        );
+        cf.push(first);
+        cf.push(second);
+        i += 3;
+        if seen_exit {
+            break;
+        }
+    }
+    // Everything after `i` dwords is the instruction block.
+    let instructions = raw_dwords[i..].to_vec();
+    ParsedShader { cf, instructions }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn empty_blob_parses_empty() {
+        let p = parse_shader(&[]);
+        assert!(p.cf.is_empty());
+        assert!(p.instructions.is_empty());
+    }
+
+    #[test]
+    fn pack_for_wgsl_layout_is_correct() {
+        // Build a tiny ParsedShader by hand and verify the packed form.
+        let parsed = ParsedShader {
+            cf: vec![
+                ControlFlowInstruction::Exec {
+                    address: 0x10,
+                    count: 3,
+                    sequence: 0b1010,
+                    is_end: false,
+                    predicated: false,
+                    predicate_condition: false,
+                },
+                ControlFlowInstruction::Exit,
+            ],
+            instructions: vec![0x1111, 0x2222, 0x3333],
+        };
+        let packed = pack_for_wgsl(&parsed);
+        assert_eq!(packed[0], 2, "cf_count");
+        // First clause: EXEC, address=0x10, aux = (sequence<<8)|count = 0x0A03
+        assert_eq!(packed[1] & 0xFF, cf_kind::EXEC);
+        assert_eq!(packed[2], 0x10);
+        assert_eq!(packed[3], (0b1010 << 8) | 3);
+        // Second clause: EXIT
+        assert_eq!(packed[4] & 0xFF, cf_kind::EXIT);
+        // Instruction block starts at 1 + 2*3 = 7
+        assert_eq!(packed[7..], [0x1111, 0x2222, 0x3333]);
+    }
+
+    #[test]
+    fn trivial_exit_clause_stops_parsing() {
+        // Two clauses: [NOP (kind=0), EXIT (kind=1)] encoded per canary.
+        // Exit clause is opcode 1 in the top 4 bits of the upper 16 bits.
+        let w0 = 0u32; // clause A body
+        let w1 = (1u32 << 12) << 16; // upper 16 bits = 0x1000 → opcode=1 (EXIT) for clause A
+        let w2 = 0u32;
+        let p = parse_shader(&[w0, w1, w2, 0xDEAD_BEEF]);
+        assert!(!p.cf.is_empty());
+        // Exit detected → remaining dword is instruction data.
+        assert_eq!(p.instructions, vec![0xDEAD_BEEF]);
+    }
+}
diff --git a/crates/xenia-gpu/src/xenos_constants.rs b/crates/xenia-gpu/src/xenos_constants.rs
new file mode 100644
index 0000000..9ed2a17
--- /dev/null
+++ b/crates/xenia-gpu/src/xenos_constants.rs
@@ -0,0 +1,124 @@
+//! The "Xenos constants" block the WGSL interpreter consumes per draw.
+//!
+//! Mirrors the Xenos register-file regions that carry the per-draw constant
+//! values shaders reference at runtime:
+//!
+//! | Region | Base | Count | Size |
+//! |--------|------|-------|------|
+//! | ALU   | 0x4000 | 512 × vec4<f32> | 8 KB |
+//! | Fetch | 0x4800 | 256 × u32       | 1 KB |
+//! | Bool  | 0x4900 |  8 × u32        | 32 B |
+//! | Loop  | 0x4908 | 32 × u32        | 128 B |
+//!
+//! Total: ~9.2 KB, well under the 64 KB min uniform buffer size on all wgpu
+//! backends. The `XenosConstantsBlock` is declared `#[repr(C)]` + bytemuck
+//! `Pod` so it can be `bytemuck::bytes_of()`'d directly into a wgpu uniform
+//! buffer. The matching WGSL `struct XenosConstants` lives in
+//! `shaders/xenos_interp.wgsl`.
+
+use bytemuck::{Pod, Zeroable};
+
+use crate::register_file::RegisterFile;
+
+pub const ALU_CONSTANT_COUNT: usize = 512;
+pub const FETCH_CONSTANT_COUNT: usize = 256;
+pub const BOOL_CONSTANT_COUNT: usize = 8;
+pub const LOOP_CONSTANT_COUNT: usize = 32;
+
+pub const CONST_BASE_ALU: u32 = 0x4000;
+pub const CONST_BASE_FETCH: u32 = 0x4800;
+pub const CONST_BASE_BOOL: u32 = 0x4900;
+pub const CONST_BASE_LOOP: u32 = 0x4908;
+
+/// Per-draw constants block uploaded once to the uniform buffer at
+/// `@group(0) @binding(1)`.
+#[repr(C)]
+#[derive(Clone, Copy)]
+pub struct XenosConstantsBlock {
+    pub alu: [[f32; 4]; ALU_CONSTANT_COUNT],
+    pub fetch: [u32; FETCH_CONSTANT_COUNT],
+    pub bool_consts: [u32; BOOL_CONSTANT_COUNT],
+    pub loop_consts: [u32; LOOP_CONSTANT_COUNT],
+}
+
+// SAFETY: all fields are Pod arrays of Pod primitives; `#[repr(C)]` fixes
+// the layout. `bytemuck` derives `Pod` only when alignment + padding line
+// up, so manual `unsafe impl` is the right tool here.
+unsafe impl Zeroable for XenosConstantsBlock {}
+unsafe impl Pod for XenosConstantsBlock {}
+
+impl Default for XenosConstantsBlock {
+    fn default() -> Self {
+        Self {
+            alu: [[0.0; 4]; ALU_CONSTANT_COUNT],
+            fetch: [0; FETCH_CONSTANT_COUNT],
+            bool_consts: [0; BOOL_CONSTANT_COUNT],
+            loop_consts: [0; LOOP_CONSTANT_COUNT],
+        }
+    }
+}
+
+impl XenosConstantsBlock {
+    /// Size in bytes — exposed for tests + wgpu buffer sizing.
+    pub const SIZE: usize = std::mem::size_of::<Self>();
+
+    /// Snapshot the constants from a Xenos `RegisterFile` into a dense,
+    /// host-friendly layout the WGSL interpreter expects. ALU constants
+    /// (vec4 each) are 4 consecutive registers; fetch constants are u32.
+    pub fn snapshot(rf: &RegisterFile) -> Self {
+        let mut out = Self::default();
+        for i in 0..ALU_CONSTANT_COUNT {
+            let base = CONST_BASE_ALU + (i as u32) * 4;
+            out.alu[i] = [
+                f32::from_bits(rf.read(base)),
+                f32::from_bits(rf.read(base + 1)),
+                f32::from_bits(rf.read(base + 2)),
+                f32::from_bits(rf.read(base + 3)),
+            ];
+        }
+        for i in 0..FETCH_CONSTANT_COUNT {
+            out.fetch[i] = rf.read(CONST_BASE_FETCH + i as u32);
+        }
+        for i in 0..BOOL_CONSTANT_COUNT {
+            out.bool_consts[i] = rf.read(CONST_BASE_BOOL + i as u32);
+        }
+        for i in 0..LOOP_CONSTANT_COUNT {
+            out.loop_consts[i] = rf.read(CONST_BASE_LOOP + i as u32);
+        }
+        out
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Layout-sanity: total size is (512·16) + (256·4) + (8·4) + (32·4) =
+    /// 8192 + 1024 + 32 + 128 = 9376 bytes. If this number drifts, either
+    /// the constant counts changed or the compiler added padding; either
+    /// way we want to know at test time because the WGSL struct layout in
+    /// `xenos_interp.wgsl` depends on it.
+    #[test]
+    fn xenos_constants_block_size_is_stable() {
+        assert_eq!(XenosConstantsBlock::SIZE, 9376);
+    }
+
+    #[test]
+    fn snapshot_roundtrip_from_register_file() {
+        let mut rf = RegisterFile::new();
+        // Write a recognisable pattern to alu[0] = (1.0, 2.0, 3.0, 4.0)
+        rf.write(CONST_BASE_ALU + 0, f32::to_bits(1.0));
+        rf.write(CONST_BASE_ALU + 1, f32::to_bits(2.0));
+        rf.write(CONST_BASE_ALU + 2, f32::to_bits(3.0));
+        rf.write(CONST_BASE_ALU + 3, f32::to_bits(4.0));
+        rf.write(CONST_BASE_FETCH + 5, 0xDEAD_BEEF);
+        rf.write(CONST_BASE_BOOL, 0x1234);
+        rf.write(CONST_BASE_LOOP + 3, 0x5678);
+
+        let snap = XenosConstantsBlock::snapshot(&rf);
+        assert_eq!(snap.alu[0], [1.0, 2.0, 3.0, 4.0]);
+        assert_eq!(snap.fetch[5], 0xDEAD_BEEF);
+        assert_eq!(snap.bool_consts[0], 0x1234);
+        assert_eq!(snap.loop_consts[3], 0x5678);
+    }
+}