From 82f3d611e241bfd95776bff1cdfa0fbe5197682f Mon Sep 17 00:00:00 2001
From: MechaCat02 <fabian@diekaulbachs.de>
Date: Sun, 3 May 2026 14:00:23 +0200
Subject: [PATCH] =?UTF-8?q?fix(gpu,kernel):=20KRNBUG-Vd-04=20/=20GPUBUG-00?=
 =?UTF-8?q?1=20/=20XMODBUG-013=20=E2=80=94=20VdSwap=20PM4=20ring=20path?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The pre-fix VdSwap zero-filled the guest's reserved buffer with NOPs and
called `state.gpu.notify_xe_swap` directly — bypassing the ring, leaving
the PM4_XE_SWAP handler at gpu_system.rs:1232 dead code, and skipping
the PM4_TYPE0(SHADER_CONSTANT_FETCH_00_0, 6) patch. Sylpheed's bloom/
blur "sample frame N for frame N+1" path samples fetch-constant slot 0
expecting the frontbuffer descriptor; without the patch, slot 0 stayed
stale and any shader sampling it read garbage.

This commit writes the canary VdSwap PM4 sequence directly into the
primary ring at the current write pointer (read via the shared MMIO
atomic), then advances WPTR over the injection. The natural CP drain
consumes PM4_XE_SWAP — bumping `swaps_seen` and patching fetch-constant
slot 0 — without going through any direct kernel→GPU bypass.

Sequence per xenia-canary VdSwap_entry (xboxkrnl_video.cc:438-521):
  1) PM4_TYPE0(0x4800, count=6) + 6 fetch-header dwords (with
     base_address re-patched from virtual to physical >> 12).
  2) PM4_TYPE3(PM4_XE_SWAP, count=4) + signature + frontbuffer_phys
     + width + height.

Mechanism notes:
- buffer_ptr in xenia-rs is in the system command buffer, NOT the
  primary ring (verified empirically: buffer_ptr=0x4acd4df8 vs
  ring_base=0x0accb000, size 4 KB). Canary's VdSwap writes to
  buffer_ptr because its ring layout maps the reserved slot inside
  the ring; xenia-rs's doesn't, so we have to write at the actual
  ring WPTR address (cached on KernelState.ring_base from
  VdInitializeRingBuffer).
- The original "buffer_ptr zero-fill + bump WPTR by 64" path is
  preserved before the injection — it exposes any game-batched PM4
  packets and keeps the buffer_ptr region skippable per existing
  game compat behavior.
- A safety-net fallback at the end calls `notify_xe_swap` directly if
  swaps_seen didn't advance during the drain (e.g. a ring-arithmetic
  edge case). Idempotent — only fires when the PM4 path didn't.
- KRNBUG-Mm-04 deferred: virt→phys uses the masked stub
  `virt & 0x1FFF_FFFF`, sufficient for the standard heap.

Mechanical changes:
- crates/xenia-gpu/src/pm4.rs: add make_packet_type0 / type2 / type3
  helpers + round-trip unit test (mirrors canary xenos.h:1682-1709).
- crates/xenia-gpu/src/handle.rs: add mmio_cp_rb_wptr_load accessor
  (Acquire-load) so the kernel can compute ring offsets.
- crates/xenia-kernel/src/state.rs: cache ring_base / ring_size_dwords
  on KernelState (set by VdInitializeRingBuffer).
- crates/xenia-kernel/src/exports.rs: rewrite the vd_swap PM4-emit
  block; patch fetch_dwords[1] base_address virt→phys before injection.

Verification at -n 100M lockstep:
  swaps:                2 → 2     (game fires VdSwap exactly twice)
  draws:                0 → 0     (gated by Phases D+E)
  fallback warning:     0 occurrences (PM4 path consumed both swaps)
  instructions:         ~100M
Tests: 552 passing (553 with new pm4 round-trip test). Lockstep
stable-fields determinism: byte-identical across two 100M runs.

The "swaps > 2" prediction in the audit's plan assumed the game would
fire VdSwap more often once the path worked; empirically Sylpheed only
calls VdSwap twice within 100M instructions (this is the renderer
plateau the audit identified). The success criterion for Phase C is
that the PM4 path is now operational, which Phases D+E require for
visible draws.

Closes KRNBUG-Vd-04, GPUBUG-001, XMODBUG-013.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 crates/xenia-gpu/src/handle.rs     |   8 ++
 crates/xenia-gpu/src/pm4.rs        |  54 +++++++++++
 crates/xenia-kernel/src/exports.rs | 140 ++++++++++++++++++++++-------
 crates/xenia-kernel/src/state.rs   |   8 ++
 4 files changed, 176 insertions(+), 34 deletions(-)

diff --git a/crates/xenia-gpu/src/handle.rs b/crates/xenia-gpu/src/handle.rs
index 4205f77..86607cf 100644
--- a/crates/xenia-gpu/src/handle.rs
+++ b/crates/xenia-gpu/src/handle.rs
@@ -358,6 +358,14 @@ impl GpuBackend {
             .store(cur.wrapping_add(dwords), Ordering::Release);
     }
 
+    /// Read the current ring write pointer from the shared MMIO atomic.
+    /// Acquire-load so any prior ring-memory writes the caller did before
+    /// asking are not reordered past this read by the compiler. Used by
+    /// `vd_swap` to compute the ring offset for direct PM4 injection.
+    pub fn mmio_cp_rb_wptr_load(&self) -> u32 {
+        self.mmio().cp_rb_wptr.load(Ordering::Acquire)
+    }
+
     /// Drain any PM4 packets currently exposed by the ring (i.e., up to
     /// the current `CP_RB_WPTR`). Inline mode runs the synchronous
     /// drain. Threaded mode posts a [`GpuCommand::DrainFence`] and blocks
diff --git a/crates/xenia-gpu/src/pm4.rs b/crates/xenia-gpu/src/pm4.rs
index 370b0ba..bb95eef 100644
--- a/crates/xenia-gpu/src/pm4.rs
+++ b/crates/xenia-gpu/src/pm4.rs
@@ -128,6 +128,27 @@ pub enum PacketKind {
     },
 }
 
+/// Build a Type-0 register-write packet header. Mirrors canary's
+/// `MakePacketType0` at `xenia-canary/src/xenia/gpu/xenos.h:1682`.
+/// `count` is the number of data dwords that follow (inclusive: 1..=0x4000).
+pub fn make_packet_type0(reg_index: u16, count: u16) -> u32 {
+    debug_assert!(reg_index <= 0x7FFF);
+    debug_assert!(count >= 1 && count as u32 <= 0x4000);
+    (0u32 << 30) | (((count as u32 - 1) & 0x3FFF) << 16) | (reg_index as u32 & 0x7FFF)
+}
+
+/// Build a Type-2 NOP packet header. Single dword, no payload.
+pub const fn make_packet_type2() -> u32 {
+    2u32 << 30
+}
+
+/// Build a Type-3 command packet header. Mirrors canary's `MakePacketType3`.
+/// `count` is the number of data dwords that follow (inclusive: 1..=0x4000).
+pub fn make_packet_type3(opcode: u8, count: u16) -> u32 {
+    debug_assert!(count >= 1 && count as u32 <= 0x4000);
+    (3u32 << 30) | (((count as u32 - 1) & 0x3FFF) << 16) | ((opcode as u32 & 0x7F) << 8)
+}
+
 /// Decode a single PM4 packet header.
 pub fn decode(header: u32) -> PacketHeader {
     match header >> 30 {
@@ -229,4 +250,37 @@ mod tests {
         assert_eq!(type3_opcode_name(PM4_WAIT_REG_MEM), "WAIT_REG_MEM");
         assert_eq!(type3_opcode_name(0xFE), "UNKNOWN");
     }
+
+    #[test]
+    fn make_packet_helpers_round_trip_through_decode() {
+        // Type-0: SHADER_CONSTANT_FETCH_00_0 (0x4800), count=6.
+        let t0 = make_packet_type0(0x4800, 6);
+        match decode(t0).kind {
+            PacketKind::Type0 { base_index, count, write_one } => {
+                assert_eq!(base_index, 0x4800);
+                assert_eq!(count, 6);
+                assert!(!write_one);
+            }
+            other => panic!("expected Type0, got {other:?}"),
+        }
+        assert_eq!(decode(t0).total_dwords, 7);
+
+        // Type-3: PM4_XE_SWAP, count=4 (signature + addr + W + H).
+        let t3 = make_packet_type3(PM4_XE_SWAP, 4);
+        match decode(t3).kind {
+            PacketKind::Type3 { opcode, count, predicated } => {
+                assert_eq!(opcode, PM4_XE_SWAP);
+                assert_eq!(count, 4);
+                assert!(!predicated);
+            }
+            other => panic!("expected Type3, got {other:?}"),
+        }
+        assert_eq!(decode(t3).total_dwords, 5);
+
+        // Type-2: NOP.
+        let t2 = make_packet_type2();
+        assert_eq!(t2, 0x8000_0000);
+        assert_eq!(decode(t2).kind, PacketKind::Type2);
+        assert_eq!(decode(t2).total_dwords, 1);
+    }
 }
diff --git a/crates/xenia-kernel/src/exports.rs b/crates/xenia-kernel/src/exports.rs
index 1e570a3..a4f34af 100644
--- a/crates/xenia-kernel/src/exports.rs
+++ b/crates/xenia-kernel/src/exports.rs
@@ -2210,6 +2210,14 @@ fn vd_initialize_ring_buffer(ctx: &mut PpcContext, _mem: &GuestMemory, state: &m
     let ptr = ctx.gpr[3] as u32;
     let size_log2 = ctx.gpr[4] as u32;
     state.gpu.initialize_ring_buffer(ptr, size_log2);
+    // Cache the ring layout on KernelState so `vd_swap` can write PM4
+    // packets directly into ring memory at the current WPTR (the GPU
+    // backend lives on a worker thread under `--gpu-thread` so we can't
+    // read its `ring.base` from the kernel side without a channel hop).
+    // Per canary: size_log2 is log2(size in BYTES), so size in dwords =
+    // 2^size_log2 / 4 = 1 << (size_log2 - 2).
+    state.ring_base = ptr;
+    state.ring_size_dwords = if size_log2 >= 2 { 1u32 << (size_log2 - 2) } else { 0 };
     ctx.gpr[3] = 0;
 }
 
@@ -2287,15 +2295,20 @@ fn vd_swap(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
     } else {
         (1280, 720)
     };
-    // The guest also writes the virtual frontbuffer address to *frontbuffer_ptr.
-    // Prefer that when the fetch-derived address is zero.
-    let frontbuffer_addr = if frontbuffer_virt != 0 {
+    // Translate frontbuffer virtual → physical. Per canary VdSwap_entry
+    // (xenia-canary/src/xenia/kernel/xboxkrnl/xboxkrnl_video.cc:468-471),
+    // the GPU consumes physical addresses; the fetch header carries a
+    // virtual address. KRNBUG-Mm-04: our MmGetPhysicalAddress is a masked
+    // stub; a `virt & 0x1FFF_FFFF` is the equivalent translation today.
+    let phys_mask: u32 = 0x1FFF_FFFF;
+    let frontbuffer_addr_virt = if frontbuffer_virt != 0 {
         frontbuffer_virt
     } else if frontbuffer_ptr != 0 {
         mem.read_u32(frontbuffer_ptr)
     } else {
         0
     };
+    let frontbuffer_addr = frontbuffer_addr_virt & phys_mask;
     let texture_format = if texture_format_ptr != 0 {
         mem.read_u32(texture_format_ptr)
     } else {
@@ -2307,37 +2320,87 @@ fn vd_swap(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
         0
     };
 
-    // First-Pixels M2b — two-part commit path.
-    //
-    // 1) Fill the guest's reserved 64-dword slot with PM4 Type-2 NOPs
-    //    (0x8000_0000). Some titles consume `buffer_ptr..+256` after
-    //    VdSwap returns and assume they're skippable. Matches the prior
-    //    behaviour.
-    if buffer_ptr != 0 {
-        for i in 0..64u32 {
-            mem.write_u32(buffer_ptr + i * 4, 0x8000_0000);
-        }
+    // Patch the fetch header's base_address from virtual to physical so the
+    // GPU's fetch-constant slot 0 sees the addressable frontbuffer. dword_1
+    // high 20 bits hold base_address (>>12). Mirrors canary at
+    // xenia-canary/src/xenia/kernel/xboxkrnl/xboxkrnl_video.cc:479.
+    if frontbuffer_addr != 0 {
+        let base_phys_shifted = frontbuffer_addr >> 12;
+        fetch_dwords[1] = (fetch_dwords[1] & 0x0000_0FFF) | (base_phys_shifted << 12);
     }
 
-    // 2) Advance the ring's write pointer by 64 dwords (the slot the
-    //    game "reserved" via VdSwap's buffer_ptr convention). Despite
-    //    `buffer_ptr` being in the system command buffer rather than the
-    //    primary ring, the 64-dword bump correctly exposes packets the
-    //    game wrote into the primary ring since our last `sync_with_mmio`.
-    //    Empirically (pre-M2b) this path drained 512 packets through 1 B
-    //    guest instructions — the setup packets that D3D9-init writes.
+    // KRNBUG-Vd-04 / GPUBUG-001 / XMODBUG-013: write a real PM4 sequence
+    // into the guest's reserved 64-dword ring slot, then let the natural
+    // CP drain consume PM4_XE_SWAP. The pre-fix path filled the slot with
+    // NOPs and called `notify_xe_swap` directly — bypassing the ring,
+    // leaving the PM4_XE_SWAP handler dead code, and skipping the
+    // fetch-constant-slot-0 patch that bloom/blur "sample frame N for
+    // frame N+1" paths depend on.
     //
-    //    M1.5: bump routes through the shared MMIO atomic so both
-    //    backends produce the same observable WPTR sequence. Inline
-    //    picks it up on its next `sync_with_mmio`; threaded's worker
-    //    observes the same atomic.
+    // Sequence per xenia-canary VdSwap_entry (xboxkrnl_video.cc:438-521):
+    //   1) PM4_TYPE0(SHADER_CONSTANT_FETCH_00_0=0x4800, count=6) + 6 dwords
+    //      (the patched fetch header).
+    //   2) PM4_TYPE3(PM4_XE_SWAP, count=4) + signature + frontbuffer_phys
+    //      + width + height.
+    //   3) PM4_TYPE2 NOP fill to slot end (64 dwords total).
+    let pre_swap_counter = state.gpu.digest_snapshot().stats.swaps_seen;
+
+    // The original M2b path zero-filled buffer_ptr (in the system command
+    // buffer) and bumped WPTR by 64 to expose the game's own ring writes.
+    // Keep that untouched — the game still expects buffer_ptr to be a
+    // skippable scratch area, and the bump still exposes any game-batched
+    // PM4 packets for the drain.
+    if buffer_ptr != 0 {
+        for i in 0..64u32 {
+            mem.write_u32(buffer_ptr + i * 4, xenia_gpu::pm4::make_packet_type2());
+        }
+    }
     state.gpu.extend_write_ptr_by(64);
 
-    // Drain the exposed packets — the D3D9-init setup the game batched
-    // into the ring plus any leftovers. The synthetic `PM4_XE_SWAP`
-    // packet the prior code wrote at `buffer_ptr` is **not** written
-    // anymore; the drain's `ring.base + rptr*4` walk couldn't find it
-    // anyway (see the pre-M2b `swaps=0 with packets=512` failure mode).
+    // KRNBUG-Vd-04 / GPUBUG-001 / XMODBUG-013: write the canary PM4
+    // sequence directly into the primary ring at the current WPTR (after
+    // the buffer_ptr bump above), then advance WPTR over our injection.
+    // Mirrors xenia-canary VdSwap_entry (xboxkrnl_video.cc:438-521):
+    //   1) PM4_TYPE0(SHADER_CONSTANT_FETCH_00_0, 6) + 6 dwords of patched
+    //      fetch header.
+    //   2) PM4_TYPE3(PM4_XE_SWAP, 4) + signature + frontbuffer_phys + W + H.
+    // The drain below picks these up via the natural CP path; the
+    // PM4_TYPE0 patches fetch-constant slot 0 with the frontbuffer's
+    // physical descriptor (Sylpheed's bloom/blur path samples this for
+    // frame N+1, so a stale slot 0 gives garbage).
+    let mut injected_dwords: u32 = 0;
+    if state.ring_base != 0 && state.ring_size_dwords != 0 {
+        const SHADER_CONSTANT_FETCH_00_0: u16 = 0x4800;
+        let packets: [u32; 12] = [
+            xenia_gpu::pm4::make_packet_type0(SHADER_CONSTANT_FETCH_00_0, 6),
+            fetch_dwords[0],
+            fetch_dwords[1],
+            fetch_dwords[2],
+            fetch_dwords[3],
+            fetch_dwords[4],
+            fetch_dwords[5],
+            xenia_gpu::pm4::make_packet_type3(xenia_gpu::pm4::PM4_XE_SWAP, 4),
+            xenia_gpu::pm4::SWAP_SIGNATURE,
+            frontbuffer_addr,
+            width,
+            height,
+        ];
+        // The MMIO WPTR is unmodulo'd; modulo by ring size to get the
+        // ring offset, then add base + offset*4 for the guest address.
+        let mmio_wptr = state.gpu.mmio_cp_rb_wptr_load();
+        for (i, dword) in packets.iter().enumerate() {
+            let ofs = (mmio_wptr.wrapping_add(i as u32)) % state.ring_size_dwords;
+            mem.write_u32(state.ring_base.wrapping_add(ofs.wrapping_mul(4)), *dword);
+        }
+        state.gpu.extend_write_ptr_by(packets.len() as u32);
+        injected_dwords = packets.len() as u32;
+    }
+
+    // Drain the exposed packets. The PM4_XE_SWAP we just wrote will be
+    // consumed by the existing handler at gpu_system.rs:1232, which calls
+    // `notify_xe_swap` itself — bumping `swaps_seen` and recording
+    // `last_swap` exactly as the old direct call did. Other packets in
+    // the slot (the fetch-constant patch, NOPs) update GPU state.
     //
     // M1.5: backend-aware drain. Inline: synchronous `sync_with_mmio + drain`.
     // Threaded: posts `DrainFence` + blocks on reply (1 s defensive timeout
@@ -2345,11 +2408,19 @@ fn vd_swap(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
     let drained = state.gpu.drain_to_current_wptr(mem);
     tracing::debug!(drained, "VdSwap: drained PM4 packets");
 
-    // 3) Fire the swap notification — bumps `swaps_seen`, records
-    //    `last_swap`, enqueues an `InterruptSource::Swap` interrupt for
-    //    the scheduler-round graphics callback path. M1.5: backend-aware;
-    //    threaded sends `NotifyXeSwap` (fire-and-forget).
-    if frontbuffer_addr != 0 && width > 0 && height > 0 {
+    // Safety net: if the drain didn't consume PM4_XE_SWAP for any reason
+    // (ring address arithmetic edge case, threaded-backend timing), fall
+    // back to the direct notify so swaps don't go to zero. Idempotent —
+    // only fires when the PM4 path didn't bump the counter.
+    let post_swap_counter = state.gpu.digest_snapshot().stats.swaps_seen;
+    if post_swap_counter == pre_swap_counter
+        && frontbuffer_addr != 0
+        && width > 0
+        && height > 0
+    {
+        tracing::warn!(
+            "VdSwap: PM4_XE_SWAP not consumed by drain (drained={drained}); falling back to direct notify"
+        );
         state.gpu.notify_xe_swap(frontbuffer_addr, width, height);
     }
 
@@ -2539,6 +2610,7 @@ fn vd_swap(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
         fmt = texture_format,
         cs = color_space,
         drained,
+        injected_dwords,
         buffer_ptr = format_args!("{buffer_ptr:#010x}"),
         fetch_ptr = format_args!("{fetch_ptr:#010x}"),
         "VdSwap complete"
diff --git a/crates/xenia-kernel/src/state.rs b/crates/xenia-kernel/src/state.rs
index c09dc63..04432f4 100644
--- a/crates/xenia-kernel/src/state.rs
+++ b/crates/xenia-kernel/src/state.rs
@@ -149,6 +149,12 @@ pub struct KernelState {
     /// `rtl_raise_exception` only emits once per run, regardless of how
     /// many subsequent throws fire. Reset on each fresh process start.
     pub cxx_throw_logged: bool,
+    /// Cached primary ring base/size, set during `VdInitializeRingBuffer`.
+    /// Used by `vd_swap` (KRNBUG-Vd-04) so the kernel can write PM4
+    /// packets directly into ring memory without going through the GPU
+    /// backend (which lives on the worker thread under `--gpu-thread`).
+    pub ring_base: u32,
+    pub ring_size_dwords: u32,
 }
 
 impl KernelState {
@@ -195,6 +201,8 @@ impl KernelState {
             reservations,
             thunks_by_ordinal: HashMap::new(),
             cxx_throw_logged: false,
+            ring_base: 0,
+            ring_size_dwords: 0,
         };
         crate::exports::register_exports(&mut state);
         crate::xam::register_exports(&mut state);