diff --git a/crates/xenia-gpu/src/handle.rs b/crates/xenia-gpu/src/handle.rs
index 4205f77..86607cf 100644
--- a/crates/xenia-gpu/src/handle.rs
+++ b/crates/xenia-gpu/src/handle.rs
@@ -358,6 +358,14 @@ impl GpuBackend {
             .store(cur.wrapping_add(dwords), Ordering::Release);
     }
 
+    /// Read the current ring write pointer from the shared MMIO atomic.
+    /// Acquire-load so any prior ring-memory writes the caller did before
+    /// asking are not reordered past this read by the compiler. Used by
+    /// `vd_swap` to compute the ring offset for direct PM4 injection.
+    pub fn mmio_cp_rb_wptr_load(&self) -> u32 {
+        self.mmio().cp_rb_wptr.load(Ordering::Acquire)
+    }
+
     /// Drain any PM4 packets currently exposed by the ring (i.e., up to
     /// the current `CP_RB_WPTR`). Inline mode runs the synchronous
     /// drain. Threaded mode posts a [`GpuCommand::DrainFence`] and blocks
diff --git a/crates/xenia-gpu/src/pm4.rs b/crates/xenia-gpu/src/pm4.rs
index 370b0ba..bb95eef 100644
--- a/crates/xenia-gpu/src/pm4.rs
+++ b/crates/xenia-gpu/src/pm4.rs
@@ -128,6 +128,27 @@ pub enum PacketKind {
     },
 }
 
+/// Build a Type-0 register-write packet header. Mirrors canary's
+/// `MakePacketType0` at `xenia-canary/src/xenia/gpu/xenos.h:1682`.
+/// `count` is the number of data dwords that follow (inclusive: 1..=0x4000).
+pub fn make_packet_type0(reg_index: u16, count: u16) -> u32 {
+    debug_assert!(reg_index <= 0x7FFF);
+    debug_assert!(count >= 1 && count as u32 <= 0x4000);
+    (0u32 << 30) | (((count as u32 - 1) & 0x3FFF) << 16) | (reg_index as u32 & 0x7FFF)
+}
+
+/// Build a Type-2 NOP packet header. Single dword, no payload.
+pub const fn make_packet_type2() -> u32 {
+    2u32 << 30
+}
+
+/// Build a Type-3 command packet header. Mirrors canary's `MakePacketType3`.
+/// `count` is the number of data dwords that follow (inclusive: 1..=0x4000).
+pub fn make_packet_type3(opcode: u8, count: u16) -> u32 {
+    debug_assert!(count >= 1 && count as u32 <= 0x4000);
+    (3u32 << 30) | (((count as u32 - 1) & 0x3FFF) << 16) | ((opcode as u32 & 0x7F) << 8)
+}
+
 /// Decode a single PM4 packet header.
 pub fn decode(header: u32) -> PacketHeader {
     match header >> 30 {
@@ -229,4 +250,37 @@ mod tests {
         assert_eq!(type3_opcode_name(PM4_WAIT_REG_MEM), "WAIT_REG_MEM");
         assert_eq!(type3_opcode_name(0xFE), "UNKNOWN");
     }
+
+    #[test]
+    fn make_packet_helpers_round_trip_through_decode() {
+        // Type-0: SHADER_CONSTANT_FETCH_00_0 (0x4800), count=6.
+        let t0 = make_packet_type0(0x4800, 6);
+        match decode(t0).kind {
+            PacketKind::Type0 { base_index, count, write_one } => {
+                assert_eq!(base_index, 0x4800);
+                assert_eq!(count, 6);
+                assert!(!write_one);
+            }
+            other => panic!("expected Type0, got {other:?}"),
+        }
+        assert_eq!(decode(t0).total_dwords, 7);
+
+        // Type-3: PM4_XE_SWAP, count=4 (signature + addr + W + H).
+        let t3 = make_packet_type3(PM4_XE_SWAP, 4);
+        match decode(t3).kind {
+            PacketKind::Type3 { opcode, count, predicated } => {
+                assert_eq!(opcode, PM4_XE_SWAP);
+                assert_eq!(count, 4);
+                assert!(!predicated);
+            }
+            other => panic!("expected Type3, got {other:?}"),
+        }
+        assert_eq!(decode(t3).total_dwords, 5);
+
+        // Type-2: NOP.
+        let t2 = make_packet_type2();
+        assert_eq!(t2, 0x8000_0000);
+        assert_eq!(decode(t2).kind, PacketKind::Type2);
+        assert_eq!(decode(t2).total_dwords, 1);
+    }
 }
diff --git a/crates/xenia-kernel/src/exports.rs b/crates/xenia-kernel/src/exports.rs
index 1e570a3..a4f34af 100644
--- a/crates/xenia-kernel/src/exports.rs
+++ b/crates/xenia-kernel/src/exports.rs
@@ -2210,6 +2210,14 @@ fn vd_initialize_ring_buffer(ctx: &mut PpcContext, _mem: &GuestMemory, state: &m
     let ptr = ctx.gpr[3] as u32;
     let size_log2 = ctx.gpr[4] as u32;
     state.gpu.initialize_ring_buffer(ptr, size_log2);
+    // Cache the ring layout on KernelState so `vd_swap` can write PM4
+    // packets directly into ring memory at the current WPTR (the GPU
+    // backend lives on a worker thread under `--gpu-thread` so we can't
+    // read its `ring.base` from the kernel side without a channel hop).
+    // Per canary: size_log2 is log2(size in BYTES), so size in dwords =
+    // 2^size_log2 / 4 = 1 << (size_log2 - 2).
+    state.ring_base = ptr;
+    state.ring_size_dwords = if size_log2 >= 2 { 1u32 << (size_log2 - 2) } else { 0 };
     ctx.gpr[3] = 0;
 }
 
@@ -2287,15 +2295,20 @@ fn vd_swap(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
     } else {
         (1280, 720)
     };
-    // The guest also writes the virtual frontbuffer address to *frontbuffer_ptr.
-    // Prefer that when the fetch-derived address is zero.
-    let frontbuffer_addr = if frontbuffer_virt != 0 {
+    // Translate frontbuffer virtual → physical. Per canary VdSwap_entry
+    // (xenia-canary/src/xenia/kernel/xboxkrnl/xboxkrnl_video.cc:468-471),
+    // the GPU consumes physical addresses; the fetch header carries a
+    // virtual address. KRNBUG-Mm-04: our MmGetPhysicalAddress is a masked
+    // stub; a `virt & 0x1FFF_FFFF` is the equivalent translation today.
+    let phys_mask: u32 = 0x1FFF_FFFF;
+    let frontbuffer_addr_virt = if frontbuffer_virt != 0 {
         frontbuffer_virt
     } else if frontbuffer_ptr != 0 {
         mem.read_u32(frontbuffer_ptr)
     } else {
         0
     };
+    let frontbuffer_addr = frontbuffer_addr_virt & phys_mask;
     let texture_format = if texture_format_ptr != 0 {
         mem.read_u32(texture_format_ptr)
     } else {
@@ -2307,37 +2320,87 @@ fn vd_swap(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
         0
     };
 
-    // First-Pixels M2b — two-part commit path.
-    //
-    // 1) Fill the guest's reserved 64-dword slot with PM4 Type-2 NOPs
-    //    (0x8000_0000). Some titles consume `buffer_ptr..+256` after
-    //    VdSwap returns and assume they're skippable. Matches the prior
-    //    behaviour.
-    if buffer_ptr != 0 {
-        for i in 0..64u32 {
-            mem.write_u32(buffer_ptr + i * 4, 0x8000_0000);
-        }
+    // Patch the fetch header's base_address from virtual to physical so the
+    // GPU's fetch-constant slot 0 sees the addressable frontbuffer. dword_1
+    // high 20 bits hold base_address (>>12). Mirrors canary at
+    // xenia-canary/src/xenia/kernel/xboxkrnl/xboxkrnl_video.cc:479.
+    if frontbuffer_addr != 0 {
+        let base_phys_shifted = frontbuffer_addr >> 12;
+        fetch_dwords[1] = (fetch_dwords[1] & 0x0000_0FFF) | (base_phys_shifted << 12);
     }
 
-    // 2) Advance the ring's write pointer by 64 dwords (the slot the
-    //    game "reserved" via VdSwap's buffer_ptr convention). Despite
-    //    `buffer_ptr` being in the system command buffer rather than the
-    //    primary ring, the 64-dword bump correctly exposes packets the
-    //    game wrote into the primary ring since our last `sync_with_mmio`.
-    //    Empirically (pre-M2b) this path drained 512 packets through 1 B
-    //    guest instructions — the setup packets that D3D9-init writes.
+    // KRNBUG-Vd-04 / GPUBUG-001 / XMODBUG-013: write a real PM4 sequence
+    // into the guest's reserved 64-dword ring slot, then let the natural
+    // CP drain consume PM4_XE_SWAP. The pre-fix path filled the slot with
+    // NOPs and called `notify_xe_swap` directly — bypassing the ring,
+    // leaving the PM4_XE_SWAP handler dead code, and skipping the
+    // fetch-constant-slot-0 patch that bloom/blur "sample frame N for
+    // frame N+1" paths depend on.
     //
-    //    M1.5: bump routes through the shared MMIO atomic so both
-    //    backends produce the same observable WPTR sequence. Inline
-    //    picks it up on its next `sync_with_mmio`; threaded's worker
-    //    observes the same atomic.
+    // Sequence per xenia-canary VdSwap_entry (xboxkrnl_video.cc:438-521):
+    //   1) PM4_TYPE0(SHADER_CONSTANT_FETCH_00_0=0x4800, count=6) + 6 dwords
+    //      (the patched fetch header).
+    //   2) PM4_TYPE3(PM4_XE_SWAP, count=4) + signature + frontbuffer_phys
+    //      + width + height.
+    //   3) PM4_TYPE2 NOP fill to slot end (64 dwords total).
+    let pre_swap_counter = state.gpu.digest_snapshot().stats.swaps_seen;
+
+    // The original M2b path zero-filled buffer_ptr (in the system command
+    // buffer) and bumped WPTR by 64 to expose the game's own ring writes.
+    // Keep that untouched — the game still expects buffer_ptr to be a
+    // skippable scratch area, and the bump still exposes any game-batched
+    // PM4 packets for the drain.
+    if buffer_ptr != 0 {
+        for i in 0..64u32 {
+            mem.write_u32(buffer_ptr + i * 4, xenia_gpu::pm4::make_packet_type2());
+        }
+    }
     state.gpu.extend_write_ptr_by(64);
 
-    // Drain the exposed packets — the D3D9-init setup the game batched
-    // into the ring plus any leftovers. The synthetic `PM4_XE_SWAP`
-    // packet the prior code wrote at `buffer_ptr` is **not** written
-    // anymore; the drain's `ring.base + rptr*4` walk couldn't find it
-    // anyway (see the pre-M2b `swaps=0 with packets=512` failure mode).
+    // KRNBUG-Vd-04 / GPUBUG-001 / XMODBUG-013: write the canary PM4
+    // sequence directly into the primary ring at the current WPTR (after
+    // the buffer_ptr bump above), then advance WPTR over our injection.
+    // Mirrors xenia-canary VdSwap_entry (xboxkrnl_video.cc:438-521):
+    //   1) PM4_TYPE0(SHADER_CONSTANT_FETCH_00_0, 6) + 6 dwords of patched
+    //      fetch header.
+    //   2) PM4_TYPE3(PM4_XE_SWAP, 4) + signature + frontbuffer_phys + W + H.
+    // The drain below picks these up via the natural CP path; the
+    // PM4_TYPE0 patches fetch-constant slot 0 with the frontbuffer's
+    // physical descriptor (Sylpheed's bloom/blur path samples this for
+    // frame N+1, so a stale slot 0 gives garbage).
+    let mut injected_dwords: u32 = 0;
+    if state.ring_base != 0 && state.ring_size_dwords != 0 {
+        const SHADER_CONSTANT_FETCH_00_0: u16 = 0x4800;
+        let packets: [u32; 12] = [
+            xenia_gpu::pm4::make_packet_type0(SHADER_CONSTANT_FETCH_00_0, 6),
+            fetch_dwords[0],
+            fetch_dwords[1],
+            fetch_dwords[2],
+            fetch_dwords[3],
+            fetch_dwords[4],
+            fetch_dwords[5],
+            xenia_gpu::pm4::make_packet_type3(xenia_gpu::pm4::PM4_XE_SWAP, 4),
+            xenia_gpu::pm4::SWAP_SIGNATURE,
+            frontbuffer_addr,
+            width,
+            height,
+        ];
+        // The MMIO WPTR is unmodulo'd; modulo by ring size to get the
+        // ring offset, then add base + offset*4 for the guest address.
+        let mmio_wptr = state.gpu.mmio_cp_rb_wptr_load();
+        for (i, dword) in packets.iter().enumerate() {
+            let ofs = (mmio_wptr.wrapping_add(i as u32)) % state.ring_size_dwords;
+            mem.write_u32(state.ring_base.wrapping_add(ofs.wrapping_mul(4)), *dword);
+        }
+        state.gpu.extend_write_ptr_by(packets.len() as u32);
+        injected_dwords = packets.len() as u32;
+    }
+
+    // Drain the exposed packets. The PM4_XE_SWAP we just wrote will be
+    // consumed by the existing handler at gpu_system.rs:1232, which calls
+    // `notify_xe_swap` itself — bumping `swaps_seen` and recording
+    // `last_swap` exactly as the old direct call did. Other packets in
+    // the slot (the fetch-constant patch, NOPs) update GPU state.
     //
     // M1.5: backend-aware drain. Inline: synchronous `sync_with_mmio + drain`.
     // Threaded: posts `DrainFence` + blocks on reply (1 s defensive timeout
@@ -2345,11 +2408,19 @@ fn vd_swap(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
     let drained = state.gpu.drain_to_current_wptr(mem);
     tracing::debug!(drained, "VdSwap: drained PM4 packets");
 
-    // 3) Fire the swap notification — bumps `swaps_seen`, records
-    //    `last_swap`, enqueues an `InterruptSource::Swap` interrupt for
-    //    the scheduler-round graphics callback path. M1.5: backend-aware;
-    //    threaded sends `NotifyXeSwap` (fire-and-forget).
-    if frontbuffer_addr != 0 && width > 0 && height > 0 {
+    // Safety net: if the drain didn't consume PM4_XE_SWAP for any reason
+    // (ring address arithmetic edge case, threaded-backend timing), fall
+    // back to the direct notify so swaps don't go to zero. Idempotent —
+    // only fires when the PM4 path didn't bump the counter.
+    let post_swap_counter = state.gpu.digest_snapshot().stats.swaps_seen;
+    if post_swap_counter == pre_swap_counter
+        && frontbuffer_addr != 0
+        && width > 0
+        && height > 0
+    {
+        tracing::warn!(
+            "VdSwap: PM4_XE_SWAP not consumed by drain (drained={drained}); falling back to direct notify"
+        );
         state.gpu.notify_xe_swap(frontbuffer_addr, width, height);
     }
 
@@ -2539,6 +2610,7 @@ fn vd_swap(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
         fmt = texture_format,
         cs = color_space,
         drained,
+        injected_dwords,
         buffer_ptr = format_args!("{buffer_ptr:#010x}"),
         fetch_ptr = format_args!("{fetch_ptr:#010x}"),
         "VdSwap complete"
diff --git a/crates/xenia-kernel/src/state.rs b/crates/xenia-kernel/src/state.rs
index c09dc63..04432f4 100644
--- a/crates/xenia-kernel/src/state.rs
+++ b/crates/xenia-kernel/src/state.rs
@@ -149,6 +149,12 @@ pub struct KernelState {
     /// `rtl_raise_exception` only emits once per run, regardless of how
     /// many subsequent throws fire. Reset on each fresh process start.
     pub cxx_throw_logged: bool,
+    /// Cached primary ring base/size, set during `VdInitializeRingBuffer`.
+    /// Used by `vd_swap` (KRNBUG-Vd-04) so the kernel can write PM4
+    /// packets directly into ring memory without going through the GPU
+    /// backend (which lives on the worker thread under `--gpu-thread`).
+    pub ring_base: u32,
+    pub ring_size_dwords: u32,
 }
 
 impl KernelState {
@@ -195,6 +201,8 @@ impl KernelState {
             reservations,
             thunks_by_ordinal: HashMap::new(),
             cxx_throw_logged: false,
+            ring_base: 0,
+            ring_size_dwords: 0,
         };
         crate::exports::register_exports(&mut state);
         crate::xam::register_exports(&mut state);