Merge audit-2026-05-fix/renderer-p0-vdswap-pm4: VdSwap PM4 ring path

2026-05-03 14:00:27 +02:00
parent 0590bffdd9 82f3d611e2
commit 1b74db6fa7
4 changed files with 176 additions and 34 deletions
--- a/crates/xenia-gpu/src/handle.rs
+++ b/crates/xenia-gpu/src/handle.rs
@@ -358,6 +358,14 @@ impl GpuBackend {
            .store(cur.wrapping_add(dwords), Ordering::Release);
    }
    /// Read the current ring write pointer from the shared MMIO atomic.
    /// Acquire-load so any prior ring-memory writes the caller did before
    /// asking are not reordered past this read by the compiler. Used by
    /// `vd_swap` to compute the ring offset for direct PM4 injection.
    pub fn mmio_cp_rb_wptr_load(&self) -> u32 {
        self.mmio().cp_rb_wptr.load(Ordering::Acquire)
    }
    /// Drain any PM4 packets currently exposed by the ring (i.e., up to
    /// the current `CP_RB_WPTR`). Inline mode runs the synchronous
    /// drain. Threaded mode posts a [`GpuCommand::DrainFence`] and blocks
--- a/crates/xenia-gpu/src/pm4.rs
+++ b/crates/xenia-gpu/src/pm4.rs
@@ -128,6 +128,27 @@ pub enum PacketKind {
    },
 }
 /// Build a Type-0 register-write packet header. Mirrors canary's
 /// `MakePacketType0` at `xenia-canary/src/xenia/gpu/xenos.h:1682`.
 /// `count` is the number of data dwords that follow (inclusive: 1..=0x4000).
 pub fn make_packet_type0(reg_index: u16, count: u16) -> u32 {
    debug_assert!(reg_index <= 0x7FFF);
    debug_assert!(count >= 1 && count as u32 <= 0x4000);
    (0u32 << 30) | (((count as u32 - 1) & 0x3FFF) << 16) | (reg_index as u32 & 0x7FFF)
 }
 /// Build a Type-2 NOP packet header. Single dword, no payload.
 pub const fn make_packet_type2() -> u32 {
    2u32 << 30
 }
 /// Build a Type-3 command packet header. Mirrors canary's `MakePacketType3`.
 /// `count` is the number of data dwords that follow (inclusive: 1..=0x4000).
 pub fn make_packet_type3(opcode: u8, count: u16) -> u32 {
    debug_assert!(count >= 1 && count as u32 <= 0x4000);
    (3u32 << 30) | (((count as u32 - 1) & 0x3FFF) << 16) | ((opcode as u32 & 0x7F) << 8)
 }
 /// Decode a single PM4 packet header.
 pub fn decode(header: u32) -> PacketHeader {
    match header >> 30 {
@@ -229,4 +250,37 @@ mod tests {
        assert_eq!(type3_opcode_name(PM4_WAIT_REG_MEM), "WAIT_REG_MEM");
        assert_eq!(type3_opcode_name(0xFE), "UNKNOWN");
    }
    #[test]
    fn make_packet_helpers_round_trip_through_decode() {
        // Type-0: SHADER_CONSTANT_FETCH_00_0 (0x4800), count=6.
        let t0 = make_packet_type0(0x4800, 6);
        match decode(t0).kind {
            PacketKind::Type0 { base_index, count, write_one } => {
                assert_eq!(base_index, 0x4800);
                assert_eq!(count, 6);
                assert!(!write_one);
            }
            other => panic!("expected Type0, got {other:?}"),
        }
        assert_eq!(decode(t0).total_dwords, 7);
        // Type-3: PM4_XE_SWAP, count=4 (signature + addr + W + H).
        let t3 = make_packet_type3(PM4_XE_SWAP, 4);
        match decode(t3).kind {
            PacketKind::Type3 { opcode, count, predicated } => {
                assert_eq!(opcode, PM4_XE_SWAP);
                assert_eq!(count, 4);
                assert!(!predicated);
            }
            other => panic!("expected Type3, got {other:?}"),
        }
        assert_eq!(decode(t3).total_dwords, 5);
        // Type-2: NOP.
        let t2 = make_packet_type2();
        assert_eq!(t2, 0x8000_0000);
        assert_eq!(decode(t2).kind, PacketKind::Type2);
        assert_eq!(decode(t2).total_dwords, 1);
    }
 }
--- a/crates/xenia-kernel/src/exports.rs
+++ b/crates/xenia-kernel/src/exports.rs
@@ -2210,6 +2210,14 @@ fn vd_initialize_ring_buffer(ctx: &mut PpcContext, _mem: &GuestMemory, state: &m
    let ptr = ctx.gpr[3] as u32;
    let size_log2 = ctx.gpr[4] as u32;
    state.gpu.initialize_ring_buffer(ptr, size_log2);
    // Cache the ring layout on KernelState so `vd_swap` can write PM4
    // packets directly into ring memory at the current WPTR (the GPU
    // backend lives on a worker thread under `--gpu-thread` so we can't
    // read its `ring.base` from the kernel side without a channel hop).
    // Per canary: size_log2 is log2(size in BYTES), so size in dwords =
    // 2^size_log2 / 4 = 1 << (size_log2 - 2).
    state.ring_base = ptr;
    state.ring_size_dwords = if size_log2 >= 2 { 1u32 << (size_log2 - 2) } else { 0 };
    ctx.gpr[3] = 0;
 }
@@ -2287,15 +2295,20 @@ fn vd_swap(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
    } else {
        (1280, 720)
    };
-    // The guest also writes the virtual frontbuffer address to *frontbuffer_ptr.
+    // Translate frontbuffer virtual → physical. Per canary VdSwap_entry
-    // Prefer that when the fetch-derived address is zero.
+    // (xenia-canary/src/xenia/kernel/xboxkrnl/xboxkrnl_video.cc:468-471),
-    let frontbuffer_addr = if frontbuffer_virt != 0 {
+    // the GPU consumes physical addresses; the fetch header carries a
    // virtual address. KRNBUG-Mm-04: our MmGetPhysicalAddress is a masked
    // stub; a `virt & 0x1FFF_FFFF` is the equivalent translation today.
    let phys_mask: u32 = 0x1FFF_FFFF;
    let frontbuffer_addr_virt = if frontbuffer_virt != 0 {
        frontbuffer_virt
    } else if frontbuffer_ptr != 0 {
        mem.read_u32(frontbuffer_ptr)
    } else {
        0
    };
    let frontbuffer_addr = frontbuffer_addr_virt & phys_mask;
    let texture_format = if texture_format_ptr != 0 {
        mem.read_u32(texture_format_ptr)
    } else {
@@ -2307,37 +2320,87 @@ fn vd_swap(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
        0
    };
-    // First-Pixels M2b — two-part commit path.
+    // Patch the fetch header's base_address from virtual to physical so the
-    //
+    // GPU's fetch-constant slot 0 sees the addressable frontbuffer. dword_1
-    // 1) Fill the guest's reserved 64-dword slot with PM4 Type-2 NOPs
+    // high 20 bits hold base_address (>>12). Mirrors canary at
-    //    (0x8000_0000). Some titles consume `buffer_ptr..+256` after
+    // xenia-canary/src/xenia/kernel/xboxkrnl/xboxkrnl_video.cc:479.
-    //    VdSwap returns and assume they're skippable. Matches the prior
+    if frontbuffer_addr != 0 {
-    //    behaviour.
+        let base_phys_shifted = frontbuffer_addr >> 12;
-    if buffer_ptr != 0 {
+        fetch_dwords[1] = (fetch_dwords[1] & 0x0000_0FFF) | (base_phys_shifted << 12);
        for i in 0..64u32 {
            mem.write_u32(buffer_ptr + i * 4, 0x8000_0000);
        }
    }
-    // 2) Advance the ring's write pointer by 64 dwords (the slot the
+    // KRNBUG-Vd-04 / GPUBUG-001 / XMODBUG-013: write a real PM4 sequence
-    //    game "reserved" via VdSwap's buffer_ptr convention). Despite
+    // into the guest's reserved 64-dword ring slot, then let the natural
-    //    `buffer_ptr` being in the system command buffer rather than the
+    // CP drain consume PM4_XE_SWAP. The pre-fix path filled the slot with
-    //    primary ring, the 64-dword bump correctly exposes packets the
+    // NOPs and called `notify_xe_swap` directly — bypassing the ring,
-    //    game wrote into the primary ring since our last `sync_with_mmio`.
+    // leaving the PM4_XE_SWAP handler dead code, and skipping the
-    //    Empirically (pre-M2b) this path drained 512 packets through 1 B
+    // fetch-constant-slot-0 patch that bloom/blur "sample frame N for
-    //    guest instructions — the setup packets that D3D9-init writes.
+    // frame N+1" paths depend on.
    //
-    //    M1.5: bump routes through the shared MMIO atomic so both
+    // Sequence per xenia-canary VdSwap_entry (xboxkrnl_video.cc:438-521):
-    //    backends produce the same observable WPTR sequence. Inline
+    //   1) PM4_TYPE0(SHADER_CONSTANT_FETCH_00_0=0x4800, count=6) + 6 dwords
-    //    picks it up on its next `sync_with_mmio`; threaded's worker
+    //      (the patched fetch header).
-    //    observes the same atomic.
+    //   2) PM4_TYPE3(PM4_XE_SWAP, count=4) + signature + frontbuffer_phys
    //      + width + height.
    //   3) PM4_TYPE2 NOP fill to slot end (64 dwords total).
    let pre_swap_counter = state.gpu.digest_snapshot().stats.swaps_seen;
    // The original M2b path zero-filled buffer_ptr (in the system command
    // buffer) and bumped WPTR by 64 to expose the game's own ring writes.
    // Keep that untouched — the game still expects buffer_ptr to be a
    // skippable scratch area, and the bump still exposes any game-batched
    // PM4 packets for the drain.
    if buffer_ptr != 0 {
        for i in 0..64u32 {
            mem.write_u32(buffer_ptr + i * 4, xenia_gpu::pm4::make_packet_type2());
        }
    }
    state.gpu.extend_write_ptr_by(64);
-    // Drain the exposed packets — the D3D9-init setup the game batched
+    // KRNBUG-Vd-04 / GPUBUG-001 / XMODBUG-013: write the canary PM4
-    // into the ring plus any leftovers. The synthetic `PM4_XE_SWAP`
+    // sequence directly into the primary ring at the current WPTR (after
-    // packet the prior code wrote at `buffer_ptr` is **not** written
+    // the buffer_ptr bump above), then advance WPTR over our injection.
-    // anymore; the drain's `ring.base + rptr*4` walk couldn't find it
+    // Mirrors xenia-canary VdSwap_entry (xboxkrnl_video.cc:438-521):
-    // anyway (see the pre-M2b `swaps=0 with packets=512` failure mode).
+    //   1) PM4_TYPE0(SHADER_CONSTANT_FETCH_00_0, 6) + 6 dwords of patched
    //      fetch header.
    //   2) PM4_TYPE3(PM4_XE_SWAP, 4) + signature + frontbuffer_phys + W + H.
    // The drain below picks these up via the natural CP path; the
    // PM4_TYPE0 patches fetch-constant slot 0 with the frontbuffer's
    // physical descriptor (Sylpheed's bloom/blur path samples this for
    // frame N+1, so a stale slot 0 gives garbage).
    let mut injected_dwords: u32 = 0;
    if state.ring_base != 0 && state.ring_size_dwords != 0 {
        const SHADER_CONSTANT_FETCH_00_0: u16 = 0x4800;
        let packets: [u32; 12] = [
            xenia_gpu::pm4::make_packet_type0(SHADER_CONSTANT_FETCH_00_0, 6),
            fetch_dwords[0],
            fetch_dwords[1],
            fetch_dwords[2],
            fetch_dwords[3],
            fetch_dwords[4],
            fetch_dwords[5],
            xenia_gpu::pm4::make_packet_type3(xenia_gpu::pm4::PM4_XE_SWAP, 4),
            xenia_gpu::pm4::SWAP_SIGNATURE,
            frontbuffer_addr,
            width,
            height,
        ];
        // The MMIO WPTR is unmodulo'd; modulo by ring size to get the
        // ring offset, then add base + offset*4 for the guest address.
        let mmio_wptr = state.gpu.mmio_cp_rb_wptr_load();
        for (i, dword) in packets.iter().enumerate() {
            let ofs = (mmio_wptr.wrapping_add(i as u32)) % state.ring_size_dwords;
            mem.write_u32(state.ring_base.wrapping_add(ofs.wrapping_mul(4)), *dword);
        }
        state.gpu.extend_write_ptr_by(packets.len() as u32);
        injected_dwords = packets.len() as u32;
    }
    // Drain the exposed packets. The PM4_XE_SWAP we just wrote will be
    // consumed by the existing handler at gpu_system.rs:1232, which calls
    // `notify_xe_swap` itself — bumping `swaps_seen` and recording
    // `last_swap` exactly as the old direct call did. Other packets in
    // the slot (the fetch-constant patch, NOPs) update GPU state.
    //
    // M1.5: backend-aware drain. Inline: synchronous `sync_with_mmio + drain`.
    // Threaded: posts `DrainFence` + blocks on reply (1 s defensive timeout
@@ -2345,11 +2408,19 @@ fn vd_swap(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
    let drained = state.gpu.drain_to_current_wptr(mem);
    tracing::debug!(drained, "VdSwap: drained PM4 packets");
-    // 3) Fire the swap notification — bumps `swaps_seen`, records
+    // Safety net: if the drain didn't consume PM4_XE_SWAP for any reason
-    //    `last_swap`, enqueues an `InterruptSource::Swap` interrupt for
+    // (ring address arithmetic edge case, threaded-backend timing), fall
-    //    the scheduler-round graphics callback path. M1.5: backend-aware;
+    // back to the direct notify so swaps don't go to zero. Idempotent —
-    //    threaded sends `NotifyXeSwap` (fire-and-forget).
+    // only fires when the PM4 path didn't bump the counter.
-    if frontbuffer_addr != 0 && width > 0 && height > 0 {
+    let post_swap_counter = state.gpu.digest_snapshot().stats.swaps_seen;
    if post_swap_counter == pre_swap_counter
        && frontbuffer_addr != 0
        && width > 0
        && height > 0
    {
        tracing::warn!(
            "VdSwap: PM4_XE_SWAP not consumed by drain (drained={drained}); falling back to direct notify"
        );
        state.gpu.notify_xe_swap(frontbuffer_addr, width, height);
    }
@@ -2539,6 +2610,7 @@ fn vd_swap(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
        fmt = texture_format,
        cs = color_space,
        drained,
        injected_dwords,
        buffer_ptr = format_args!("{buffer_ptr:#010x}"),
        fetch_ptr = format_args!("{fetch_ptr:#010x}"),
        "VdSwap complete"
--- a/crates/xenia-kernel/src/state.rs
+++ b/crates/xenia-kernel/src/state.rs
@@ -149,6 +149,12 @@ pub struct KernelState {
    /// `rtl_raise_exception` only emits once per run, regardless of how
    /// many subsequent throws fire. Reset on each fresh process start.
    pub cxx_throw_logged: bool,
    /// Cached primary ring base/size, set during `VdInitializeRingBuffer`.
    /// Used by `vd_swap` (KRNBUG-Vd-04) so the kernel can write PM4
    /// packets directly into ring memory without going through the GPU
    /// backend (which lives on the worker thread under `--gpu-thread`).
    pub ring_base: u32,
    pub ring_size_dwords: u32,
 }
 impl KernelState {
@@ -195,6 +201,8 @@ impl KernelState {
            reservations,
            thunks_by_ordinal: HashMap::new(),
            cxx_throw_logged: false,
            ring_base: 0,
            ring_size_dwords: 0,
        };
        crate::exports::register_exports(&mut state);
        crate::xam::register_exports(&mut state);