Merge audit-2026-05-fix/p2-vdswap-parallel-fallback (GPUBUG-DRAIN-001)

This commit is contained in:
MechaCat02
2026-05-03 17:12:19 +02:00
3 changed files with 121 additions and 86 deletions

View File

@@ -18,6 +18,7 @@
use std::collections::HashMap; use std::collections::HashMap;
use std::sync::Arc; use std::sync::Arc;
use std::sync::atomic::{AtomicBool, AtomicU32, Ordering}; use std::sync::atomic::{AtomicBool, AtomicU32, Ordering};
use std::time::{Duration, Instant};
use xenia_memory::MemoryAccess; use xenia_memory::MemoryAccess;
@@ -1280,6 +1281,63 @@ impl GpuSystem {
} }
n n
} }
/// Drain until the ring's read offset reaches `target_wptr` (modulo ring
/// size) or `execute_one` returns Idle/Blocked. Mirrors canary's
/// `WorkerThreadMain` (xenia-canary `command_processor.cc` ExecutePrimaryBuffer)
/// which loops on `read_ptr_index_ != write_ptr_index` with no packet
/// budget. `time_budget` bounds wall-clock so a pathological packet
/// (e.g. an EVENT_WRITE that perpetually re-blocks) cannot spin the
/// inline path; pass 900 ms to match the threaded `DrainFence` deadline.
/// Returns the number of packets consumed.
pub fn drain_until_wptr(
&mut self,
mem: &dyn MemoryAccess,
target_wptr: u32,
time_budget: Duration,
) -> u32 {
if self.ring.size_dwords == 0 {
return 0;
}
let target = target_wptr % self.ring.size_dwords;
let deadline = Instant::now() + time_budget;
let mut n = 0u32;
while self.ring.read_offset_dwords != target {
if Instant::now() >= deadline {
// Deadline exhaustion is the *expected* outcome under
// `--parallel` workloads (Sylpheed boot queues millions
// of game-batched IBs the inline drain can't chew
// through in 900 ms). Logged at debug because warn-level
// would fire on every vd_swap. Callers can re-read the
// ring read pointer to detect partial drain if they
// care.
tracing::debug!(
target,
rptr = self.ring.read_offset_dwords,
consumed = n,
"gpu: drain_until_wptr time-budget exhausted"
);
break;
}
match self.execute_one(mem) {
ExecOutcome::Stepped { .. } => {
n += 1;
// Mirror the threaded `DrainFence` handler at
// handle.rs:553-570: re-sync after every packet so
// any concurrent guest WPTR write (under `--parallel`)
// folds into the local ring view before the next
// `is_ready` check. Without this the local
// write_offset is a snapshot of the moment we entered
// the drain, which is fine for a target-WPTR drain
// but wrong if downstream packets (e.g. an indirect
// buffer's nested ring) need an updated view.
self.sync_with_mmio();
}
ExecOutcome::Idle | ExecOutcome::Blocked => break,
}
}
n
}
} }
impl Default for GpuSystem { impl Default for GpuSystem {

View File

@@ -381,7 +381,16 @@ impl GpuBackend {
match self { match self {
GpuBackend::Inline(s) => { GpuBackend::Inline(s) => {
s.sync_with_mmio(); s.sync_with_mmio();
s.drain(mem, 4096) // GPUBUG-DRAIN-001: drain until target WPTR is reached (or
// 900 ms deadline), mirroring canary's `WorkerThreadMain` and
// the threaded `DrainFence` semantics. The previous fixed
// 4096-packet budget hit the cap under `--parallel`, where
// CPU runs many more PPC blocks per kernel-callback boundary
// and the ring backs up past 4096 packets before vd_swap
// fires; the safety-net fallback warning fired twice for
// each Sylpheed run.
let target = s.mmio.cp_rb_wptr.load(Ordering::Acquire);
s.drain_until_wptr(mem, target, Duration::from_millis(900))
} }
GpuBackend::Threaded(h) => { GpuBackend::Threaded(h) => {
let target_wptr = h.mmio.cp_rb_wptr.load(Ordering::Acquire); let target_wptr = h.mmio.cp_rb_wptr.load(Ordering::Acquire);
@@ -563,6 +572,23 @@ impl GpuWorker {
ExecOutcome::Idle | ExecOutcome::Blocked => break, ExecOutcome::Idle | ExecOutcome::Blocked => break,
} }
} }
// GPUBUG-DRAIN-001: publish the digest mirror BEFORE
// replying so the CPU's post-drain `digest_snapshot`
// observes the `swaps_seen` bump from any
// PM4_XE_SWAP we just consumed. Without this the
// outer-loop publish at step 5b races the CPU's
// post_swap_counter check and the kernel-side
// `vd_swap` fires the "PM4_XE_SWAP not consumed"
// safety-net warning even when the swap landed.
let snap = GpuDigestSnapshot {
stats: self.system.stats.clone(),
shader_blobs_live: self.system.shader_blobs.len() as u64,
texture_cache_entries: self.system.texture_cache.len() as u64,
texture_decodes: self.system.texture_cache.decodes_total,
};
if let Ok(mut g) = self.digest.lock() {
*g = snap;
}
let _ = reply_tx.send(()); let _ = reply_tx.send(());
} }
GpuCommand::NotifyXeSwap { GpuCommand::NotifyXeSwap {

View File

@@ -2370,30 +2370,13 @@ fn vd_swap(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
0 0
}; };
// Patch the fetch header's base_address from virtual to physical so the // GPUBUG-FETCH-PATCH-001 (deferred): if/when the PM4_TYPE0 injection
// GPU's fetch-constant slot 0 sees the addressable frontbuffer. dword_1 // path is re-enabled, also patch `fetch_dwords[1]` here:
// high 20 bits hold base_address (>>12). Mirrors canary at // fetch_dwords[1] = (fetch_dwords[1] & 0x0000_0FFF) | ((frontbuffer_addr >> 12) << 12);
// xenia-canary/src/xenia/kernel/xboxkrnl/xboxkrnl_video.cc:479. // That carries the slot-0 fetch-constant for the Sylpheed bloom/blur
if frontbuffer_addr != 0 { // "sample frame N for frame N+1" path. Mirrors `xenia-canary` at
let base_phys_shifted = frontbuffer_addr >> 12; // xboxkrnl_video.cc:479. Currently skipped (see below).
fetch_dwords[1] = (fetch_dwords[1] & 0x0000_0FFF) | (base_phys_shifted << 12); let _ = fetch_dwords; // silence unused — will be live again under the deferred path
}
// KRNBUG-Vd-04 / GPUBUG-001 / XMODBUG-013: write a real PM4 sequence
// into the guest's reserved 64-dword ring slot, then let the natural
// CP drain consume PM4_XE_SWAP. The pre-fix path filled the slot with
// NOPs and called `notify_xe_swap` directly — bypassing the ring,
// leaving the PM4_XE_SWAP handler dead code, and skipping the
// fetch-constant-slot-0 patch that bloom/blur "sample frame N for
// frame N+1" paths depend on.
//
// Sequence per xenia-canary VdSwap_entry (xboxkrnl_video.cc:438-521):
// 1) PM4_TYPE0(SHADER_CONSTANT_FETCH_00_0=0x4800, count=6) + 6 dwords
// (the patched fetch header).
// 2) PM4_TYPE3(PM4_XE_SWAP, count=4) + signature + frontbuffer_phys
// + width + height.
// 3) PM4_TYPE2 NOP fill to slot end (64 dwords total).
let pre_swap_counter = state.gpu.digest_snapshot().stats.swaps_seen;
// The original M2b path zero-filled buffer_ptr (in the system command // The original M2b path zero-filled buffer_ptr (in the system command
// buffer) and bumped WPTR by 64 to expose the game's own ring writes. // buffer) and bumped WPTR by 64 to expose the game's own ring writes.
@@ -2407,70 +2390,39 @@ fn vd_swap(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
} }
state.gpu.extend_write_ptr_by(64); state.gpu.extend_write_ptr_by(64);
// KRNBUG-Vd-04 / GPUBUG-001 / XMODBUG-013: write the canary PM4 // GPUBUG-DRAIN-001: notify the swap directly.
// sequence directly into the primary ring at the current WPTR (after
// the buffer_ptr bump above), then advance WPTR over our injection.
// Mirrors xenia-canary VdSwap_entry (xboxkrnl_video.cc:438-521):
// 1) PM4_TYPE0(SHADER_CONSTANT_FETCH_00_0, 6) + 6 dwords of patched
// fetch header.
// 2) PM4_TYPE3(PM4_XE_SWAP, 4) + signature + frontbuffer_phys + W + H.
// The drain below picks these up via the natural CP path; the
// PM4_TYPE0 patches fetch-constant slot 0 with the frontbuffer's
// physical descriptor (Sylpheed's bloom/blur path samples this for
// frame N+1, so a stale slot 0 gives garbage).
let mut injected_dwords: u32 = 0;
if state.ring_base != 0 && state.ring_size_dwords != 0 {
const SHADER_CONSTANT_FETCH_00_0: u16 = 0x4800;
let packets: [u32; 12] = [
xenia_gpu::pm4::make_packet_type0(SHADER_CONSTANT_FETCH_00_0, 6),
fetch_dwords[0],
fetch_dwords[1],
fetch_dwords[2],
fetch_dwords[3],
fetch_dwords[4],
fetch_dwords[5],
xenia_gpu::pm4::make_packet_type3(xenia_gpu::pm4::PM4_XE_SWAP, 4),
xenia_gpu::pm4::SWAP_SIGNATURE,
frontbuffer_addr,
width,
height,
];
// The MMIO WPTR is unmodulo'd; modulo by ring size to get the
// ring offset, then add base + offset*4 for the guest address.
let mmio_wptr = state.gpu.mmio_cp_rb_wptr_load();
for (i, dword) in packets.iter().enumerate() {
let ofs = (mmio_wptr.wrapping_add(i as u32)) % state.ring_size_dwords;
mem.write_u32(state.ring_base.wrapping_add(ofs.wrapping_mul(4)), *dword);
}
state.gpu.extend_write_ptr_by(packets.len() as u32);
injected_dwords = packets.len() as u32;
}
// Drain the exposed packets. The PM4_XE_SWAP we just wrote will be
// consumed by the existing handler at gpu_system.rs:1232, which calls
// `notify_xe_swap` itself — bumping `swaps_seen` and recording
// `last_swap` exactly as the old direct call did. Other packets in
// the slot (the fetch-constant patch, NOPs) update GPU state.
// //
// M1.5: backend-aware drain. Inline: synchronous `sync_with_mmio + drain`. // Per xenia-canary `VdSwap_entry` (xboxkrnl_video.cc:438-521), the
// Threaded: posts `DrainFence` + blocks on reply (1 s defensive timeout // textbook approach is to inject `PM4_TYPE0(SHADER_CONSTANT_FETCH_00_0)`
// on CPU; 900 ms internal deadline on worker). // (fetch-constant slot-0 patch for the Sylpheed bloom/blur "frame N+1"
// sample) followed by `PM4_TYPE3(PM4_XE_SWAP)` directly into the
// primary ring at WPTR, then let the natural drain consume them.
//
// That works in **pure lockstep** (drain runs at every kernel callback
// boundary, ring has at most a few hundred packets pending). It
// **does not** work under `--parallel` (CPU + GPU ring contention) —
// observed empirically: vd_swap's `drain_to_current_wptr` consumes
// 8-10 million game-batched IB packets in the 900 ms inline-deadline
// window without reaching our tail-injected PM4_XE_SWAP. Under
// threaded backend the worker has the same deadline. Either:
// (a) the safety-net direct notify (below) fires and gets the swap
// counted — but if the worker *eventually* drains past our
// injected packet later it would double-count,
// (b) we extend the deadline so far that vd_swap blocks for many
// seconds — unreasonable for a kernel callback.
//
// Skip the ring injection unconditionally and post `notify_xe_swap`
// directly. The drain still runs (game packets execute as normal).
// **Trade-off**: the slot-0 fetch-constant patch is deferred —
// tracked as GPUBUG-FETCH-PATCH-001. Sylpheed currently has draws=0,
// so a stale slot 0 has no observable effect.
let drained = state.gpu.drain_to_current_wptr(mem); let drained = state.gpu.drain_to_current_wptr(mem);
tracing::debug!(drained, "VdSwap: drained PM4 packets"); tracing::debug!(drained, "VdSwap: drained PM4 packets");
// Safety net: if the drain didn't consume PM4_XE_SWAP for any reason // Direct swap notification. Inline mode bumps `swaps_seen`
// (ring address arithmetic edge case, threaded-backend timing), fall // synchronously; threaded mode posts a `GpuCommand::NotifyXeSwap`
// back to the direct notify so swaps don't go to zero. Idempotent — // and the worker bumps it asynchronously.
// only fires when the PM4 path didn't bump the counter. if frontbuffer_addr != 0 && width > 0 && height > 0 {
let post_swap_counter = state.gpu.digest_snapshot().stats.swaps_seen;
if post_swap_counter == pre_swap_counter
&& frontbuffer_addr != 0
&& width > 0
&& height > 0
{
tracing::warn!(
"VdSwap: PM4_XE_SWAP not consumed by drain (drained={drained}); falling back to direct notify"
);
state.gpu.notify_xe_swap(frontbuffer_addr, width, height); state.gpu.notify_xe_swap(frontbuffer_addr, width, height);
} }
@@ -2660,7 +2612,6 @@ fn vd_swap(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
fmt = texture_format, fmt = texture_format,
cs = color_space, cs = color_space,
drained, drained,
injected_dwords,
buffer_ptr = format_args!("{buffer_ptr:#010x}"), buffer_ptr = format_args!("{buffer_ptr:#010x}"),
fetch_ptr = format_args!("{fetch_ptr:#010x}"), fetch_ptr = format_args!("{fetch_ptr:#010x}"),
"VdSwap complete" "VdSwap complete"