|
|
|
@@ -2999,53 +2999,83 @@ fn vd_swap(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
|
|
|
|
// xboxkrnl_video.cc:479. Currently skipped (see below).
|
|
|
|
// xboxkrnl_video.cc:479. Currently skipped (see below).
|
|
|
|
let _ = fetch_dwords; // silence unused — will be live again under the deferred path
|
|
|
|
let _ = fetch_dwords; // silence unused — will be live again under the deferred path
|
|
|
|
|
|
|
|
|
|
|
|
// The original M2b path zero-filled buffer_ptr (in the system command
|
|
|
|
// iterate-2T: mirror xenia-canary `VdSwap_entry` (xboxkrnl_video.cc:518-548)
|
|
|
|
// buffer) and bumped WPTR by 64 to expose the game's own ring writes.
|
|
|
|
// FAITHFULLY. The game reserves 64 dwords (256 bytes) in the primary ring
|
|
|
|
// Keep that untouched — the game still expects buffer_ptr to be a
|
|
|
|
// at `buffer_ptr`; canary writes a `PM4_TYPE0(SHADER_CONSTANT_FETCH_00_0)`
|
|
|
|
// skippable scratch area, and the bump still exposes any game-batched
|
|
|
|
// fetch-constant patch followed by `PM4_TYPE3(PM4_XE_SWAP)`, then pads with
|
|
|
|
// PM4 packets for the drain.
|
|
|
|
// NOPs. We do the same, then bump WPTR by 64 so the drain consumes the
|
|
|
|
|
|
|
|
// PM4_XE_SWAP **in command-stream order** — i.e. AFTER any in-stream
|
|
|
|
|
|
|
|
// callback-arming Type-0 writes the game already queued.
|
|
|
|
|
|
|
|
//
|
|
|
|
|
|
|
|
// Why this matters (the iterate-2T root): the previous M2b short-circuit
|
|
|
|
|
|
|
|
// called `notify_xe_swap` directly from the HLE, which synthesized a CP
|
|
|
|
|
|
|
|
// swap-complete interrupt OUT OF BAND. When that interrupt reached the
|
|
|
|
|
|
|
|
// graphics ISR (`sub_824BE9A0`) before D3D had armed its swap-callback
|
|
|
|
|
|
|
|
// slot (`[gfx+10772]+16` still the `0xBADF00D` placeholder), the ISR hit
|
|
|
|
|
|
|
|
// its "ERR[D3D]: Unanticipated CPU_INTERRUPT. Sign of a corrupt command
|
|
|
|
|
|
|
|
// buffer?" assert (`twi` at 0x824BE9DC). Routing the swap through the ring
|
|
|
|
|
|
|
|
// packet keeps the interrupt naturally ordered after arming, matching
|
|
|
|
|
|
|
|
// canary (whose VdSwap raises NO interrupt itself; swap-complete CP
|
|
|
|
|
|
|
|
// interrupts come only from in-stream `PM4_INTERRUPT` packets).
|
|
|
|
if buffer_ptr != 0 {
|
|
|
|
if buffer_ptr != 0 {
|
|
|
|
for i in 0..64u32 {
|
|
|
|
let mut off = 0u32;
|
|
|
|
mem.write_u32(buffer_ptr + i * 4, xenia_gpu::pm4::make_packet_type2());
|
|
|
|
let mut put = |i: &mut u32, v: u32| {
|
|
|
|
|
|
|
|
mem.write_u32(buffer_ptr + *i * 4, v);
|
|
|
|
|
|
|
|
*i += 1;
|
|
|
|
|
|
|
|
};
|
|
|
|
|
|
|
|
// PM4_TYPE0 fetch-constant slot-0 patch (6 dwords payload). The
|
|
|
|
|
|
|
|
// base_address field is patched to the physical frontbuffer so the
|
|
|
|
|
|
|
|
// bloom/blur "sample frame N for frame N+1" path reads the right page.
|
|
|
|
|
|
|
|
let mut patched = fetch_dwords;
|
|
|
|
|
|
|
|
patched[1] = (patched[1] & 0x0000_0FFF) | ((frontbuffer_addr >> 12) << 12);
|
|
|
|
|
|
|
|
put(
|
|
|
|
|
|
|
|
&mut off,
|
|
|
|
|
|
|
|
xenia_gpu::pm4::make_packet_type0(
|
|
|
|
|
|
|
|
xenia_gpu::gpu_system::CONST_BASE_FETCH as u16,
|
|
|
|
|
|
|
|
6,
|
|
|
|
|
|
|
|
),
|
|
|
|
|
|
|
|
);
|
|
|
|
|
|
|
|
for d in patched {
|
|
|
|
|
|
|
|
put(&mut off, d);
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
// PM4_TYPE3(PM4_XE_SWAP, 4 dwords): signature, frontbuffer_phys, w, h.
|
|
|
|
|
|
|
|
put(
|
|
|
|
|
|
|
|
&mut off,
|
|
|
|
|
|
|
|
xenia_gpu::pm4::make_packet_type3(xenia_gpu::pm4::PM4_XE_SWAP, 4),
|
|
|
|
|
|
|
|
);
|
|
|
|
|
|
|
|
put(&mut off, xenia_gpu::pm4::SWAP_SIGNATURE);
|
|
|
|
|
|
|
|
put(&mut off, frontbuffer_addr);
|
|
|
|
|
|
|
|
put(&mut off, width);
|
|
|
|
|
|
|
|
put(&mut off, height);
|
|
|
|
|
|
|
|
// Pad the remainder with NOP (Type-2) packets.
|
|
|
|
|
|
|
|
while off < 64 {
|
|
|
|
|
|
|
|
put(&mut off, xenia_gpu::pm4::make_packet_type2());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
state.gpu.extend_write_ptr_by(64);
|
|
|
|
state.gpu.extend_write_ptr_by(64);
|
|
|
|
|
|
|
|
|
|
|
|
// GPUBUG-DRAIN-001: notify the swap directly.
|
|
|
|
// Drain the ring; the PM4_XE_SWAP we just queued (and any in-stream
|
|
|
|
//
|
|
|
|
// PM4_INTERRUPT) executes in order. The PM4_XE_SWAP handler calls
|
|
|
|
// Per xenia-canary `VdSwap_entry` (xboxkrnl_video.cc:438-521), the
|
|
|
|
// `notify_xe_swap` for host swap bookkeeping; no synthetic interrupt is
|
|
|
|
// textbook approach is to inject `PM4_TYPE0(SHADER_CONSTANT_FETCH_00_0)`
|
|
|
|
// raised (see `notify_xe_swap`).
|
|
|
|
// (fetch-constant slot-0 patch for the Sylpheed bloom/blur "frame N+1"
|
|
|
|
|
|
|
|
// sample) followed by `PM4_TYPE3(PM4_XE_SWAP)` directly into the
|
|
|
|
|
|
|
|
// primary ring at WPTR, then let the natural drain consume them.
|
|
|
|
|
|
|
|
//
|
|
|
|
|
|
|
|
// That works in **pure lockstep** (drain runs at every kernel callback
|
|
|
|
|
|
|
|
// boundary, ring has at most a few hundred packets pending). It
|
|
|
|
|
|
|
|
// **does not** work under `--parallel` (CPU + GPU ring contention) —
|
|
|
|
|
|
|
|
// observed empirically: vd_swap's `drain_to_current_wptr` consumes
|
|
|
|
|
|
|
|
// 8-10 million game-batched IB packets in the 900 ms inline-deadline
|
|
|
|
|
|
|
|
// window without reaching our tail-injected PM4_XE_SWAP. Under
|
|
|
|
|
|
|
|
// threaded backend the worker has the same deadline. Either:
|
|
|
|
|
|
|
|
// (a) the safety-net direct notify (below) fires and gets the swap
|
|
|
|
|
|
|
|
// counted — but if the worker *eventually* drains past our
|
|
|
|
|
|
|
|
// injected packet later it would double-count,
|
|
|
|
|
|
|
|
// (b) we extend the deadline so far that vd_swap blocks for many
|
|
|
|
|
|
|
|
// seconds — unreasonable for a kernel callback.
|
|
|
|
|
|
|
|
//
|
|
|
|
|
|
|
|
// Skip the ring injection unconditionally and post `notify_xe_swap`
|
|
|
|
|
|
|
|
// directly. The drain still runs (game packets execute as normal).
|
|
|
|
|
|
|
|
// **Trade-off**: the slot-0 fetch-constant patch is deferred —
|
|
|
|
|
|
|
|
// tracked as GPUBUG-FETCH-PATCH-001. Sylpheed currently has draws=0,
|
|
|
|
|
|
|
|
// so a stale slot 0 has no observable effect.
|
|
|
|
|
|
|
|
let drained = state.gpu.drain_to_current_wptr(mem);
|
|
|
|
let drained = state.gpu.drain_to_current_wptr(mem);
|
|
|
|
tracing::debug!(drained, "VdSwap: drained PM4 packets");
|
|
|
|
tracing::debug!(drained, "VdSwap: drained PM4 packets");
|
|
|
|
|
|
|
|
|
|
|
|
// Direct swap notification. Inline mode bumps `swaps_seen`
|
|
|
|
// Safety net: if the drain did NOT reach our PM4_XE_SWAP this call (e.g.
|
|
|
|
// synchronously; threaded mode posts a `GpuCommand::NotifyXeSwap`
|
|
|
|
// an undersized inline deadline left game-batched packets pending), still
|
|
|
|
// and the worker bumps it asynchronously.
|
|
|
|
// bump the host swap counter so the UI present + swap stats stay live.
|
|
|
|
|
|
|
|
// Skip when the in-stream PM4_XE_SWAP already recorded this frontbuffer
|
|
|
|
|
|
|
|
// (avoids double-counting). This path does NOT raise a CP interrupt.
|
|
|
|
if frontbuffer_addr != 0 && width > 0 && height > 0 {
|
|
|
|
if frontbuffer_addr != 0 && width > 0 && height > 0 {
|
|
|
|
|
|
|
|
let already_swapped = state
|
|
|
|
|
|
|
|
.gpu
|
|
|
|
|
|
|
|
.as_inline_mut()
|
|
|
|
|
|
|
|
.map(|g| g.last_swap.map(|s| s.frontbuffer_phys) == Some(frontbuffer_addr))
|
|
|
|
|
|
|
|
.unwrap_or(false);
|
|
|
|
|
|
|
|
if !already_swapped {
|
|
|
|
state.gpu.notify_xe_swap(frontbuffer_addr, width, height);
|
|
|
|
state.gpu.notify_xe_swap(frontbuffer_addr, width, height);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// The remaining vd_swap work (UI publish: shader blobs, constants,
|
|
|
|
// The remaining vd_swap work (UI publish: shader blobs, constants,
|
|
|
|
// texture cache, frontbuffer detile, ui.notify_swap) reads
|
|
|
|
// texture cache, frontbuffer detile, ui.notify_swap) reads
|
|
|
|
|