diff --git a/crates/xenia-app/tests/golden/sylpheed_n50m.json b/crates/xenia-app/tests/golden/sylpheed_n50m.json index 39dbaa8..7c165e7 100644 --- a/crates/xenia-app/tests/golden/sylpheed_n50m.json +++ b/crates/xenia-app/tests/golden/sylpheed_n50m.json @@ -1,9 +1,9 @@ { - "instructions": 50000013, - "imports": 451497, + "instructions": 50000001, + "imports": 451500, "unimpl": 0, "draws": 78, - "swaps": 3, + "swaps": 4, "unique_render_targets": 2, "shader_blobs_live": 3, "texture_cache_entries": 0 diff --git a/crates/xenia-gpu/src/gpu_system.rs b/crates/xenia-gpu/src/gpu_system.rs index fc254ce..18b0e70 100644 --- a/crates/xenia-gpu/src/gpu_system.rs +++ b/crates/xenia-gpu/src/gpu_system.rs @@ -726,10 +726,13 @@ impl GpuSystem { width, height, }); - self.pending_interrupts.push(PendingInterrupt { - source: InterruptSource::Swap, - cpu_mask: 0x1, - }); + // iterate-2T: do NOT raise a CP swap-complete interrupt here. Canary's + // `VdSwap`/PM4_XE_SWAP path raises no interrupt; swap-complete CP + // interrupts come ONLY from in-stream `PM4_INTERRUPT` packets, which + // are naturally ordered after D3D has armed the swap-callback slot. + // Synthesizing one out of band (as we did pre-2T) delivered a CP + // interrupt while the slot still held the `0xBADF00D` placeholder, + // tripping the graphics ISR's "Unanticipated CPU_INTERRUPT" assert. tracing::info!( frame = self.swap_counter, fb = format_args!("{frontbuffer_phys:#010x}"), diff --git a/crates/xenia-kernel/src/exports.rs b/crates/xenia-kernel/src/exports.rs index 7632cf1..df89131 100644 --- a/crates/xenia-kernel/src/exports.rs +++ b/crates/xenia-kernel/src/exports.rs @@ -2999,52 +2999,82 @@ fn vd_swap(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) { // xboxkrnl_video.cc:479. Currently skipped (see below). let _ = fetch_dwords; // silence unused — will be live again under the deferred path - // The original M2b path zero-filled buffer_ptr (in the system command - // buffer) and bumped WPTR by 64 to expose the game's own ring writes. - // Keep that untouched — the game still expects buffer_ptr to be a - // skippable scratch area, and the bump still exposes any game-batched - // PM4 packets for the drain. + // iterate-2T: mirror xenia-canary `VdSwap_entry` (xboxkrnl_video.cc:518-548) + // FAITHFULLY. The game reserves 64 dwords (256 bytes) in the primary ring + // at `buffer_ptr`; canary writes a `PM4_TYPE0(SHADER_CONSTANT_FETCH_00_0)` + // fetch-constant patch followed by `PM4_TYPE3(PM4_XE_SWAP)`, then pads with + // NOPs. We do the same, then bump WPTR by 64 so the drain consumes the + // PM4_XE_SWAP **in command-stream order** — i.e. AFTER any in-stream + // callback-arming Type-0 writes the game already queued. + // + // Why this matters (the iterate-2T root): the previous M2b short-circuit + // called `notify_xe_swap` directly from the HLE, which synthesized a CP + // swap-complete interrupt OUT OF BAND. When that interrupt reached the + // graphics ISR (`sub_824BE9A0`) before D3D had armed its swap-callback + // slot (`[gfx+10772]+16` still the `0xBADF00D` placeholder), the ISR hit + // its "ERR[D3D]: Unanticipated CPU_INTERRUPT. Sign of a corrupt command + // buffer?" assert (`twi` at 0x824BE9DC). Routing the swap through the ring + // packet keeps the interrupt naturally ordered after arming, matching + // canary (whose VdSwap raises NO interrupt itself; swap-complete CP + // interrupts come only from in-stream `PM4_INTERRUPT` packets). if buffer_ptr != 0 { - for i in 0..64u32 { - mem.write_u32(buffer_ptr + i * 4, xenia_gpu::pm4::make_packet_type2()); + let mut off = 0u32; + let mut put = |i: &mut u32, v: u32| { + mem.write_u32(buffer_ptr + *i * 4, v); + *i += 1; + }; + // PM4_TYPE0 fetch-constant slot-0 patch (6 dwords payload). The + // base_address field is patched to the physical frontbuffer so the + // bloom/blur "sample frame N for frame N+1" path reads the right page. + let mut patched = fetch_dwords; + patched[1] = (patched[1] & 0x0000_0FFF) | ((frontbuffer_addr >> 12) << 12); + put( + &mut off, + xenia_gpu::pm4::make_packet_type0( + xenia_gpu::gpu_system::CONST_BASE_FETCH as u16, + 6, + ), + ); + for d in patched { + put(&mut off, d); + } + // PM4_TYPE3(PM4_XE_SWAP, 4 dwords): signature, frontbuffer_phys, w, h. + put( + &mut off, + xenia_gpu::pm4::make_packet_type3(xenia_gpu::pm4::PM4_XE_SWAP, 4), + ); + put(&mut off, xenia_gpu::pm4::SWAP_SIGNATURE); + put(&mut off, frontbuffer_addr); + put(&mut off, width); + put(&mut off, height); + // Pad the remainder with NOP (Type-2) packets. + while off < 64 { + put(&mut off, xenia_gpu::pm4::make_packet_type2()); } } state.gpu.extend_write_ptr_by(64); - // GPUBUG-DRAIN-001: notify the swap directly. - // - // Per xenia-canary `VdSwap_entry` (xboxkrnl_video.cc:438-521), the - // textbook approach is to inject `PM4_TYPE0(SHADER_CONSTANT_FETCH_00_0)` - // (fetch-constant slot-0 patch for the Sylpheed bloom/blur "frame N+1" - // sample) followed by `PM4_TYPE3(PM4_XE_SWAP)` directly into the - // primary ring at WPTR, then let the natural drain consume them. - // - // That works in **pure lockstep** (drain runs at every kernel callback - // boundary, ring has at most a few hundred packets pending). It - // **does not** work under `--parallel` (CPU + GPU ring contention) — - // observed empirically: vd_swap's `drain_to_current_wptr` consumes - // 8-10 million game-batched IB packets in the 900 ms inline-deadline - // window without reaching our tail-injected PM4_XE_SWAP. Under - // threaded backend the worker has the same deadline. Either: - // (a) the safety-net direct notify (below) fires and gets the swap - // counted — but if the worker *eventually* drains past our - // injected packet later it would double-count, - // (b) we extend the deadline so far that vd_swap blocks for many - // seconds — unreasonable for a kernel callback. - // - // Skip the ring injection unconditionally and post `notify_xe_swap` - // directly. The drain still runs (game packets execute as normal). - // **Trade-off**: the slot-0 fetch-constant patch is deferred — - // tracked as GPUBUG-FETCH-PATCH-001. Sylpheed currently has draws=0, - // so a stale slot 0 has no observable effect. + // Drain the ring; the PM4_XE_SWAP we just queued (and any in-stream + // PM4_INTERRUPT) executes in order. The PM4_XE_SWAP handler calls + // `notify_xe_swap` for host swap bookkeeping; no synthetic interrupt is + // raised (see `notify_xe_swap`). let drained = state.gpu.drain_to_current_wptr(mem); tracing::debug!(drained, "VdSwap: drained PM4 packets"); - // Direct swap notification. Inline mode bumps `swaps_seen` - // synchronously; threaded mode posts a `GpuCommand::NotifyXeSwap` - // and the worker bumps it asynchronously. + // Safety net: if the drain did NOT reach our PM4_XE_SWAP this call (e.g. + // an undersized inline deadline left game-batched packets pending), still + // bump the host swap counter so the UI present + swap stats stay live. + // Skip when the in-stream PM4_XE_SWAP already recorded this frontbuffer + // (avoids double-counting). This path does NOT raise a CP interrupt. if frontbuffer_addr != 0 && width > 0 && height > 0 { - state.gpu.notify_xe_swap(frontbuffer_addr, width, height); + let already_swapped = state + .gpu + .as_inline_mut() + .map(|g| g.last_swap.map(|s| s.frontbuffer_phys) == Some(frontbuffer_addr)) + .unwrap_or(false); + if !already_swapped { + state.gpu.notify_xe_swap(frontbuffer_addr, width, height); + } } // The remaining vd_swap work (UI publish: shader blobs, constants,