diff --git a/crates/xenia-gpu/src/handle.rs b/crates/xenia-gpu/src/handle.rs index 4205f77..86607cf 100644 --- a/crates/xenia-gpu/src/handle.rs +++ b/crates/xenia-gpu/src/handle.rs @@ -358,6 +358,14 @@ impl GpuBackend { .store(cur.wrapping_add(dwords), Ordering::Release); } + /// Read the current ring write pointer from the shared MMIO atomic. + /// Acquire-load: pairs with Release stores to the same atomic, so reads + /// and writes issued after this load are not reordered before it. Used by + /// `vd_swap` to compute the ring offset for direct PM4 injection. + pub fn mmio_cp_rb_wptr_load(&self) -> u32 { + self.mmio().cp_rb_wptr.load(Ordering::Acquire) + } + /// Drain any PM4 packets currently exposed by the ring (i.e., up to /// the current `CP_RB_WPTR`). Inline mode runs the synchronous /// drain. Threaded mode posts a [`GpuCommand::DrainFence`] and blocks diff --git a/crates/xenia-gpu/src/pm4.rs b/crates/xenia-gpu/src/pm4.rs index 370b0ba..bb95eef 100644 --- a/crates/xenia-gpu/src/pm4.rs +++ b/crates/xenia-gpu/src/pm4.rs @@ -128,6 +128,27 @@ pub enum PacketKind { }, } +/// Build a Type-0 register-write packet header. Mirrors canary's +/// `MakePacketType0` at `xenia-canary/src/xenia/gpu/xenos.h:1682`. +/// `count` is the number of data dwords that follow (inclusive: 1..=0x4000). +pub fn make_packet_type0(reg_index: u16, count: u16) -> u32 { + debug_assert!(reg_index <= 0x7FFF); + debug_assert!(count >= 1 && count as u32 <= 0x4000); + (0u32 << 30) | (((count as u32 - 1) & 0x3FFF) << 16) | (reg_index as u32 & 0x7FFF) +} + +/// Build a Type-2 NOP packet header. Single dword, no payload. +pub const fn make_packet_type2() -> u32 { + 2u32 << 30 +} + +/// Build a Type-3 command packet header. Mirrors canary's `MakePacketType3`. +/// `count` is the number of data dwords that follow (inclusive: 1..=0x4000). 
+pub fn make_packet_type3(opcode: u8, count: u16) -> u32 { + debug_assert!(count >= 1 && count as u32 <= 0x4000); + (3u32 << 30) | (((count as u32 - 1) & 0x3FFF) << 16) | ((opcode as u32 & 0x7F) << 8) +} + /// Decode a single PM4 packet header. pub fn decode(header: u32) -> PacketHeader { match header >> 30 { @@ -229,4 +250,37 @@ mod tests { assert_eq!(type3_opcode_name(PM4_WAIT_REG_MEM), "WAIT_REG_MEM"); assert_eq!(type3_opcode_name(0xFE), "UNKNOWN"); } + + #[test] + fn make_packet_helpers_round_trip_through_decode() { + // Type-0: SHADER_CONSTANT_FETCH_00_0 (0x4800), count=6. + let t0 = make_packet_type0(0x4800, 6); + match decode(t0).kind { + PacketKind::Type0 { base_index, count, write_one } => { + assert_eq!(base_index, 0x4800); + assert_eq!(count, 6); + assert!(!write_one); + } + other => panic!("expected Type0, got {other:?}"), + } + assert_eq!(decode(t0).total_dwords, 7); + + // Type-3: PM4_XE_SWAP, count=4 (signature + addr + W + H). + let t3 = make_packet_type3(PM4_XE_SWAP, 4); + match decode(t3).kind { + PacketKind::Type3 { opcode, count, predicated } => { + assert_eq!(opcode, PM4_XE_SWAP); + assert_eq!(count, 4); + assert!(!predicated); + } + other => panic!("expected Type3, got {other:?}"), + } + assert_eq!(decode(t3).total_dwords, 5); + + // Type-2: NOP. 
+ let t2 = make_packet_type2(); + assert_eq!(t2, 0x8000_0000); + assert_eq!(decode(t2).kind, PacketKind::Type2); + assert_eq!(decode(t2).total_dwords, 1); + } } diff --git a/crates/xenia-kernel/src/exports.rs b/crates/xenia-kernel/src/exports.rs index 1e570a3..a4f34af 100644 --- a/crates/xenia-kernel/src/exports.rs +++ b/crates/xenia-kernel/src/exports.rs @@ -2210,6 +2210,14 @@ fn vd_initialize_ring_buffer(ctx: &mut PpcContext, _mem: &GuestMemory, state: &m let ptr = ctx.gpr[3] as u32; let size_log2 = ctx.gpr[4] as u32; state.gpu.initialize_ring_buffer(ptr, size_log2); + // Cache the ring layout on KernelState so `vd_swap` can write PM4 + // packets directly into ring memory at the current WPTR (the GPU + // backend lives on a worker thread under `--gpu-thread` so we can't + // read its `ring.base` from the kernel side without a channel hop). + // Per canary: size_log2 is log2(size in BYTES), so size in dwords = + // 2^size_log2 / 4 = 1 << (size_log2 - 2). + state.ring_base = ptr; + state.ring_size_dwords = if size_log2 >= 2 { 1u32 << (size_log2 - 2) } else { 0 }; ctx.gpr[3] = 0; } @@ -2287,15 +2295,20 @@ fn vd_swap(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) { } else { (1280, 720) }; - // The guest also writes the virtual frontbuffer address to *frontbuffer_ptr. - // Prefer that when the fetch-derived address is zero. - let frontbuffer_addr = if frontbuffer_virt != 0 { + // Translate frontbuffer virtual → physical. Per canary VdSwap_entry + // (xenia-canary/src/xenia/kernel/xboxkrnl/xboxkrnl_video.cc:468-471), + // the GPU consumes physical addresses; the fetch header carries a + // virtual address. KRNBUG-Mm-04: our MmGetPhysicalAddress is a masked + // stub; a `virt & 0x1FFF_FFFF` is the equivalent translation today. 
+ let phys_mask: u32 = 0x1FFF_FFFF; + let frontbuffer_addr_virt = if frontbuffer_virt != 0 { frontbuffer_virt } else if frontbuffer_ptr != 0 { mem.read_u32(frontbuffer_ptr) } else { 0 }; + let frontbuffer_addr = frontbuffer_addr_virt & phys_mask; let texture_format = if texture_format_ptr != 0 { mem.read_u32(texture_format_ptr) } else { @@ -2307,37 +2320,87 @@ fn vd_swap(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) { 0 }; - // First-Pixels M2b — two-part commit path. - // - // 1) Fill the guest's reserved 64-dword slot with PM4 Type-2 NOPs - // (0x8000_0000). Some titles consume `buffer_ptr..+256` after - // VdSwap returns and assume they're skippable. Matches the prior - // behaviour. - if buffer_ptr != 0 { - for i in 0..64u32 { - mem.write_u32(buffer_ptr + i * 4, 0x8000_0000); - } + // Patch the fetch header's base_address from virtual to physical so the + // GPU's fetch-constant slot 0 sees the addressable frontbuffer. dword_1 + // high 20 bits hold base_address (>>12). Mirrors canary at + // xenia-canary/src/xenia/kernel/xboxkrnl/xboxkrnl_video.cc:479. + if frontbuffer_addr != 0 { + let base_phys_shifted = frontbuffer_addr >> 12; + fetch_dwords[1] = (fetch_dwords[1] & 0x0000_0FFF) | (base_phys_shifted << 12); } - // 2) Advance the ring's write pointer by 64 dwords (the slot the - // game "reserved" via VdSwap's buffer_ptr convention). Despite - // `buffer_ptr` being in the system command buffer rather than the - // primary ring, the 64-dword bump correctly exposes packets the - // game wrote into the primary ring since our last `sync_with_mmio`. - // Empirically (pre-M2b) this path drained 512 packets through 1 B - // guest instructions — the setup packets that D3D9-init writes. + // KRNBUG-Vd-04 / GPUBUG-001 / XMODBUG-013: write a real PM4 sequence + // into the guest's reserved 64-dword ring slot, then let the natural + // CP drain consume PM4_XE_SWAP. 
The pre-fix path filled the slot with + // NOPs and called `notify_xe_swap` directly — bypassing the ring, + // leaving the PM4_XE_SWAP handler dead code, and skipping the + // fetch-constant-slot-0 patch that bloom/blur "sample frame N for + // frame N+1" paths depend on. // - // M1.5: bump routes through the shared MMIO atomic so both - // backends produce the same observable WPTR sequence. Inline - // picks it up on its next `sync_with_mmio`; threaded's worker - // observes the same atomic. + // Sequence per xenia-canary VdSwap_entry (xboxkrnl_video.cc:438-521): + // 1) PM4_TYPE0(SHADER_CONSTANT_FETCH_00_0=0x4800, count=6) + 6 dwords + // (the patched fetch header). + // 2) PM4_TYPE3(PM4_XE_SWAP, count=4) + signature + frontbuffer_phys + // + width + height. + // 3) PM4_TYPE2 NOP fill to slot end (64 dwords total). + let pre_swap_counter = state.gpu.digest_snapshot().stats.swaps_seen; + + // The original M2b path NOP-filled buffer_ptr (in the system command + // buffer) and bumped WPTR by 64 to expose the game's own ring writes. + // Keep that untouched — the game still expects buffer_ptr to be a + // skippable scratch area, and the bump still exposes any game-batched + // PM4 packets for the drain. + if buffer_ptr != 0 { + for i in 0..64u32 { + mem.write_u32(buffer_ptr + i * 4, xenia_gpu::pm4::make_packet_type2()); + } + } state.gpu.extend_write_ptr_by(64); - // Drain the exposed packets — the D3D9-init setup the game batched - // into the ring plus any leftovers. The synthetic `PM4_XE_SWAP` - // packet the prior code wrote at `buffer_ptr` is **not** written - // anymore; the drain's `ring.base + rptr*4` walk couldn't find it - // anyway (see the pre-M2b `swaps=0 with packets=512` failure mode). + // KRNBUG-Vd-04 / GPUBUG-001 / XMODBUG-013: write the canary PM4 + // sequence directly into the primary ring at the current WPTR (after + // the 64-dword WPTR bump above), then advance WPTR over our injection. 
+ // Mirrors xenia-canary VdSwap_entry (xboxkrnl_video.cc:438-521): + // 1) PM4_TYPE0(SHADER_CONSTANT_FETCH_00_0, 6) + 6 dwords of patched + // fetch header. + // 2) PM4_TYPE3(PM4_XE_SWAP, 4) + signature + frontbuffer_phys + W + H. + // The drain below picks these up via the natural CP path; the + // PM4_TYPE0 patches fetch-constant slot 0 with the frontbuffer's + // physical descriptor (Sylpheed's bloom/blur path samples this for + // frame N+1, so a stale slot 0 gives garbage). + let mut injected_dwords: u32 = 0; + if state.ring_base != 0 && state.ring_size_dwords != 0 { + const SHADER_CONSTANT_FETCH_00_0: u16 = 0x4800; + let packets: [u32; 12] = [ + xenia_gpu::pm4::make_packet_type0(SHADER_CONSTANT_FETCH_00_0, 6), + fetch_dwords[0], + fetch_dwords[1], + fetch_dwords[2], + fetch_dwords[3], + fetch_dwords[4], + fetch_dwords[5], + xenia_gpu::pm4::make_packet_type3(xenia_gpu::pm4::PM4_XE_SWAP, 4), + xenia_gpu::pm4::SWAP_SIGNATURE, + frontbuffer_addr, + width, + height, + ]; + // The MMIO WPTR is unmodulo'd; modulo by ring size to get the + // ring offset, then add base + offset*4 for the guest address. + let mmio_wptr = state.gpu.mmio_cp_rb_wptr_load(); + for (i, dword) in packets.iter().enumerate() { + let ofs = (mmio_wptr.wrapping_add(i as u32)) % state.ring_size_dwords; + mem.write_u32(state.ring_base.wrapping_add(ofs.wrapping_mul(4)), *dword); + } + state.gpu.extend_write_ptr_by(packets.len() as u32); + injected_dwords = packets.len() as u32; + } + + // Drain the exposed packets. The PM4_XE_SWAP we just wrote will be + // consumed by the existing handler at gpu_system.rs:1232, which calls + // `notify_xe_swap` itself — bumping `swaps_seen` and recording + // `last_swap` exactly as the old direct call did. Other packets in + // the slot (the fetch-constant patch, NOPs) update GPU state. // // M1.5: backend-aware drain. Inline: synchronous `sync_with_mmio + drain`. 
// Threaded: posts `DrainFence` + blocks on reply (1 s defensive timeout @@ -2345,11 +2408,19 @@ fn vd_swap(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) { let drained = state.gpu.drain_to_current_wptr(mem); tracing::debug!(drained, "VdSwap: drained PM4 packets"); - // 3) Fire the swap notification — bumps `swaps_seen`, records - // `last_swap`, enqueues an `InterruptSource::Swap` interrupt for - // the scheduler-round graphics callback path. M1.5: backend-aware; - // threaded sends `NotifyXeSwap` (fire-and-forget). - if frontbuffer_addr != 0 && width > 0 && height > 0 { + // Safety net: if the drain didn't consume PM4_XE_SWAP for any reason + // (ring address arithmetic edge case, threaded-backend timing), fall + // back to the direct notify so swaps don't go to zero. Idempotent — + // only fires when the PM4 path didn't bump the counter. + let post_swap_counter = state.gpu.digest_snapshot().stats.swaps_seen; + if post_swap_counter == pre_swap_counter + && frontbuffer_addr != 0 + && width > 0 + && height > 0 + { + tracing::warn!( + "VdSwap: PM4_XE_SWAP not consumed by drain (drained={drained}); falling back to direct notify" + ); state.gpu.notify_xe_swap(frontbuffer_addr, width, height); } @@ -2539,6 +2610,7 @@ fn vd_swap(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) { fmt = texture_format, cs = color_space, drained, + injected_dwords, buffer_ptr = format_args!("{buffer_ptr:#010x}"), fetch_ptr = format_args!("{fetch_ptr:#010x}"), "VdSwap complete" diff --git a/crates/xenia-kernel/src/state.rs b/crates/xenia-kernel/src/state.rs index c09dc63..04432f4 100644 --- a/crates/xenia-kernel/src/state.rs +++ b/crates/xenia-kernel/src/state.rs @@ -149,6 +149,12 @@ pub struct KernelState { /// `rtl_raise_exception` only emits once per run, regardless of how /// many subsequent throws fire. Reset on each fresh process start. pub cxx_throw_logged: bool, + /// Cached primary ring base/size, set during `VdInitializeRingBuffer`. 
+ /// Used by `vd_swap` (KRNBUG-Vd-04) so the kernel can write PM4 + /// packets directly into ring memory without going through the GPU + /// backend (which lives on the worker thread under `--gpu-thread`). + pub ring_base: u32, + pub ring_size_dwords: u32, } impl KernelState { @@ -195,6 +201,8 @@ impl KernelState { reservations, thunks_by_ordinal: HashMap::new(), cxx_throw_logged: false, + ring_base: 0, + ring_size_dwords: 0, }; crate::exports::register_exports(&mut state); crate::xam::register_exports(&mut state);