Merge audit-2026-05-fix/renderer-p0-vdswap-pm4: VdSwap PM4 ring path

This commit is contained in:
MechaCat02
2026-05-03 14:00:27 +02:00
4 changed files with 176 additions and 34 deletions

View File

@@ -358,6 +358,14 @@ impl GpuBackend {
.store(cur.wrapping_add(dwords), Ordering::Release); .store(cur.wrapping_add(dwords), Ordering::Release);
} }
/// Read the current ring write pointer from the shared MMIO atomic.
/// Acquire-load so any prior ring-memory writes the caller did before
/// asking are not reordered past this read by the compiler. Used by
/// `vd_swap` to compute the ring offset for direct PM4 injection.
pub fn mmio_cp_rb_wptr_load(&self) -> u32 {
self.mmio().cp_rb_wptr.load(Ordering::Acquire)
}
/// Drain any PM4 packets currently exposed by the ring (i.e., up to /// Drain any PM4 packets currently exposed by the ring (i.e., up to
/// the current `CP_RB_WPTR`). Inline mode runs the synchronous /// the current `CP_RB_WPTR`). Inline mode runs the synchronous
/// drain. Threaded mode posts a [`GpuCommand::DrainFence`] and blocks /// drain. Threaded mode posts a [`GpuCommand::DrainFence`] and blocks

View File

@@ -128,6 +128,27 @@ pub enum PacketKind {
}, },
} }
/// Build a Type-0 register-write packet header. Mirrors canary's
/// `MakePacketType0` at `xenia-canary/src/xenia/gpu/xenos.h:1682`.
/// `count` is the number of data dwords that follow (inclusive: 1..=0x4000).
pub fn make_packet_type0(reg_index: u16, count: u16) -> u32 {
debug_assert!(reg_index <= 0x7FFF);
debug_assert!(count >= 1 && count as u32 <= 0x4000);
(0u32 << 30) | (((count as u32 - 1) & 0x3FFF) << 16) | (reg_index as u32 & 0x7FFF)
}
/// Build a Type-2 NOP packet header. Single dword, no payload.
pub const fn make_packet_type2() -> u32 {
2u32 << 30
}
/// Build a Type-3 command packet header. Mirrors canary's `MakePacketType3`.
/// `count` is the number of data dwords that follow (inclusive: 1..=0x4000).
pub fn make_packet_type3(opcode: u8, count: u16) -> u32 {
debug_assert!(count >= 1 && count as u32 <= 0x4000);
(3u32 << 30) | (((count as u32 - 1) & 0x3FFF) << 16) | ((opcode as u32 & 0x7F) << 8)
}
/// Decode a single PM4 packet header. /// Decode a single PM4 packet header.
pub fn decode(header: u32) -> PacketHeader { pub fn decode(header: u32) -> PacketHeader {
match header >> 30 { match header >> 30 {
@@ -229,4 +250,37 @@ mod tests {
assert_eq!(type3_opcode_name(PM4_WAIT_REG_MEM), "WAIT_REG_MEM"); assert_eq!(type3_opcode_name(PM4_WAIT_REG_MEM), "WAIT_REG_MEM");
assert_eq!(type3_opcode_name(0xFE), "UNKNOWN"); assert_eq!(type3_opcode_name(0xFE), "UNKNOWN");
} }
#[test]
fn make_packet_helpers_round_trip_through_decode() {
// Type-0: SHADER_CONSTANT_FETCH_00_0 (0x4800), count=6.
let t0 = make_packet_type0(0x4800, 6);
match decode(t0).kind {
PacketKind::Type0 { base_index, count, write_one } => {
assert_eq!(base_index, 0x4800);
assert_eq!(count, 6);
assert!(!write_one);
}
other => panic!("expected Type0, got {other:?}"),
}
assert_eq!(decode(t0).total_dwords, 7);
// Type-3: PM4_XE_SWAP, count=4 (signature + addr + W + H).
let t3 = make_packet_type3(PM4_XE_SWAP, 4);
match decode(t3).kind {
PacketKind::Type3 { opcode, count, predicated } => {
assert_eq!(opcode, PM4_XE_SWAP);
assert_eq!(count, 4);
assert!(!predicated);
}
other => panic!("expected Type3, got {other:?}"),
}
assert_eq!(decode(t3).total_dwords, 5);
// Type-2: NOP.
let t2 = make_packet_type2();
assert_eq!(t2, 0x8000_0000);
assert_eq!(decode(t2).kind, PacketKind::Type2);
assert_eq!(decode(t2).total_dwords, 1);
}
} }

View File

@@ -2210,6 +2210,14 @@ fn vd_initialize_ring_buffer(ctx: &mut PpcContext, _mem: &GuestMemory, state: &m
let ptr = ctx.gpr[3] as u32; let ptr = ctx.gpr[3] as u32;
let size_log2 = ctx.gpr[4] as u32; let size_log2 = ctx.gpr[4] as u32;
state.gpu.initialize_ring_buffer(ptr, size_log2); state.gpu.initialize_ring_buffer(ptr, size_log2);
// Cache the ring layout on KernelState so `vd_swap` can write PM4
// packets directly into ring memory at the current WPTR (the GPU
// backend lives on a worker thread under `--gpu-thread` so we can't
// read its `ring.base` from the kernel side without a channel hop).
// Per canary: size_log2 is log2(size in BYTES), so size in dwords =
// 2^size_log2 / 4 = 1 << (size_log2 - 2).
state.ring_base = ptr;
state.ring_size_dwords = if size_log2 >= 2 { 1u32 << (size_log2 - 2) } else { 0 };
ctx.gpr[3] = 0; ctx.gpr[3] = 0;
} }
@@ -2287,15 +2295,20 @@ fn vd_swap(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
} else { } else {
(1280, 720) (1280, 720)
}; };
// The guest also writes the virtual frontbuffer address to *frontbuffer_ptr. // Translate frontbuffer virtual → physical. Per canary VdSwap_entry
// Prefer that when the fetch-derived address is zero. // (xenia-canary/src/xenia/kernel/xboxkrnl/xboxkrnl_video.cc:468-471),
let frontbuffer_addr = if frontbuffer_virt != 0 { // the GPU consumes physical addresses; the fetch header carries a
// virtual address. KRNBUG-Mm-04: our MmGetPhysicalAddress is a masked
// stub; a `virt & 0x1FFF_FFFF` is the equivalent translation today.
let phys_mask: u32 = 0x1FFF_FFFF;
let frontbuffer_addr_virt = if frontbuffer_virt != 0 {
frontbuffer_virt frontbuffer_virt
} else if frontbuffer_ptr != 0 { } else if frontbuffer_ptr != 0 {
mem.read_u32(frontbuffer_ptr) mem.read_u32(frontbuffer_ptr)
} else { } else {
0 0
}; };
let frontbuffer_addr = frontbuffer_addr_virt & phys_mask;
let texture_format = if texture_format_ptr != 0 { let texture_format = if texture_format_ptr != 0 {
mem.read_u32(texture_format_ptr) mem.read_u32(texture_format_ptr)
} else { } else {
@@ -2307,37 +2320,87 @@ fn vd_swap(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
0 0
}; };
// First-Pixels M2b — two-part commit path. // Patch the fetch header's base_address from virtual to physical so the
// // GPU's fetch-constant slot 0 sees the addressable frontbuffer. dword_1
// 1) Fill the guest's reserved 64-dword slot with PM4 Type-2 NOPs // high 20 bits hold base_address (>>12). Mirrors canary at
// (0x8000_0000). Some titles consume `buffer_ptr..+256` after // xenia-canary/src/xenia/kernel/xboxkrnl/xboxkrnl_video.cc:479.
// VdSwap returns and assume they're skippable. Matches the prior if frontbuffer_addr != 0 {
// behaviour. let base_phys_shifted = frontbuffer_addr >> 12;
if buffer_ptr != 0 { fetch_dwords[1] = (fetch_dwords[1] & 0x0000_0FFF) | (base_phys_shifted << 12);
for i in 0..64u32 {
mem.write_u32(buffer_ptr + i * 4, 0x8000_0000);
}
} }
// 2) Advance the ring's write pointer by 64 dwords (the slot the // KRNBUG-Vd-04 / GPUBUG-001 / XMODBUG-013: write a real PM4 sequence
// game "reserved" via VdSwap's buffer_ptr convention). Despite // into the guest's reserved 64-dword ring slot, then let the natural
// `buffer_ptr` being in the system command buffer rather than the // CP drain consume PM4_XE_SWAP. The pre-fix path filled the slot with
// primary ring, the 64-dword bump correctly exposes packets the // NOPs and called `notify_xe_swap` directly — bypassing the ring,
// game wrote into the primary ring since our last `sync_with_mmio`. // leaving the PM4_XE_SWAP handler dead code, and skipping the
// Empirically (pre-M2b) this path drained 512 packets through 1 B // fetch-constant-slot-0 patch that bloom/blur "sample frame N for
// guest instructions — the setup packets that D3D9-init writes. // frame N+1" paths depend on.
// //
// M1.5: bump routes through the shared MMIO atomic so both // Sequence per xenia-canary VdSwap_entry (xboxkrnl_video.cc:438-521):
// backends produce the same observable WPTR sequence. Inline // 1) PM4_TYPE0(SHADER_CONSTANT_FETCH_00_0=0x4800, count=6) + 6 dwords
// picks it up on its next `sync_with_mmio`; threaded's worker // (the patched fetch header).
// observes the same atomic. // 2) PM4_TYPE3(PM4_XE_SWAP, count=4) + signature + frontbuffer_phys
// + width + height.
// 3) PM4_TYPE2 NOP fill to slot end (64 dwords total).
let pre_swap_counter = state.gpu.digest_snapshot().stats.swaps_seen;
// The original M2b path zero-filled buffer_ptr (in the system command
// buffer) and bumped WPTR by 64 to expose the game's own ring writes.
// Keep that untouched — the game still expects buffer_ptr to be a
// skippable scratch area, and the bump still exposes any game-batched
// PM4 packets for the drain.
if buffer_ptr != 0 {
for i in 0..64u32 {
mem.write_u32(buffer_ptr + i * 4, xenia_gpu::pm4::make_packet_type2());
}
}
state.gpu.extend_write_ptr_by(64); state.gpu.extend_write_ptr_by(64);
// Drain the exposed packets — the D3D9-init setup the game batched // KRNBUG-Vd-04 / GPUBUG-001 / XMODBUG-013: write the canary PM4
// into the ring plus any leftovers. The synthetic `PM4_XE_SWAP` // sequence directly into the primary ring at the current WPTR (after
// packet the prior code wrote at `buffer_ptr` is **not** written // the buffer_ptr bump above), then advance WPTR over our injection.
// anymore; the drain's `ring.base + rptr*4` walk couldn't find it // Mirrors xenia-canary VdSwap_entry (xboxkrnl_video.cc:438-521):
// anyway (see the pre-M2b `swaps=0 with packets=512` failure mode). // 1) PM4_TYPE0(SHADER_CONSTANT_FETCH_00_0, 6) + 6 dwords of patched
// fetch header.
// 2) PM4_TYPE3(PM4_XE_SWAP, 4) + signature + frontbuffer_phys + W + H.
// The drain below picks these up via the natural CP path; the
// PM4_TYPE0 patches fetch-constant slot 0 with the frontbuffer's
// physical descriptor (Sylpheed's bloom/blur path samples this for
// frame N+1, so a stale slot 0 gives garbage).
let mut injected_dwords: u32 = 0;
if state.ring_base != 0 && state.ring_size_dwords != 0 {
const SHADER_CONSTANT_FETCH_00_0: u16 = 0x4800;
let packets: [u32; 12] = [
xenia_gpu::pm4::make_packet_type0(SHADER_CONSTANT_FETCH_00_0, 6),
fetch_dwords[0],
fetch_dwords[1],
fetch_dwords[2],
fetch_dwords[3],
fetch_dwords[4],
fetch_dwords[5],
xenia_gpu::pm4::make_packet_type3(xenia_gpu::pm4::PM4_XE_SWAP, 4),
xenia_gpu::pm4::SWAP_SIGNATURE,
frontbuffer_addr,
width,
height,
];
// The MMIO WPTR is unmodulo'd; modulo by ring size to get the
// ring offset, then add base + offset*4 for the guest address.
let mmio_wptr = state.gpu.mmio_cp_rb_wptr_load();
for (i, dword) in packets.iter().enumerate() {
let ofs = (mmio_wptr.wrapping_add(i as u32)) % state.ring_size_dwords;
mem.write_u32(state.ring_base.wrapping_add(ofs.wrapping_mul(4)), *dword);
}
state.gpu.extend_write_ptr_by(packets.len() as u32);
injected_dwords = packets.len() as u32;
}
// Drain the exposed packets. The PM4_XE_SWAP we just wrote will be
// consumed by the existing handler at gpu_system.rs:1232, which calls
// `notify_xe_swap` itself — bumping `swaps_seen` and recording
// `last_swap` exactly as the old direct call did. Other packets in
// the slot (the fetch-constant patch, NOPs) update GPU state.
// //
// M1.5: backend-aware drain. Inline: synchronous `sync_with_mmio + drain`. // M1.5: backend-aware drain. Inline: synchronous `sync_with_mmio + drain`.
// Threaded: posts `DrainFence` + blocks on reply (1 s defensive timeout // Threaded: posts `DrainFence` + blocks on reply (1 s defensive timeout
@@ -2345,11 +2408,19 @@ fn vd_swap(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
let drained = state.gpu.drain_to_current_wptr(mem); let drained = state.gpu.drain_to_current_wptr(mem);
tracing::debug!(drained, "VdSwap: drained PM4 packets"); tracing::debug!(drained, "VdSwap: drained PM4 packets");
// 3) Fire the swap notification — bumps `swaps_seen`, records // Safety net: if the drain didn't consume PM4_XE_SWAP for any reason
// `last_swap`, enqueues an `InterruptSource::Swap` interrupt for // (ring address arithmetic edge case, threaded-backend timing), fall
// the scheduler-round graphics callback path. M1.5: backend-aware; // back to the direct notify so swaps don't go to zero. Idempotent —
// threaded sends `NotifyXeSwap` (fire-and-forget). // only fires when the PM4 path didn't bump the counter.
if frontbuffer_addr != 0 && width > 0 && height > 0 { let post_swap_counter = state.gpu.digest_snapshot().stats.swaps_seen;
if post_swap_counter == pre_swap_counter
&& frontbuffer_addr != 0
&& width > 0
&& height > 0
{
tracing::warn!(
"VdSwap: PM4_XE_SWAP not consumed by drain (drained={drained}); falling back to direct notify"
);
state.gpu.notify_xe_swap(frontbuffer_addr, width, height); state.gpu.notify_xe_swap(frontbuffer_addr, width, height);
} }
@@ -2539,6 +2610,7 @@ fn vd_swap(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
fmt = texture_format, fmt = texture_format,
cs = color_space, cs = color_space,
drained, drained,
injected_dwords,
buffer_ptr = format_args!("{buffer_ptr:#010x}"), buffer_ptr = format_args!("{buffer_ptr:#010x}"),
fetch_ptr = format_args!("{fetch_ptr:#010x}"), fetch_ptr = format_args!("{fetch_ptr:#010x}"),
"VdSwap complete" "VdSwap complete"

View File

@@ -149,6 +149,12 @@ pub struct KernelState {
/// `rtl_raise_exception` only emits once per run, regardless of how /// `rtl_raise_exception` only emits once per run, regardless of how
/// many subsequent throws fire. Reset on each fresh process start. /// many subsequent throws fire. Reset on each fresh process start.
pub cxx_throw_logged: bool, pub cxx_throw_logged: bool,
/// Cached primary ring base/size, set during `VdInitializeRingBuffer`.
/// Used by `vd_swap` (KRNBUG-Vd-04) so the kernel can write PM4
/// packets directly into ring memory without going through the GPU
/// backend (which lives on the worker thread under `--gpu-thread`).
pub ring_base: u32,
pub ring_size_dwords: u32,
} }
impl KernelState { impl KernelState {
@@ -195,6 +201,8 @@ impl KernelState {
reservations, reservations,
thunks_by_ordinal: HashMap::new(), thunks_by_ordinal: HashMap::new(),
cxx_throw_logged: false, cxx_throw_logged: false,
ring_base: 0,
ring_size_dwords: 0,
}; };
crate::exports::register_exports(&mut state); crate::exports::register_exports(&mut state);
crate::xam::register_exports(&mut state); crate::xam::register_exports(&mut state);