Merge audit-2026-05-fix/renderer-p0-vdswap-pm4: VdSwap PM4 ring path
This commit is contained in:
@@ -358,6 +358,14 @@ impl GpuBackend {
|
|||||||
.store(cur.wrapping_add(dwords), Ordering::Release);
|
.store(cur.wrapping_add(dwords), Ordering::Release);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Read the current ring write pointer from the shared MMIO atomic.
|
||||||
|
/// Loads with `Ordering::Acquire`, so memory accesses issued after this read
|
||||||
|
/// cannot be reordered before it; the caller's earlier writes are not ordered. Used by
|
||||||
|
/// `vd_swap` to compute the ring offset for direct PM4 injection.
|
||||||
|
pub fn mmio_cp_rb_wptr_load(&self) -> u32 {
|
||||||
|
self.mmio().cp_rb_wptr.load(Ordering::Acquire)
|
||||||
|
}
|
||||||
|
|
||||||
/// Drain any PM4 packets currently exposed by the ring (i.e., up to
|
/// Drain any PM4 packets currently exposed by the ring (i.e., up to
|
||||||
/// the current `CP_RB_WPTR`). Inline mode runs the synchronous
|
/// the current `CP_RB_WPTR`). Inline mode runs the synchronous
|
||||||
/// drain. Threaded mode posts a [`GpuCommand::DrainFence`] and blocks
|
/// drain. Threaded mode posts a [`GpuCommand::DrainFence`] and blocks
|
||||||
|
|||||||
@@ -128,6 +128,27 @@ pub enum PacketKind {
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Build a Type-0 register-write packet header. Mirrors canary's
/// `MakePacketType0` at `xenia-canary/src/xenia/gpu/xenos.h:1682`.
///
/// Header layout produced here:
/// * bits 31..30 — packet type, `0`
/// * bits 29..16 — `count - 1`
/// * bits 14..0  — `reg_index` (bit 15 is always left clear)
///
/// `reg_index` is the first register index and must fit in 15 bits.
/// `count` is the number of data dwords that follow (inclusive: 1..=0x4000).
///
/// This is a `const fn`, like `make_packet_type2`, so headers can be built
/// in constant contexts.
///
/// # Panics
///
/// With debug assertions enabled, panics when `reg_index > 0x7FFF` or when
/// `count` is outside `1..=0x4000`. With debug assertions disabled the
/// arguments are masked instead of rejected; in particular `count == 0`
/// wraps and encodes the maximum count field (`0x3FFF`).
pub const fn make_packet_type0(reg_index: u16, count: u16) -> u32 {
    debug_assert!(reg_index <= 0x7FFF);
    debug_assert!(count >= 1 && count as u32 <= 0x4000);
    // The count field stores `count - 1`. `wrapping_sub` spells out what an
    // unchecked build does for the (invalid) `count == 0` case.
    let count_field = (count as u32).wrapping_sub(1) & 0x3FFF;
    // The packet-type bits are zero for Type-0, so nothing is OR-ed in for them.
    (count_field << 16) | (reg_index as u32 & 0x7FFF)
}
|
||||||
|
|
||||||
|
/// Header dword for a Type-2 (NOP) packet.
///
/// The packet is the header alone: the type tag `2` sits in bits 31..30 and
/// every other bit is zero, so the result is always `0x8000_0000`.
pub const fn make_packet_type2() -> u32 {
    const PACKET_TYPE: u32 = 2;
    PACKET_TYPE << 30
}
|
||||||
|
|
||||||
|
/// Build a Type-3 command packet header. Mirrors canary's `MakePacketType3`.
///
/// Header layout produced here:
/// * bits 31..30 — packet type, `3`
/// * bits 29..16 — `count - 1`
/// * bits 14..8  — `opcode`, masked to its low 7 bits
/// * bits 7..0   — always zero
///
/// `count` is the number of data dwords that follow (inclusive: 1..=0x4000).
///
/// This is a `const fn`, like `make_packet_type2`, so headers can be built
/// in constant contexts.
///
/// # Panics
///
/// With debug assertions enabled, panics when `count` is outside
/// `1..=0x4000`. With debug assertions disabled `count == 0` wraps and
/// encodes the maximum count field (`0x3FFF`). An `opcode` above `0x7F` is
/// never rejected; its top bit is silently dropped.
pub const fn make_packet_type3(opcode: u8, count: u16) -> u32 {
    debug_assert!(count >= 1 && count as u32 <= 0x4000);
    // The count field stores `count - 1`. `wrapping_sub` spells out what an
    // unchecked build does for the (invalid) `count == 0` case.
    let count_field = (count as u32).wrapping_sub(1) & 0x3FFF;
    let opcode_field = opcode as u32 & 0x7F;
    (3u32 << 30) | (count_field << 16) | (opcode_field << 8)
}
|
||||||
|
|
||||||
/// Decode a single PM4 packet header.
|
/// Decode a single PM4 packet header.
|
||||||
pub fn decode(header: u32) -> PacketHeader {
|
pub fn decode(header: u32) -> PacketHeader {
|
||||||
match header >> 30 {
|
match header >> 30 {
|
||||||
@@ -229,4 +250,37 @@ mod tests {
|
|||||||
assert_eq!(type3_opcode_name(PM4_WAIT_REG_MEM), "WAIT_REG_MEM");
|
assert_eq!(type3_opcode_name(PM4_WAIT_REG_MEM), "WAIT_REG_MEM");
|
||||||
assert_eq!(type3_opcode_name(0xFE), "UNKNOWN");
|
assert_eq!(type3_opcode_name(0xFE), "UNKNOWN");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
fn make_packet_helpers_round_trip_through_decode() {
    // Each header built by a `make_packet_*` helper is fed back through
    // `decode`; the decoded fields and total packet length (header plus
    // payload dwords) must match what was requested.

    // Type-0: SHADER_CONSTANT_FETCH_00_0 (0x4800), count=6.
    let type0_header = make_packet_type0(0x4800, 6);
    match decode(type0_header).kind {
        PacketKind::Type0 { base_index, count, write_one } => {
            assert_eq!((base_index, count, write_one), (0x4800, 6, false));
        }
        other => panic!("expected Type0, got {other:?}"),
    }
    // One header dword plus six payload dwords.
    assert_eq!(decode(type0_header).total_dwords, 7);

    // Type-3: PM4_XE_SWAP, count=4 (signature + addr + W + H).
    let type3_header = make_packet_type3(PM4_XE_SWAP, 4);
    match decode(type3_header).kind {
        PacketKind::Type3 { opcode, count, predicated } => {
            assert_eq!((opcode, count, predicated), (PM4_XE_SWAP, 4, false));
        }
        other => panic!("expected Type3, got {other:?}"),
    }
    // One header dword plus four payload dwords.
    assert_eq!(decode(type3_header).total_dwords, 5);

    // Type-2: NOP, a single header dword with no payload.
    let nop_header = make_packet_type2();
    assert_eq!(nop_header, 0x8000_0000);
    let decoded_nop = decode(nop_header);
    assert_eq!(decoded_nop.kind, PacketKind::Type2);
    assert_eq!(decoded_nop.total_dwords, 1);
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -2210,6 +2210,14 @@ fn vd_initialize_ring_buffer(ctx: &mut PpcContext, _mem: &GuestMemory, state: &m
|
|||||||
let ptr = ctx.gpr[3] as u32;
|
let ptr = ctx.gpr[3] as u32;
|
||||||
let size_log2 = ctx.gpr[4] as u32;
|
let size_log2 = ctx.gpr[4] as u32;
|
||||||
state.gpu.initialize_ring_buffer(ptr, size_log2);
|
state.gpu.initialize_ring_buffer(ptr, size_log2);
|
||||||
|
// Cache the ring layout on KernelState so `vd_swap` can write PM4
|
||||||
|
// packets directly into ring memory at the current WPTR (the GPU
|
||||||
|
// backend lives on a worker thread under `--gpu-thread` so we can't
|
||||||
|
// read its `ring.base` from the kernel side without a channel hop).
|
||||||
|
// Per canary: size_log2 is log2(size in BYTES), so size in dwords =
|
||||||
|
// 2^size_log2 / 4 = 1 << (size_log2 - 2).
|
||||||
|
state.ring_base = ptr;
|
||||||
|
state.ring_size_dwords = if size_log2 >= 2 { 1u32 << (size_log2 - 2) } else { 0 };
|
||||||
ctx.gpr[3] = 0;
|
ctx.gpr[3] = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -2287,15 +2295,20 @@ fn vd_swap(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
|
|||||||
} else {
|
} else {
|
||||||
(1280, 720)
|
(1280, 720)
|
||||||
};
|
};
|
||||||
// The guest also writes the virtual frontbuffer address to *frontbuffer_ptr.
|
// Translate frontbuffer virtual → physical. Per canary VdSwap_entry
|
||||||
// Prefer that when the fetch-derived address is zero.
|
// (xenia-canary/src/xenia/kernel/xboxkrnl/xboxkrnl_video.cc:468-471),
|
||||||
let frontbuffer_addr = if frontbuffer_virt != 0 {
|
// the GPU consumes physical addresses; the fetch header carries a
|
||||||
|
// virtual address. KRNBUG-Mm-04: our MmGetPhysicalAddress is a masked
|
||||||
|
// stub; a `virt & 0x1FFF_FFFF` is the equivalent translation today.
|
||||||
|
let phys_mask: u32 = 0x1FFF_FFFF;
|
||||||
|
let frontbuffer_addr_virt = if frontbuffer_virt != 0 {
|
||||||
frontbuffer_virt
|
frontbuffer_virt
|
||||||
} else if frontbuffer_ptr != 0 {
|
} else if frontbuffer_ptr != 0 {
|
||||||
mem.read_u32(frontbuffer_ptr)
|
mem.read_u32(frontbuffer_ptr)
|
||||||
} else {
|
} else {
|
||||||
0
|
0
|
||||||
};
|
};
|
||||||
|
let frontbuffer_addr = frontbuffer_addr_virt & phys_mask;
|
||||||
let texture_format = if texture_format_ptr != 0 {
|
let texture_format = if texture_format_ptr != 0 {
|
||||||
mem.read_u32(texture_format_ptr)
|
mem.read_u32(texture_format_ptr)
|
||||||
} else {
|
} else {
|
||||||
@@ -2307,37 +2320,87 @@ fn vd_swap(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
|
|||||||
0
|
0
|
||||||
};
|
};
|
||||||
|
|
||||||
// First-Pixels M2b — two-part commit path.
|
// Patch the fetch header's base_address from virtual to physical so the
|
||||||
//
|
// GPU's fetch-constant slot 0 sees the addressable frontbuffer. dword_1
|
||||||
// 1) Fill the guest's reserved 64-dword slot with PM4 Type-2 NOPs
|
// high 20 bits hold base_address (>>12). Mirrors canary at
|
||||||
// (0x8000_0000). Some titles consume `buffer_ptr..+256` after
|
// xenia-canary/src/xenia/kernel/xboxkrnl/xboxkrnl_video.cc:479.
|
||||||
// VdSwap returns and assume they're skippable. Matches the prior
|
if frontbuffer_addr != 0 {
|
||||||
// behaviour.
|
let base_phys_shifted = frontbuffer_addr >> 12;
|
||||||
if buffer_ptr != 0 {
|
fetch_dwords[1] = (fetch_dwords[1] & 0x0000_0FFF) | (base_phys_shifted << 12);
|
||||||
for i in 0..64u32 {
|
|
||||||
mem.write_u32(buffer_ptr + i * 4, 0x8000_0000);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// 2) Advance the ring's write pointer by 64 dwords (the slot the
|
// KRNBUG-Vd-04 / GPUBUG-001 / XMODBUG-013: write a real PM4 sequence
|
||||||
// game "reserved" via VdSwap's buffer_ptr convention). Despite
|
// into the guest's reserved 64-dword ring slot, then let the natural
|
||||||
// `buffer_ptr` being in the system command buffer rather than the
|
// CP drain consume PM4_XE_SWAP. The pre-fix path filled the slot with
|
||||||
// primary ring, the 64-dword bump correctly exposes packets the
|
// NOPs and called `notify_xe_swap` directly — bypassing the ring,
|
||||||
// game wrote into the primary ring since our last `sync_with_mmio`.
|
// leaving the PM4_XE_SWAP handler dead code, and skipping the
|
||||||
// Empirically (pre-M2b) this path drained 512 packets through 1 B
|
// fetch-constant-slot-0 patch that bloom/blur "sample frame N for
|
||||||
// guest instructions — the setup packets that D3D9-init writes.
|
// frame N+1" paths depend on.
|
||||||
//
|
//
|
||||||
// M1.5: bump routes through the shared MMIO atomic so both
|
// Sequence per xenia-canary VdSwap_entry (xboxkrnl_video.cc:438-521):
|
||||||
// backends produce the same observable WPTR sequence. Inline
|
// 1) PM4_TYPE0(SHADER_CONSTANT_FETCH_00_0=0x4800, count=6) + 6 dwords
|
||||||
// picks it up on its next `sync_with_mmio`; threaded's worker
|
// (the patched fetch header).
|
||||||
// observes the same atomic.
|
// 2) PM4_TYPE3(PM4_XE_SWAP, count=4) + signature + frontbuffer_phys
|
||||||
|
// + width + height.
|
||||||
|
// 3) PM4_TYPE2 NOP fill to slot end (64 dwords total).
|
||||||
|
let pre_swap_counter = state.gpu.digest_snapshot().stats.swaps_seen;
|
||||||
|
|
||||||
|
// The original M2b path NOP-filled buffer_ptr (in the system command
|
||||||
|
// buffer) and bumped WPTR by 64 to expose the game's own ring writes.
|
||||||
|
// Keep that untouched — the game still expects buffer_ptr to be a
|
||||||
|
// skippable scratch area, and the bump still exposes any game-batched
|
||||||
|
// PM4 packets for the drain.
|
||||||
|
if buffer_ptr != 0 {
|
||||||
|
for i in 0..64u32 {
|
||||||
|
mem.write_u32(buffer_ptr + i * 4, xenia_gpu::pm4::make_packet_type2());
|
||||||
|
}
|
||||||
|
}
|
||||||
state.gpu.extend_write_ptr_by(64);
|
state.gpu.extend_write_ptr_by(64);
|
||||||
|
|
||||||
// Drain the exposed packets — the D3D9-init setup the game batched
|
// KRNBUG-Vd-04 / GPUBUG-001 / XMODBUG-013: write the canary PM4
|
||||||
// into the ring plus any leftovers. The synthetic `PM4_XE_SWAP`
|
// sequence directly into the primary ring at the current WPTR (after
|
||||||
// packet the prior code wrote at `buffer_ptr` is **not** written
|
// the buffer_ptr bump above), then advance WPTR over our injection.
|
||||||
// anymore; the drain's `ring.base + rptr*4` walk couldn't find it
|
// Mirrors xenia-canary VdSwap_entry (xboxkrnl_video.cc:438-521):
|
||||||
// anyway (see the pre-M2b `swaps=0 with packets=512` failure mode).
|
// 1) PM4_TYPE0(SHADER_CONSTANT_FETCH_00_0, 6) + 6 dwords of patched
|
||||||
|
// fetch header.
|
||||||
|
// 2) PM4_TYPE3(PM4_XE_SWAP, 4) + signature + frontbuffer_phys + W + H.
|
||||||
|
// The drain below picks these up via the natural CP path; the
|
||||||
|
// PM4_TYPE0 patches fetch-constant slot 0 with the frontbuffer's
|
||||||
|
// physical descriptor (Sylpheed's bloom/blur path samples this for
|
||||||
|
// frame N+1, so a stale slot 0 gives garbage).
|
||||||
|
let mut injected_dwords: u32 = 0;
|
||||||
|
if state.ring_base != 0 && state.ring_size_dwords != 0 {
|
||||||
|
const SHADER_CONSTANT_FETCH_00_0: u16 = 0x4800;
|
||||||
|
let packets: [u32; 12] = [
|
||||||
|
xenia_gpu::pm4::make_packet_type0(SHADER_CONSTANT_FETCH_00_0, 6),
|
||||||
|
fetch_dwords[0],
|
||||||
|
fetch_dwords[1],
|
||||||
|
fetch_dwords[2],
|
||||||
|
fetch_dwords[3],
|
||||||
|
fetch_dwords[4],
|
||||||
|
fetch_dwords[5],
|
||||||
|
xenia_gpu::pm4::make_packet_type3(xenia_gpu::pm4::PM4_XE_SWAP, 4),
|
||||||
|
xenia_gpu::pm4::SWAP_SIGNATURE,
|
||||||
|
frontbuffer_addr,
|
||||||
|
width,
|
||||||
|
height,
|
||||||
|
];
|
||||||
|
// The MMIO WPTR is unmodulo'd; modulo by ring size to get the
|
||||||
|
// ring offset, then add base + offset*4 for the guest address.
|
||||||
|
let mmio_wptr = state.gpu.mmio_cp_rb_wptr_load();
|
||||||
|
for (i, dword) in packets.iter().enumerate() {
|
||||||
|
let ofs = (mmio_wptr.wrapping_add(i as u32)) % state.ring_size_dwords;
|
||||||
|
mem.write_u32(state.ring_base.wrapping_add(ofs.wrapping_mul(4)), *dword);
|
||||||
|
}
|
||||||
|
state.gpu.extend_write_ptr_by(packets.len() as u32);
|
||||||
|
injected_dwords = packets.len() as u32;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Drain the exposed packets. The PM4_XE_SWAP we just wrote will be
|
||||||
|
// consumed by the existing handler at gpu_system.rs:1232, which calls
|
||||||
|
// `notify_xe_swap` itself — bumping `swaps_seen` and recording
|
||||||
|
// `last_swap` exactly as the old direct call did. Other packets in
|
||||||
|
// the slot (the fetch-constant patch, NOPs) update GPU state.
|
||||||
//
|
//
|
||||||
// M1.5: backend-aware drain. Inline: synchronous `sync_with_mmio + drain`.
|
// M1.5: backend-aware drain. Inline: synchronous `sync_with_mmio + drain`.
|
||||||
// Threaded: posts `DrainFence` + blocks on reply (1 s defensive timeout
|
// Threaded: posts `DrainFence` + blocks on reply (1 s defensive timeout
|
||||||
@@ -2345,11 +2408,19 @@ fn vd_swap(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
|
|||||||
let drained = state.gpu.drain_to_current_wptr(mem);
|
let drained = state.gpu.drain_to_current_wptr(mem);
|
||||||
tracing::debug!(drained, "VdSwap: drained PM4 packets");
|
tracing::debug!(drained, "VdSwap: drained PM4 packets");
|
||||||
|
|
||||||
// 3) Fire the swap notification — bumps `swaps_seen`, records
|
// Safety net: if the drain didn't consume PM4_XE_SWAP for any reason
|
||||||
// `last_swap`, enqueues an `InterruptSource::Swap` interrupt for
|
// (ring address arithmetic edge case, threaded-backend timing), fall
|
||||||
// the scheduler-round graphics callback path. M1.5: backend-aware;
|
// back to the direct notify so swaps don't go to zero. Idempotent —
|
||||||
// threaded sends `NotifyXeSwap` (fire-and-forget).
|
// only fires when the PM4 path didn't bump the counter.
|
||||||
if frontbuffer_addr != 0 && width > 0 && height > 0 {
|
let post_swap_counter = state.gpu.digest_snapshot().stats.swaps_seen;
|
||||||
|
if post_swap_counter == pre_swap_counter
|
||||||
|
&& frontbuffer_addr != 0
|
||||||
|
&& width > 0
|
||||||
|
&& height > 0
|
||||||
|
{
|
||||||
|
tracing::warn!(
|
||||||
|
"VdSwap: PM4_XE_SWAP not consumed by drain (drained={drained}); falling back to direct notify"
|
||||||
|
);
|
||||||
state.gpu.notify_xe_swap(frontbuffer_addr, width, height);
|
state.gpu.notify_xe_swap(frontbuffer_addr, width, height);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -2539,6 +2610,7 @@ fn vd_swap(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
|
|||||||
fmt = texture_format,
|
fmt = texture_format,
|
||||||
cs = color_space,
|
cs = color_space,
|
||||||
drained,
|
drained,
|
||||||
|
injected_dwords,
|
||||||
buffer_ptr = format_args!("{buffer_ptr:#010x}"),
|
buffer_ptr = format_args!("{buffer_ptr:#010x}"),
|
||||||
fetch_ptr = format_args!("{fetch_ptr:#010x}"),
|
fetch_ptr = format_args!("{fetch_ptr:#010x}"),
|
||||||
"VdSwap complete"
|
"VdSwap complete"
|
||||||
|
|||||||
@@ -149,6 +149,12 @@ pub struct KernelState {
|
|||||||
/// `rtl_raise_exception` only emits once per run, regardless of how
|
/// `rtl_raise_exception` only emits once per run, regardless of how
|
||||||
/// many subsequent throws fire. Reset on each fresh process start.
|
/// many subsequent throws fire. Reset on each fresh process start.
|
||||||
pub cxx_throw_logged: bool,
|
pub cxx_throw_logged: bool,
|
||||||
|
/// Cached primary ring base/size, set during `VdInitializeRingBuffer`.
|
||||||
|
/// Used by `vd_swap` (KRNBUG-Vd-04) so the kernel can write PM4
|
||||||
|
/// packets directly into ring memory without going through the GPU
|
||||||
|
/// backend (which lives on the worker thread under `--gpu-thread`).
|
||||||
|
pub ring_base: u32,
|
||||||
|
pub ring_size_dwords: u32,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl KernelState {
|
impl KernelState {
|
||||||
@@ -195,6 +201,8 @@ impl KernelState {
|
|||||||
reservations,
|
reservations,
|
||||||
thunks_by_ordinal: HashMap::new(),
|
thunks_by_ordinal: HashMap::new(),
|
||||||
cxx_throw_logged: false,
|
cxx_throw_logged: false,
|
||||||
|
ring_base: 0,
|
||||||
|
ring_size_dwords: 0,
|
||||||
};
|
};
|
||||||
crate::exports::register_exports(&mut state);
|
crate::exports::register_exports(&mut state);
|
||||||
crate::xam::register_exports(&mut state);
|
crate::xam::register_exports(&mut state);
|
||||||
|
|||||||
Reference in New Issue
Block a user