Compare commits
10 Commits
iterate-2K
...
iterate-3M
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
6bb4355e3d | ||
|
|
3f5d5cf5f7 | ||
|
|
2f55d1fd7d | ||
|
|
a91f4c550b | ||
|
|
66bd805726 | ||
|
|
ad9c8e4cb8 | ||
|
|
873c197ff1 | ||
|
|
1ae472bd2b | ||
|
|
034ec8b47f | ||
|
|
93f60a3ba0 |
@@ -1540,8 +1540,19 @@ fn cmd_exec_inner(
|
||||
mem.write_u32(addr, block);
|
||||
}
|
||||
("xboxkrnl.exe", 0x01BE) => {
|
||||
// VdGlobalDevice — passed through to Vd* shims. Write 0.
|
||||
mem.write_u32(addr, 0);
|
||||
// VdGlobalDevice — a *pointer to* a global D3D-device cell.
|
||||
// Mirror xenia-canary RegisterVideoExports (xboxkrnl_video.cc:
|
||||
// 557-564): allocate a 4-byte cell, point the import slot at
|
||||
// it, and zero the cell. The guest's graphics init then stores
|
||||
// its device object INTO the cell (e.g. sub_824C6DC0 @
|
||||
// 0x824C6F18 `stw r31, 0([0x82000750])`), and the swap-complete
|
||||
// callback sub_824CE2B8 reads it back via the two-level
|
||||
// `[[VdGlobalDevice]+0]+15160` to bump the swap counter (clock
|
||||
// B). Writing 0 directly here (the old behaviour) made that
|
||||
// store land at address 0 and the swap counter never advance —
|
||||
// freezing the title-loop's per-frame manager update.
|
||||
let cell = alloc_zero(0x4, &mut mem, &mut kernel);
|
||||
mem.write_u32(addr, cell);
|
||||
}
|
||||
("xboxkrnl.exe", 0x01C0) => {
|
||||
// VdGpuClockInMHz
|
||||
@@ -2327,10 +2338,22 @@ fn coord_post_round(
|
||||
}
|
||||
|
||||
if kernel.gpu.has_pending_interrupts() {
|
||||
for _pi in kernel.gpu.take_pending_interrupts() {
|
||||
for pi in kernel.gpu.take_pending_interrupts() {
|
||||
// Canary `ExecutePacketType3_INTERRUPT` dispatches the callback
|
||||
// once per set bit of `cpu_mask` with that bit's index as the
|
||||
// target CPU (`DispatchInterruptCallback(1, n)`). The guest's
|
||||
// swap-acknowledge fence stores `cpu_mask`, and the ISR clears
|
||||
// `1 << current_cpu` from it — so the ISR must run impersonating
|
||||
// the masked CPU or the fence never reaches 0. Sylpheed uses a
|
||||
// single-bit mask (`0x4` → CPU 2); take the lowest set bit.
|
||||
let cpu = if pi.cpu_mask == 0 {
|
||||
xenia_kernel::interrupts::VSYNC_TARGET_CPU
|
||||
} else {
|
||||
pi.cpu_mask.trailing_zeros().min(5) as u8
|
||||
};
|
||||
kernel
|
||||
.interrupts
|
||||
.queue_interrupt(xenia_kernel::INTERRUPT_SOURCE_CP);
|
||||
.queue_interrupt(xenia_kernel::INTERRUPT_SOURCE_CP, cpu);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3534,7 +3557,17 @@ fn dispatch_graphics_interrupts(
|
||||
None
|
||||
};
|
||||
|
||||
/// X_KPCR offset of `prcb_data.current_cpu` (canary `xthread.cc`
|
||||
/// `SetActiveCpu` → `pcr.prcb_data.current_cpu`). The guest graphics
|
||||
/// ISR reads it via `lbz r10, 268(r13)` to decide which per-CPU bit of
|
||||
/// the swap-acknowledge fence to clear.
|
||||
const PCR_CURRENT_CPU_OFF: u32 = 268;
|
||||
|
||||
while let Some(source) = kernel.interrupts.peek_next() {
|
||||
let target_cpu = kernel
|
||||
.interrupts
|
||||
.peek_next_cpu()
|
||||
.unwrap_or(xenia_kernel::interrupts::VSYNC_TARGET_CPU);
|
||||
// Victim selection: Ready first, then Blocked (canary's
|
||||
// `XThread::GetCurrentThread()` analog — any live thread will
|
||||
// do for borrowing context). Skip Idle/Exited/ServicingIrq.
|
||||
@@ -3604,6 +3637,19 @@ fn dispatch_graphics_interrupts(
|
||||
saved
|
||||
};
|
||||
|
||||
// Impersonate the interrupt's target CPU on the borrowed thread's
|
||||
// PCR, mirroring canary `EmulateCPInterruptDPC` →
|
||||
// `XThread::SetActiveCpu(cpu)`. The guest swap-complete ISR clears
|
||||
// `1 << [pcr.current_cpu]` from the per-present swap-acknowledge
|
||||
// fence; if it runs on the wrong CPU it clears the wrong bit and
|
||||
// the GPU's trailing `WAIT_REG_MEM` on that fence never releases —
|
||||
// stranding the present/title loop. Save/restore so borrowing a
|
||||
// thread doesn't permanently rewrite its processor number.
|
||||
let pcr_addr = (kernel.scheduler.ctx_mut_ref(target_ref).gpr[13] as u32)
|
||||
.wrapping_add(PCR_CURRENT_CPU_OFF);
|
||||
let saved_cpu = mem.read_u8(pcr_addr);
|
||||
mem.write_u8(pcr_addr, target_cpu);
|
||||
|
||||
// Stash the previous `scheduler.current` (call_export reaches
|
||||
// it; imports the ISR calls must dispatch on the borrowed
|
||||
// thread). Restore on the way out.
|
||||
@@ -3696,6 +3742,7 @@ fn dispatch_graphics_interrupts(
|
||||
|
||||
// Restore the borrowed context.
|
||||
saved.restore(kernel.scheduler.ctx_mut_ref(target_ref));
|
||||
mem.write_u8(pcr_addr, saved_cpu);
|
||||
kernel.scheduler.current = prev_current;
|
||||
kernel.interrupts.delivered += 1;
|
||||
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
{
|
||||
"instructions": 50000000,
|
||||
"imports": 339766,
|
||||
"instructions": 50000014,
|
||||
"imports": 352251,
|
||||
"unimpl": 0,
|
||||
"draws": 0,
|
||||
"swaps": 2,
|
||||
"unique_render_targets": 0,
|
||||
"shader_blobs_live": 0,
|
||||
"texture_cache_entries": 0
|
||||
"draws": 718,
|
||||
"swaps": 147,
|
||||
"unique_render_targets": 2,
|
||||
"shader_blobs_live": 6,
|
||||
"texture_cache_entries": 1
|
||||
}
|
||||
|
||||
@@ -57,6 +57,16 @@ fn run_oracle(label: &str, max_instr: u64, golden_rel: &str) {
|
||||
&iso,
|
||||
"-n",
|
||||
&max_instr_str,
|
||||
// Pin the inline (single-threaded) GPU backend. The default
|
||||
// threaded backend drains the ring on a separate host thread,
|
||||
// so the exact instruction at which a CP interrupt is queued —
|
||||
// and therefore when the guest's swap-complete ISR callback runs
|
||||
// (iterate-2S armed it via SCRATCH_REG writeback) — varies run to
|
||||
// run. Inline draining is instruction-count-deterministic, which
|
||||
// is what a regression golden needs. (The threaded path is the
|
||||
// documented "GPU thread race" the stable-digest already warns
|
||||
// about.)
|
||||
"--gpu-inline",
|
||||
"--stable-digest",
|
||||
"--expect",
|
||||
&golden_str,
|
||||
|
||||
@@ -78,6 +78,30 @@ pub fn physical_to_backing(addr: u32) -> u32 {
|
||||
}
|
||||
}
|
||||
|
||||
/// Max guest page-version over the `[base, base+len)` span, walking 4 KiB
|
||||
/// pages via the `MemoryAccess` trait's `page_version`.
|
||||
///
|
||||
/// The concrete heap exposes an inherent `max_page_version(base, len)`, but
|
||||
/// the draw handler only holds `&dyn MemoryAccess` (which carries the coarser
|
||||
/// `page_version(addr)` accessor). This is byte-equivalent to
|
||||
/// `heap::max_page_version` and stays a pure function of the per-page write
|
||||
/// counters (no wall-clock), so texture-decode timing remains deterministic.
|
||||
fn span_max_version(mem: &dyn MemoryAccess, base: u32, len: u32) -> u64 {
|
||||
const PAGE: u32 = 0x1000;
|
||||
let last = base.saturating_add(len.saturating_sub(1));
|
||||
let mut page = base & !(PAGE - 1);
|
||||
let last_page = last & !(PAGE - 1);
|
||||
let mut max = 0u64;
|
||||
loop {
|
||||
max = max.max(mem.page_version(page));
|
||||
if page >= last_page {
|
||||
break;
|
||||
}
|
||||
page = page.wrapping_add(PAGE);
|
||||
}
|
||||
max
|
||||
}
|
||||
|
||||
/// Cached Xenos microcode blob, produced by `PM4_IM_LOAD*` packets.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct ShaderBlob {
|
||||
@@ -400,6 +424,12 @@ pub struct GpuSystem {
|
||||
/// on every texture-fetch resolution; the UI thread sees the decoded
|
||||
/// bytes via `UiBridge::publish_texture`.
|
||||
pub texture_cache: crate::texture_cache::TextureCache,
|
||||
/// P5b: textures decoded at the most recent `PM4_DRAW_INDX*`, keyed off
|
||||
/// the *active* pixel shader's real `tfetch` fetch-constant slots (not a
|
||||
/// hardcoded slot). `vd_swap` publishes the first of these to the UI so
|
||||
/// the replay binds the texture the draw actually samples. Cleared and
|
||||
/// repopulated each draw; empty when the active PS issues no `tfetch`.
|
||||
pub last_draw_textures: Vec<(crate::texture_cache::TextureKey, Vec<u8>)>,
|
||||
/// 10 MiB shadow of the Xenos EDRAM. Written by clear-resolves and
|
||||
/// (future) host-render-target readback; read by the resolve byte-copy
|
||||
/// path that writes tiled pixels into guest memory. Allocated once at
|
||||
@@ -431,6 +461,7 @@ impl GpuSystem {
|
||||
rt_cache: crate::render_target_cache::RenderTargetCache::new(),
|
||||
last_resolve: None,
|
||||
texture_cache: crate::texture_cache::TextureCache::new(),
|
||||
last_draw_textures: Vec::new(),
|
||||
edram: crate::edram::ShadowEdram::new(),
|
||||
}
|
||||
}
|
||||
@@ -603,14 +634,21 @@ impl GpuSystem {
|
||||
/// Release.
|
||||
pub fn sync_with_mmio(&mut self) {
|
||||
let wptr_dwords = self.mmio.cp_rb_wptr.load(Ordering::Acquire);
|
||||
if wptr_dwords != self.ring.write_offset_dwords && self.ring.size_dwords != 0 {
|
||||
self.ring.write_offset_dwords = wptr_dwords % self.ring.size_dwords;
|
||||
// CP_RB_WPTR governs ONLY the primary ring. While an indirect buffer
|
||||
// is executing, the active `self.ring` is a fixed linear sub-stream
|
||||
// and the primary ring is saved at the bottom of the IB stack —
|
||||
// applying the (primary) write pointer to the IB would corrupt its
|
||||
// extent (e.g. `wptr % ib_size`) and strand the GPU mid-buffer.
|
||||
let primary = self.ib_stack.first_mut().unwrap_or(&mut self.ring);
|
||||
if wptr_dwords != primary.write_offset_dwords && primary.size_dwords != 0 {
|
||||
primary.write_offset_dwords = wptr_dwords % primary.size_dwords;
|
||||
}
|
||||
// Mirror our read pointer (Release pairs with any guest-side
|
||||
let primary_rptr = primary.read_offset_dwords;
|
||||
// Mirror the *primary* read pointer (Release pairs with any guest-side
|
||||
// Acquire-load of CP_RB_RPTR for ring writeback bookkeeping).
|
||||
self.mmio
|
||||
.cp_rb_rptr
|
||||
.store(self.ring.read_offset_dwords, Ordering::Release);
|
||||
.store(primary_rptr, Ordering::Release);
|
||||
}
|
||||
|
||||
/// True iff `execute_one` is expected to make progress without blocking.
|
||||
@@ -618,7 +656,11 @@ impl GpuSystem {
|
||||
if let Some(block) = &self.pending_block {
|
||||
return block.is_satisfied(mem, &self.register_file);
|
||||
}
|
||||
self.ring.has_pending()
|
||||
// Pending work may be in the active ring OR in a saved caller ring
|
||||
// further down the IB stack (an exhausted IB still needs `execute_one`
|
||||
// to pop back and resume the primary ring, whose WPTR may have since
|
||||
// advanced).
|
||||
self.ring.has_pending() || self.ib_stack.iter().any(|r| r.has_pending())
|
||||
}
|
||||
|
||||
/// Execute exactly one PM4 packet. Returns [`ExecOutcome::Idle`] when
|
||||
@@ -715,10 +757,13 @@ impl GpuSystem {
|
||||
width,
|
||||
height,
|
||||
});
|
||||
self.pending_interrupts.push(PendingInterrupt {
|
||||
source: InterruptSource::Swap,
|
||||
cpu_mask: 0x1,
|
||||
});
|
||||
// iterate-2T: do NOT raise a CP swap-complete interrupt here. Canary's
|
||||
// `VdSwap`/PM4_XE_SWAP path raises no interrupt; swap-complete CP
|
||||
// interrupts come ONLY from in-stream `PM4_INTERRUPT` packets, which
|
||||
// are naturally ordered after D3D has armed the swap-callback slot.
|
||||
// Synthesizing one out of band (as we did pre-2T) delivered a CP
|
||||
// interrupt while the slot still held the `0xBADF00D` placeholder,
|
||||
// tripping the graphics ISR's "Unanticipated CPU_INTERRUPT" assert.
|
||||
tracing::info!(
|
||||
frame = self.swap_counter,
|
||||
fb = format_args!("{frontbuffer_phys:#010x}"),
|
||||
@@ -730,13 +775,21 @@ impl GpuSystem {
|
||||
|
||||
/// Called by `VdInitializeRingBuffer` to give us the primary ring.
|
||||
pub fn initialize_ring_buffer(&mut self, base: u32, size_log2: u32) {
|
||||
let size_bytes = 1u32 << size_log2.min(31);
|
||||
// Canary `CommandProcessor::InitializeRingBuffer` (command_processor.cc:
|
||||
// 436): `primary_buffer_size_ = 1 << (size_log2 + 3)` *bytes*. The
|
||||
// `VdInitializeRingBuffer` `r4` argument is log2(size-in-quadwords),
|
||||
// so the byte size is `1 << (size_log2 + 3)` (× 8 bytes/quadword), i.e.
|
||||
// `1 << (size_log2 + 1)` dwords. (Sylpheed passes size_log2=12 →
|
||||
// 32768 bytes / 8192 dwords; the previous `1 << size_log2` undersized
|
||||
// the ring 8× and desynced WPTR wrap math from the guest.)
|
||||
let size_bytes = 1u32 << size_log2.saturating_add(3).min(31);
|
||||
// The guest hands us a bare *physical* ring base; project it onto the
|
||||
// committed backing window so ring reads hit real PM4 packets (see
|
||||
// `physical_to_backing`).
|
||||
let base = physical_to_backing(base);
|
||||
self.ring.base = base;
|
||||
self.ring.size_dwords = size_bytes / 4;
|
||||
self.ring.indirect = false;
|
||||
self.ring.read_offset_dwords = 0;
|
||||
// `write_offset` is driven by the guest — start at 0 so the ring
|
||||
// appears empty until MMIO writes advance it.
|
||||
@@ -825,6 +878,38 @@ impl GpuSystem {
|
||||
}
|
||||
}
|
||||
|
||||
/// CP scratch-register memory writeback, mirroring canary's
|
||||
/// `CommandProcessor::HandleSpecialRegisterWrite`
|
||||
/// (`command_processor.cc:545-552`). Every register write runs through
|
||||
/// here; when the target is one of the eight `SCRATCH_REG{n}`
|
||||
/// (`0x0578..=0x057F`) **and** the matching bit in `SCRATCH_UMSK` is set,
|
||||
/// the value is also written (big-endian, as `mem.write_u32` already
|
||||
/// stores) to `SCRATCH_ADDR + n*4` in guest physical memory.
|
||||
///
|
||||
/// Sylpheed arms its CP swap-complete interrupt callback through this
|
||||
/// path: it programs `SCRATCH_ADDR` to the GPU command-block descriptor
|
||||
/// (`[gfx+10772]`, runtime `0x0b1d5000`), `SCRATCH_UMSK` bit 4, then a
|
||||
/// Type-0 write of the callback PC `0x824ce2b8` into `SCRATCH_REG4`
|
||||
/// (`0x057C`). The writeback lands it at descriptor+16 (`0x4b1d5010`),
|
||||
/// which the graphics ISR (`sub_824BE9A0`) reads via `[[gfx+10772]+16]`
|
||||
/// and `bcctrl`s to fire the swap-complete callback. Without this
|
||||
/// writeback the slot stayed NULL, the ISR skipped the callback, the
|
||||
/// swap counter never advanced, and the title's per-frame manager
|
||||
/// re-fired once then plateaued.
|
||||
fn scratch_register_writeback(&self, mem: &dyn MemoryAccess, index: u32, value: u32) {
|
||||
if !(reg::SCRATCH_REG0..=reg::SCRATCH_REG7).contains(&index) {
|
||||
return;
|
||||
}
|
||||
let scratch_reg = index - reg::SCRATCH_REG0;
|
||||
let umsk = self.register_file.read(reg::SCRATCH_UMSK);
|
||||
if (1u32 << scratch_reg) & umsk == 0 {
|
||||
return;
|
||||
}
|
||||
let scratch_addr = self.register_file.read(reg::SCRATCH_ADDR);
|
||||
let mem_addr = physical_to_backing(scratch_addr.wrapping_add(scratch_reg * 4));
|
||||
mem.write_u32(mem_addr, value);
|
||||
}
|
||||
|
||||
fn writeback_read_ptr(&mut self, mem: &dyn MemoryAccess) {
|
||||
if self.ring.rptr_writeback_addr != 0 && self.ring.is_initialized() {
|
||||
mem.write_u32_fence(
|
||||
@@ -849,6 +934,7 @@ impl GpuSystem {
|
||||
let value = mem.read_u32(dword_addr);
|
||||
let target = if write_one { base_index } else { base_index + i };
|
||||
self.register_file.write(target, value);
|
||||
self.scratch_register_writeback(mem, target, value);
|
||||
}
|
||||
tracing::trace!(
|
||||
base = format_args!("{base_index:#x}"),
|
||||
@@ -871,6 +957,8 @@ impl GpuSystem {
|
||||
let b = mem.read_u32(b_addr);
|
||||
self.register_file.write(reg_index_1, a);
|
||||
self.register_file.write(reg_index_2, b);
|
||||
self.scratch_register_writeback(mem, reg_index_1, a);
|
||||
self.scratch_register_writeback(mem, reg_index_2, b);
|
||||
tracing::trace!(
|
||||
r1 = format_args!("{reg_index_1:#x}"),
|
||||
r2 = format_args!("{reg_index_2:#x}"),
|
||||
@@ -935,6 +1023,10 @@ impl GpuSystem {
|
||||
write_offset_dwords: ib_size, // IB is fully-written at jump time
|
||||
rptr_writeback_addr: 0,
|
||||
rptr_writeback_block_dwords: 0,
|
||||
// Linear sub-stream: drain [0, ib_size) then pop. Never
|
||||
// wraps, and `sync_with_mmio`'s CP_RB_WPTR must not touch
|
||||
// it (canary executes IBs through a separate reader).
|
||||
indirect: true,
|
||||
};
|
||||
tracing::debug!(
|
||||
ib_ptr = format_args!("{ib_ptr:#010x}"),
|
||||
@@ -1204,6 +1296,60 @@ impl GpuSystem {
|
||||
);
|
||||
self.last_draw = Some(ds);
|
||||
self.last_primitive = Some(processed);
|
||||
|
||||
// P5b: decode the textures the *active pixel shader* actually
|
||||
// samples. Parse the bound PS, collect its `tfetch`
|
||||
// fetch-constant slots, read each 6-dword fetch constant from
|
||||
// the register file, and decode+cache it. `vd_swap` publishes
|
||||
// the result. Empty for flat (no-tfetch) shaders — the
|
||||
// dominant case on Sylpheed's current splash, where this stays
|
||||
// inert until the textured logo draw is reached.
|
||||
self.last_draw_textures.clear();
|
||||
if let Some(ps_key) = self.active_ps_key {
|
||||
// Collect slots under an immutable borrow of `shader_blobs`,
|
||||
// then drop it before mutating `texture_cache`.
|
||||
let slots: Vec<u8> = match self.shader_blobs.get(&ps_key) {
|
||||
Some(blob) => {
|
||||
let parsed = crate::ucode::parse_shader(&blob.dwords);
|
||||
crate::shader_metrics::tfetch_slots(&parsed)
|
||||
}
|
||||
None => Vec::new(),
|
||||
};
|
||||
for slot in slots {
|
||||
let mut fetch6 = [0u32; 6];
|
||||
for (k, w) in fetch6.iter_mut().enumerate() {
|
||||
*w = self
|
||||
.register_file
|
||||
.read(CONST_BASE_FETCH + slot as u32 * 6 + k as u32);
|
||||
}
|
||||
let Some(key) = crate::texture_cache::decode_fetch_constant(fetch6) else {
|
||||
continue;
|
||||
};
|
||||
let bi = key.format.block_info();
|
||||
let span_bytes = (key.pitch_texels as u32)
|
||||
* (key.height as u32)
|
||||
* (bi.bytes_per_block as u32)
|
||||
/ (bi.block_w as u32);
|
||||
let version = span_max_version(mem, key.base_address, span_bytes.max(4));
|
||||
match self.texture_cache.ensure_cached(key, version, mem) {
|
||||
Ok(entry) => {
|
||||
self.last_draw_textures.push((entry.key, entry.bytes.clone()));
|
||||
metrics::counter!(
|
||||
"gpu.texture.decode",
|
||||
"fmt" => format!("{:?}", key.format),
|
||||
)
|
||||
.increment(1);
|
||||
}
|
||||
Err(e) => {
|
||||
metrics::counter!(
|
||||
"gpu.texture.reject",
|
||||
"reason" => format!("{e:?}"),
|
||||
)
|
||||
.increment(1);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
pm4::PM4_SET_CONSTANT | pm4::PM4_SET_SHADER_CONSTANTS => {
|
||||
// payload[0] = offset_type — bits[10:0] index, bits[23:16] type
|
||||
@@ -1483,11 +1629,31 @@ pub mod reg {
|
||||
/// `XE_GPU_REG_D1MODE_VBLANK_VLINE_STATUS` (Canary register_table.inc:1126).
|
||||
/// Bit 0 = VBLANK_INT_OCCURRED.
|
||||
pub const D1MODE_VBLANK_VLINE_STATUS: u32 = 0x1951;
|
||||
/// `XE_GPU_REG_D1MODE_VIEWPORT_SIZE` / `AVIVO_D1MODE_VIEWPORT_SIZE`
|
||||
/// (Canary `register_table.inc:1134`). Packs the active display resolution
|
||||
/// as `(width << 16) | height` with 12-bit fields. The guest's
|
||||
/// swap-complete interrupt callback (`sub_824CE2B8`) divides by the low
|
||||
/// 12 bits (`height`) as a refresh-pacing term, so a 0 read makes its
|
||||
/// `twi` divide-by-zero guard trap and abort the ISR before it clears the
|
||||
/// swap-acknowledge fence. Canary returns the constant below from
|
||||
/// `GraphicsSystem::ReadRegister` (graphics_system.cc:311).
|
||||
pub const D1MODE_VIEWPORT_SIZE: u32 = 0x1961;
|
||||
/// `XE_GPU_REG_VGT_EVENT_INITIATOR` — set by EVENT_WRITE.
|
||||
pub const VGT_EVENT_INITIATOR: u32 = 0x21F9;
|
||||
/// `XE_GPU_REG_COHER_STATUS_HOST` — coherency bits
|
||||
/// (Canary `register_table.inc:530`).
|
||||
pub const COHER_STATUS_HOST: u32 = 0x0A31;
|
||||
/// `XE_GPU_REG_SCRATCH_UMSK` — bitmask of which `SCRATCH_REG{n}` writes are
|
||||
/// mirrored to memory (Canary `register_table.inc:139`).
|
||||
pub const SCRATCH_UMSK: u32 = 0x01DC;
|
||||
/// `XE_GPU_REG_SCRATCH_ADDR` — base physical address of the scratch
|
||||
/// writeback block (Canary `register_table.inc:141`).
|
||||
pub const SCRATCH_ADDR: u32 = 0x01DD;
|
||||
/// `XE_GPU_REG_SCRATCH_REG0` — first of 8 CP scratch registers
|
||||
/// (`0x0578..=0x057F`, Canary `register_table.inc:331-338`).
|
||||
pub const SCRATCH_REG0: u32 = 0x0578;
|
||||
/// `XE_GPU_REG_SCRATCH_REG7` — last CP scratch register.
|
||||
pub const SCRATCH_REG7: u32 = 0x057F;
|
||||
}
|
||||
|
||||
/// 32-bit FNV-1a over a u32 seed + a slice of u32s. Used to derive a
|
||||
@@ -1578,6 +1744,38 @@ mod tests {
|
||||
assert_eq!(gpu.register_file.read(0x101), 0xCAFE_BABE);
|
||||
}
|
||||
|
||||
#[test]
fn scratch_reg_write_mirrors_to_memory_when_umsk_enabled() {
    // Reproduces how Sylpheed arms its CP swap callback: SCRATCH_ADDR points
    // at a descriptor, SCRATCH_UMSK enables bit 4 only, and a Type-0 write of
    // the callback PC into SCRATCH_REG4 (0x57C) has to show up at
    // SCRATCH_ADDR + 16.
    const RING_BASE: u32 = 0x4000_0000;
    // Physical-mirror identity address used as the scratch writeback block.
    const SCRATCH_BLOCK: u32 = 0x4000_1000;
    const SCRATCH_REG3: u32 = 0x057B;
    const CALLBACK_PC: u32 = 0x824C_E2B8;

    let mut gpu = GpuSystem::new();
    let mut mem = build_mem();
    gpu.initialize_ring_buffer(RING_BASE, 10);

    // Bit 4 only: SCRATCH_REG4 mirrors, SCRATCH_REG3 does not.
    gpu.register_file.write(reg::SCRATCH_ADDR, SCRATCH_BLOCK);
    gpu.register_file.write(reg::SCRATCH_UMSK, 1 << 4);

    // Type-0 run starting at SCRATCH_REG3 with two payload dwords:
    //   0x11111111 -> SCRATCH_REG3 (UMSK bit 3 clear, stays register-only)
    //   callback   -> SCRATCH_REG4 (UMSK bit 4 set, mirrored to +4*4 = +16)
    let header = (1u32 << 16) | SCRATCH_REG3;
    mem.write_u32(RING_BASE, header);
    mem.write_u32(RING_BASE + 4, 0x1111_1111);
    mem.write_u32(RING_BASE + 8, CALLBACK_PC);
    gpu.extend_write_ptr(3);
    assert!(matches!(gpu.execute_one(&mut mem), ExecOutcome::Stepped { .. }));

    // The unmasked register must leave its slot untouched; the masked one
    // must land in its slot.
    assert_eq!(mem.read_u32(SCRATCH_BLOCK + 12), 0, "reg3 must not mirror");
    assert_eq!(
        mem.read_u32(SCRATCH_BLOCK + 16),
        CALLBACK_PC,
        "reg4 must mirror to SCRATCH_ADDR+16"
    );
}
|
||||
|
||||
#[test]
|
||||
fn wait_reg_mem_blocks_then_unblocks_when_mem_changes() {
|
||||
let mut gpu = GpuSystem::new();
|
||||
|
||||
@@ -58,6 +58,15 @@ pub fn build_region(mmio: &GpuMmio) -> MmioRegion {
|
||||
reg::D1MODE_VBLANK_VLINE_STATUS => {
|
||||
read_vblank_status.load(Ordering::Relaxed)
|
||||
}
|
||||
// AVIVO_D1MODE_VIEWPORT_SIZE: the active display resolution
|
||||
// (1280x720) packed as `(width << 16) | height`. Canary
|
||||
// serves this constant from `GraphicsSystem::ReadRegister`
|
||||
// (graphics_system.cc:311). The guest swap-complete interrupt
|
||||
// callback divides by the low 12 bits (`height = 0x2D0`); a 0
|
||||
// read trips its `twi` divide-guard and aborts the ISR before
|
||||
// it acknowledges the per-present swap fence — which strands
|
||||
// the present/title loop. Mirror canary exactly.
|
||||
reg::D1MODE_VIEWPORT_SIZE => 0x0500_02D0,
|
||||
_ => {
|
||||
tracing::trace!(
|
||||
reg = format_args!("{reg_index:#x}"),
|
||||
|
||||
@@ -5,9 +5,8 @@
|
||||
//! rectangles) we rewrite indices on the CPU side so the host just sees a
|
||||
//! triangle list. Ground truth: `xenia-canary/src/xenia/gpu/primitive_processor.h/cc`.
|
||||
//!
|
||||
//! P3 scope: only the shapes Sylpheed's UI + early gameplay paths need
|
||||
//! (list, strip, fan). Rectangle + quad expansions are stubs logged via
|
||||
//! `tracing::warn!` for later.
|
||||
//! Scope: list, strip, fan, quad, and rectangle expansions are all handled
|
||||
//! (rectangles via CPU triangle-list rewrite — see `expand_rectangles`).
|
||||
|
||||
use crate::draw_state::{IndexSize, PrimitiveType};
|
||||
|
||||
@@ -138,18 +137,43 @@ fn expand_quads(indices: Option<&[u32]>, vertex_count: u32) -> ProcessedPrimitiv
|
||||
}
|
||||
|
||||
/// Rectangle lists: a Xenos-specific primitive where each group of 3
|
||||
/// vertices defines a right-angle rectangle by its three non-repeated
|
||||
/// corners (the 4th is derived). The uber-shader doesn't support this yet;
|
||||
/// the ucode translator will emulate it as a geometry-stage fake. For P3
|
||||
/// we emit an empty draw.
|
||||
fn expand_rectangles(_indices: Option<&[u32]>, _vertex_count: u32) -> ProcessedPrimitive {
|
||||
tracing::warn!("gpu: rectangle list primitive not yet implemented (P3 stub)");
|
||||
metrics::counter!("gpu.primitive.rejected", "reason" => "rectangle_list").increment(1);
|
||||
/// vertices defines a rectangle; the 4th corner is extrapolated as
|
||||
/// `v3 = v0 + v2 - v1` (parallelogram completion). Canary expands this in a
|
||||
/// host vertex-shader variant (`kRectangleListAsTriangleStrip`,
|
||||
/// `primitive_processor.cc:389-456`): a 4-vertex triangle strip per rect with
|
||||
/// the 4th corner synthesized *in the VS* from the host-vertex index.
|
||||
///
|
||||
/// Our replay pipeline has no host-VS corner synthesis (and the procedural
|
||||
/// `vs_main` does not consume `rewritten_indices` yet), so we mirror the
|
||||
/// `expand_quads`/`expand_fan` CPU idiom and emit the 3 real vertices of each
|
||||
/// rect as one triangle list `(v0,v1,v2)` — the visible lower half of the
|
||||
/// rect. This un-rejects the draw and gives a faithful `host_vertex_count`.
|
||||
///
|
||||
/// TODO: once `vs_main` does real vertex fetch + interpolation, upgrade to the
|
||||
/// full quad — 6 indices `[v0,v1,v2, v2,v1,v3]` with a synthesized `v3` corner
|
||||
/// — mirroring canary's `kRectangleListAsTriangleStrip`.
|
||||
fn expand_rectangles(indices: Option<&[u32]>, vertex_count: u32) -> ProcessedPrimitive {
|
||||
let rect_count = vertex_count / 3;
|
||||
let mut out = Vec::with_capacity(3 * rect_count as usize);
|
||||
let get = |i: u32| -> u32 {
|
||||
match indices {
|
||||
Some(buf) => buf[i as usize],
|
||||
None => i,
|
||||
}
|
||||
};
|
||||
for r in 0..rect_count {
|
||||
let base = r * 3;
|
||||
out.push(get(base));
|
||||
out.push(get(base + 1));
|
||||
out.push(get(base + 2));
|
||||
}
|
||||
let host_vertex_count = out.len() as u32;
|
||||
metrics::counter!("gpu.primitive.expanded", "shape" => "rectangle_list").increment(1);
|
||||
ProcessedPrimitive {
|
||||
topology: HostTopology::TriangleList,
|
||||
rewritten_indices: Some(Vec::new()),
|
||||
host_vertex_count: 0,
|
||||
rejected: true,
|
||||
rewritten_indices: Some(out),
|
||||
host_vertex_count,
|
||||
rejected: false,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -213,6 +237,17 @@ mod tests {
|
||||
assert_eq!(idx, vec![0, 1, 2, 0, 2, 3, 4, 5, 6, 4, 6, 7]);
|
||||
}
|
||||
|
||||
#[test]
fn rectangle_list_expansion() {
    // Six vertices describe two rectangles. Each rectangle contributes its
    // three real corners as one triangle, and the draw is accepted.
    let processed = process(PrimitiveType::RectangleList, 6, None);

    assert!(!processed.rejected);
    assert_eq!(processed.topology, HostTopology::TriangleList);
    assert_eq!(processed.host_vertex_count, 6);

    let rewritten = processed.rewritten_indices.unwrap();
    assert_eq!(rewritten, vec![0, 1, 2, 3, 4, 5]);
}
|
||||
|
||||
#[test]
|
||||
fn widen_u16_indices_big_endian() {
|
||||
// 3 indices [1, 2, 0x1234] in BE u16.
|
||||
|
||||
@@ -32,6 +32,16 @@ pub struct RingBufferView {
|
||||
/// `VdEnableRingBufferRPtrWriteBack`). We always write back eagerly, so
|
||||
/// we don't actually use this for scheduling — kept for observability.
|
||||
pub rptr_writeback_block_dwords: u32,
|
||||
/// True for an indirect-buffer (`INDIRECT_BUFFER`) view. An IB is a fixed
|
||||
/// *linear* sub-stream, not a circular ring: it is fully written when the
|
||||
/// GPU jumps to it, so the read pointer advances monotonically from `0` to
|
||||
/// `size_dwords` and then the buffer is exhausted (the caller ring is
|
||||
/// popped). It must NOT wrap, and the primary `CP_RB_WPTR` must not be
|
||||
/// applied to it. Mirrors canary `ExecuteIndirectBuffer`, which executes
|
||||
/// the IB through a separate `RingBuffer reader_` and restores the primary
|
||||
/// reader afterward (command_processor.cc). Circular (primary-ring)
|
||||
/// semantics are used when this is `false`.
|
||||
pub indirect: bool,
|
||||
}
|
||||
|
||||
impl RingBufferView {
|
||||
@@ -46,7 +56,16 @@ impl RingBufferView {
|
||||
|
||||
/// True if there is pending unread data to consume.
|
||||
pub fn has_pending(&self) -> bool {
|
||||
self.is_initialized() && self.read_offset_dwords != self.write_offset_dwords
|
||||
if !self.is_initialized() {
|
||||
return false;
|
||||
}
|
||||
if self.indirect {
|
||||
// Linear sub-stream: exhausted once the read pointer reaches the
|
||||
// (fixed) write pointer. Never wraps.
|
||||
self.read_offset_dwords < self.write_offset_dwords
|
||||
} else {
|
||||
self.read_offset_dwords != self.write_offset_dwords
|
||||
}
|
||||
}
|
||||
|
||||
/// Number of dwords we can consume without wrapping past the write ptr.
|
||||
@@ -54,7 +73,10 @@ impl RingBufferView {
|
||||
if !self.is_initialized() {
|
||||
return 0;
|
||||
}
|
||||
if self.write_offset_dwords >= self.read_offset_dwords {
|
||||
if self.indirect {
|
||||
self.write_offset_dwords
|
||||
.saturating_sub(self.read_offset_dwords)
|
||||
} else if self.write_offset_dwords >= self.read_offset_dwords {
|
||||
self.write_offset_dwords - self.read_offset_dwords
|
||||
} else {
|
||||
// write has wrapped — we can read up to the end of the ring.
|
||||
@@ -62,14 +84,20 @@ impl RingBufferView {
|
||||
}
|
||||
}
|
||||
|
||||
/// Advance the read pointer by `dwords`, wrapping at `size_dwords`.
|
||||
/// Advance the read pointer by `dwords`. Circular rings wrap at
|
||||
/// `size_dwords`; an indirect buffer advances linearly (no wrap) so it
|
||||
/// terminates exactly at its fixed write pointer.
|
||||
pub fn advance_read(&mut self, dwords: u32) {
|
||||
if self.size_dwords == 0 {
|
||||
return;
|
||||
}
|
||||
if self.indirect {
|
||||
self.read_offset_dwords = self.read_offset_dwords.saturating_add(dwords);
|
||||
} else {
|
||||
self.read_offset_dwords =
|
||||
(self.read_offset_dwords + dwords) % self.size_dwords;
|
||||
}
|
||||
}
|
||||
|
||||
/// Guest address for the dword at relative offset `i` from the current
|
||||
/// read pointer. `None` if uninitialized.
|
||||
@@ -77,7 +105,11 @@ impl RingBufferView {
|
||||
if !self.is_initialized() {
|
||||
return None;
|
||||
}
|
||||
let off = (self.read_offset_dwords + offset_dwords) % self.size_dwords;
|
||||
let off = if self.indirect {
|
||||
self.read_offset_dwords.saturating_add(offset_dwords)
|
||||
} else {
|
||||
(self.read_offset_dwords + offset_dwords) % self.size_dwords
|
||||
};
|
||||
Some(self.base.wrapping_add(off.wrapping_mul(4)))
|
||||
}
|
||||
}
|
||||
@@ -120,4 +152,52 @@ mod tests {
|
||||
assert_eq!(v.addr_at_offset(1), Some(0x4000_0000));
|
||||
assert_eq!(v.addr_at_offset(2), Some(0x4000_0004));
|
||||
}
|
||||
|
||||
#[test]
fn indirect_buffer_drains_linearly_and_terminates() {
    // An indirect buffer is a fixed linear sub-stream. The read pointer runs
    // from 0 up to `size_dwords` and the buffer is then exhausted; it must
    // never wrap back to 0 (that wrap previously caused an endless re-read of
    // a system command buffer; iterate-2O). `write_offset_dwords` equals the
    // size, exactly as the INDIRECT_BUFFER handler sets it.
    const IB_BASE: u32 = 0x4adf_5080;
    const IB_DWORDS: u32 = 11;

    let mut ib = RingBufferView {
        base: IB_BASE,
        size_dwords: IB_DWORDS,
        read_offset_dwords: 0,
        write_offset_dwords: IB_DWORDS,
        rptr_writeback_addr: 0,
        rptr_writeback_block_dwords: 0,
        indirect: true,
    };
    assert!(ib.has_pending());

    // Packet layout observed for Sylpheed's init IB: 2 + 3 + 6 dwords = 11.
    // Data is still pending after each of the first two packets.
    for packet_dwords in [2, 3] {
        ib.advance_read(packet_dwords);
        assert!(ib.has_pending());
    }

    // The last packet lands the read pointer exactly on the write pointer.
    ib.advance_read(6);
    assert_eq!(ib.read_offset_dwords, 11);
    assert!(
        !ib.has_pending(),
        "indirect buffer must terminate at write ptr, not wrap to 0"
    );

    // Address computation must not apply a modulo for an indirect buffer.
    ib.read_offset_dwords = 9;
    assert_eq!(ib.addr_at_offset(1), Some(IB_BASE + 10 * 4));
}
|
||||
|
||||
#[test]
fn indirect_flag_does_not_affect_circular_ring() {
    // Control case for the indirect-buffer change: a view created through
    // `RingBufferView::new()` (the circular, primary-ring configuration used
    // here) must keep its modulo-wrapping read pointer.
    let mut ring = RingBufferView {
        base: 0x4adc_c000,
        size_dwords: 8192,
        read_offset_dwords: 8190,
        write_offset_dwords: 2,
        ..RingBufferView::new()
    };
    assert!(ring.has_pending());

    // Consuming 4 dwords from 8190 crosses the end: (8190 + 4) % 8192 == 2,
    // which is exactly the write pointer, so nothing remains pending.
    ring.advance_read(4);
    assert_eq!(ring.read_offset_dwords, 2);
    assert!(!ring.has_pending());
}
|
||||
}
|
||||
|
||||
@@ -45,8 +45,9 @@ pub fn emit_for(parsed: &ParsedShader, stage: &'static str) {
|
||||
parsed.instructions[base + 1],
|
||||
parsed.instructions[base + 2],
|
||||
];
|
||||
// sequence bit layout: 2 bits per triple, hi bit = is-fetch.
|
||||
let is_fetch = ((sequence >> (i * 2 + 1)) & 1) != 0;
|
||||
// sequence: 2 bits per instruction — bit[0]=fetch(1)/ALU(0),
|
||||
// bit[1]=serialize (Xenos `ucode.h:226`).
|
||||
let is_fetch = ((sequence >> (i * 2)) & 1) != 0;
|
||||
if is_fetch {
|
||||
match decode_fetch(words) {
|
||||
FetchInstruction::Vertex(_) => vfetch_count += 1,
|
||||
@@ -174,6 +175,50 @@ pub fn emit_for(parsed: &ParsedShader, stage: &'static str) {
|
||||
}
|
||||
}
|
||||
|
||||
/// Collect the unique texture-fetch-constant slot indices a shader samples.
|
||||
///
|
||||
/// Walks the same exec-clause / sequence-bitmap path as [`emit_for`] but only
|
||||
/// extracts `TextureFetch.fetch_const` slots, deduplicated and in first-seen
|
||||
/// order. The GPU draw handler uses this to decide which fetch constants to
|
||||
/// decode + cache at draw time (keyed off the *active* pixel shader's real
|
||||
/// `tfetch` instructions rather than a hardcoded slot).
|
||||
pub fn tfetch_slots(parsed: &ParsedShader) -> Vec<u8> {
|
||||
let mut slots: Vec<u8> = Vec::new();
|
||||
for clause in &parsed.cf {
|
||||
if let ControlFlowInstruction::Exec {
|
||||
address,
|
||||
count,
|
||||
sequence,
|
||||
..
|
||||
} = clause
|
||||
{
|
||||
for i in 0..(*count as usize) {
|
||||
let base = (*address as usize + i) * 3;
|
||||
if base + 2 >= parsed.instructions.len() {
|
||||
break;
|
||||
}
|
||||
// sequence: 2 bits per instruction — bit[0]=fetch(1)/ALU(0),
|
||||
// bit[1]=serialize (Xenos `ucode.h:226`).
|
||||
let is_fetch = ((sequence >> (i * 2)) & 1) != 0;
|
||||
if !is_fetch {
|
||||
continue;
|
||||
}
|
||||
let words = [
|
||||
parsed.instructions[base],
|
||||
parsed.instructions[base + 1],
|
||||
parsed.instructions[base + 2],
|
||||
];
|
||||
if let FetchInstruction::Texture(tf) = decode_fetch(words) {
|
||||
if !slots.contains(&tf.fetch_const) {
|
||||
slots.push(tf.fetch_const);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
slots
|
||||
}
|
||||
|
||||
fn mark_feature(buf: &mut Vec<&'static str>, name: &'static str) {
|
||||
if !buf.contains(&name) {
|
||||
buf.push(name);
|
||||
@@ -298,6 +343,46 @@ mod tests {
|
||||
emit_for(&shader, "vs");
|
||||
}
|
||||
|
||||
/// `tfetch_slots` must report the fetch-constant slot of a texture fetch
/// (without duplicates) and report nothing for an ALU-only shader.
#[test]
fn tfetch_slots_extracts_texture_fetch_constants() {
    // Shorthand for an unpredicated, non-terminating exec clause at triple 0.
    let exec_at_zero = |count, sequence| ControlFlowInstruction::Exec {
        address: 0,
        count,
        sequence,
        is_end: false,
        predicated: false,
        predicate_condition: false,
    };

    // Texture-fetch dword0: opcode TEXTURE_FETCH (0x01) in the low 5 bits and
    // const_index = 3 in bits[24:20] (Xenos `ucode.h:844`).
    let tfetch_w0: u32 = 0x01 | (3u32 << 20);

    // Two triples: #0 is a fetch (bit[0] of its 2-bit sequence field set),
    // #1 is ALU. The walker tests `(sequence >> (i*2)) & 1`.
    let with_fetch = ParsedShader {
        cf: vec![exec_at_zero(2, 0b00_01), ControlFlowInstruction::Exit],
        instructions: vec![tfetch_w0, 0, 0, /* ALU triple */ 0, 0, 0],
    };
    assert_eq!(tfetch_slots(&with_fetch), vec![3]);

    // A single ALU triple with no fetch bits set yields no slots at all.
    let alu_only = ParsedShader {
        cf: vec![exec_at_zero(1, 0)],
        instructions: vec![0, 0, 0],
    };
    assert!(tfetch_slots(&alu_only).is_empty());
}
|
||||
|
||||
/// P8: a shader containing `LoopStart` should mark `cf_loop` as used
|
||||
/// so the HUD can surface which deferred feature a game triggers.
|
||||
#[test]
|
||||
|
||||
@@ -56,6 +56,7 @@ const CF_KIND_LOOP_END: u32 = 5u;
|
||||
const CF_KIND_COND_JMP: u32 = 6u;
|
||||
const CF_KIND_COND_CALL: u32 = 7u;
|
||||
const CF_KIND_RETURN: u32 = 8u;
|
||||
const CF_KIND_NOP: u32 = 9u;
|
||||
const CF_KIND_UNKNOWN: u32 = 15u;
|
||||
|
||||
// ── Alloc-kind codes (mirrors `xenia_gpu::ucode::cf_alloc_kind`). ──────
|
||||
@@ -628,8 +629,8 @@ const VFMT_32_32_32_FLOAT: u32 = 57u;
|
||||
// layout in `ucode.h:690`):
|
||||
// w0 [4:0] opcode
|
||||
// w0 [10:5] src_reg[5:0]
|
||||
// w0 [17:11] dst_reg[6:0] + must-be-one
|
||||
// w0 [21:17] const_index[4:0], [23:22] const_index_sel[1:0]
|
||||
// w0 [17:12] dst_reg[5:0]
|
||||
// w0 [24:20] const_index[4:0], [26:25] const_index_sel[1:0]
|
||||
// w1 [21:16] format[5:0]
|
||||
// w2 [7:0] stride (in dwords)
|
||||
// w2 [30:8] offset (signed, in dwords)
|
||||
@@ -641,9 +642,9 @@ fn interpret_vertex_fetch(t: u32) {
|
||||
let w0 = vs_instr_dword(t, 0u);
|
||||
let w1 = vs_instr_dword(t, 1u);
|
||||
let w2 = vs_instr_dword(t, 2u);
|
||||
let fetch_const = (w0 >> 5u) & 0x1Fu;
|
||||
let dst_reg = (w0 >> 10u) & 0x7Fu;
|
||||
let src_reg = (w0 >> 17u) & 0x7Fu;
|
||||
let fetch_const = (w0 >> 20u) & 0x1Fu;
|
||||
let dst_reg = (w0 >> 12u) & 0x3Fu;
|
||||
let src_reg = (w0 >> 5u) & 0x3Fu;
|
||||
let format = (w1 >> 16u) & 0x3Fu;
|
||||
let stride = w2 & 0xFFu;
|
||||
|
||||
@@ -773,20 +774,20 @@ fn interpret_texture_fetch(t: u32, is_vertex: bool) {
|
||||
} else {
|
||||
w0 = ps_instr_dword(t, 0u);
|
||||
}
|
||||
let dst_reg = (w0 >> 10u) & 0x7Fu;
|
||||
let src_reg = (w0 >> 17u) & 0x7Fu;
|
||||
let uv = registers[src_reg & 0x7Fu].xy;
|
||||
let dst_reg = (w0 >> 12u) & 0x3Fu;
|
||||
let src_reg = (w0 >> 5u) & 0x3Fu;
|
||||
let uv = registers[src_reg & 0x3Fu].xy;
|
||||
let sample = textureSampleLevel(xenos_tex, xenos_samp, uv, 0.0);
|
||||
registers[dst_reg & 0x7Fu] = sample;
|
||||
registers[dst_reg & 0x3Fu] = sample;
|
||||
}
|
||||
|
||||
// Walk an Exec clause's instruction triples.
|
||||
// sequence: 2-bit-per-triple bitmap. Bit 0 of a pair = serialize flag
|
||||
// (we ignore in MVP); bit 1 = is-fetch.
|
||||
// sequence: 2-bit-per-instruction bitmap. Bit 0 of a pair = fetch(1)/ALU(0);
|
||||
// bit 1 = serialize (ignored). (Xenos `ucode.h:226`.)
|
||||
fn exec_vs(address: u32, count: u32, sequence: u32) {
|
||||
for (var i: u32 = 0u; i < count; i = i + 1u) {
|
||||
let t = address + i;
|
||||
let is_fetch = ((sequence >> (i * 2u + 1u)) & 1u) != 0u;
|
||||
let is_fetch = ((sequence >> (i * 2u)) & 1u) != 0u;
|
||||
if is_fetch {
|
||||
let opcode = vs_instr_dword(t, 0u) & 0x1Fu;
|
||||
// 0x00 = vertex fetch, 0x01 = texture fetch.
|
||||
@@ -803,7 +804,7 @@ fn exec_vs(address: u32, count: u32, sequence: u32) {
|
||||
fn exec_ps(address: u32, count: u32, sequence: u32) {
|
||||
for (var i: u32 = 0u; i < count; i = i + 1u) {
|
||||
let t = address + i;
|
||||
let is_fetch = ((sequence >> (i * 2u + 1u)) & 1u) != 0u;
|
||||
let is_fetch = ((sequence >> (i * 2u)) & 1u) != 0u;
|
||||
if is_fetch {
|
||||
interpret_texture_fetch(t, false);
|
||||
} else {
|
||||
@@ -962,6 +963,9 @@ fn walk_cf_vs() {
|
||||
// No call stack — mark and continue.
|
||||
reject_mask |= REJECT_CF_CALL;
|
||||
}
|
||||
case CF_KIND_NOP: {
|
||||
// kNop padding / kMarkVsFetchDone hint — no-op, just advance.
|
||||
}
|
||||
default: { reject_mask |= REJECT_CF_JUMP; }
|
||||
}
|
||||
if stop { break; }
|
||||
|
||||
@@ -237,6 +237,10 @@ impl EmitCtx {
|
||||
current_alloc = *kind;
|
||||
}
|
||||
ControlFlowInstruction::Exit => break,
|
||||
// Non-executing CF clauses: padding (`kNop`) and the
|
||||
// vertex-fetch-done hint (`kMarkVsFetchDone`). Skip them.
|
||||
ControlFlowInstruction::Nop
|
||||
| ControlFlowInstruction::MarkVsFetchDone => {}
|
||||
ControlFlowInstruction::LoopStart { .. }
|
||||
| ControlFlowInstruction::LoopEnd { .. } => return Err(reject::CF_LOOP),
|
||||
ControlFlowInstruction::CondJmp { .. } => return Err(reject::CF_COND),
|
||||
@@ -284,7 +288,9 @@ impl EmitCtx {
|
||||
parsed.instructions[base + 1],
|
||||
parsed.instructions[base + 2],
|
||||
];
|
||||
let is_fetch = ((sequence >> (i * 2 + 1)) & 1) != 0;
|
||||
// sequence: 2 bits per instruction — bit[0]=fetch(1)/ALU(0),
|
||||
// bit[1]=serialize (Xenos `ucode.h:226`).
|
||||
let is_fetch = ((sequence >> (i * 2)) & 1) != 0;
|
||||
if is_fetch {
|
||||
match decode_fetch(words) {
|
||||
FetchInstruction::Vertex(vf) => self.emit_vfetch(&vf)?,
|
||||
|
||||
@@ -43,7 +43,15 @@ pub enum ControlFlowInstruction {
|
||||
Return,
|
||||
/// `kAlloc` — pre-allocate export registers (position, interpolators, colors).
|
||||
Alloc { size: u32, kind: AllocKind },
|
||||
/// Exit the shader (terminal).
|
||||
/// `kNop` — fills space in the CF block; executes nothing, does not end
|
||||
/// the shader. (Xenos opcode 0.)
|
||||
Nop,
|
||||
/// `kMarkVsFetchDone` — hint that no more vertex fetches will be performed.
|
||||
/// (Xenos opcode 15.) Non-terminating.
|
||||
MarkVsFetchDone,
|
||||
/// Exit the shader (terminal). Synthesized — Xenos has no dedicated exit
|
||||
/// opcode; the shader ends after an `Exec`/`CondExec` clause with the
|
||||
/// END bit set (`is_end`). Retained for callers/tests that reference it.
|
||||
Exit,
|
||||
/// Unknown / unhandled opcode.
|
||||
Unknown { opcode: u8 },
|
||||
@@ -93,37 +101,45 @@ fn decode_single(payload: u64) -> ControlFlowInstruction {
|
||||
let predicated = ((payload >> 28) & 1) != 0;
|
||||
let predicate_condition = ((payload >> 29) & 1) != 0;
|
||||
|
||||
// Xenos `ControlFlowOpcode` (canary `ucode.h:86-160`):
|
||||
// 0 kNop, 1 kExec, 2 kExecEnd, 3 kCondExec, 4 kCondExecEnd,
|
||||
// 5 kCondExecPred, 6 kCondExecPredEnd, 7 kLoopStart, 8 kLoopEnd,
|
||||
// 9 kCondCall, 10 kReturn, 11 kCondJmp, 12 kAlloc,
|
||||
// 13 kCondExecPredClean, 14 kCondExecPredCleanEnd, 15 kMarkVsFetchDone.
|
||||
// All exec variants share the address(12)/count(3)/sequence(12) layout
|
||||
// of `ControlFlowExecInstruction`; the `*End` variants terminate the
|
||||
// shader. (Prior table was off-by-one — it mapped 0→Exec and 1→Exit,
|
||||
// so a real `kExec` clause was misread as a terminal `Exit`, truncating
|
||||
// the CF block and dropping every `tfetch` in it.)
|
||||
let exec = |is_end: bool| ControlFlowInstruction::Exec {
|
||||
address: (payload & 0xFFF) as u32,
|
||||
count: ((payload >> 12) & 0x7) as u32,
|
||||
sequence: ((payload >> 16) & 0xFFF) as u32,
|
||||
is_end,
|
||||
predicated,
|
||||
predicate_condition,
|
||||
};
|
||||
match opcode {
|
||||
0 => ControlFlowInstruction::Exec {
|
||||
address: (payload & 0xFFF) as u32,
|
||||
count: ((payload >> 12) & 0x7) as u32,
|
||||
sequence: ((payload >> 16) & 0xFFF) as u32,
|
||||
is_end: false,
|
||||
predicated,
|
||||
predicate_condition,
|
||||
},
|
||||
1 => ControlFlowInstruction::Exit,
|
||||
2 => ControlFlowInstruction::Exec {
|
||||
address: (payload & 0xFFF) as u32,
|
||||
count: ((payload >> 12) & 0x7) as u32,
|
||||
sequence: ((payload >> 16) & 0xFFF) as u32,
|
||||
is_end: true,
|
||||
predicated,
|
||||
predicate_condition,
|
||||
},
|
||||
6 => ControlFlowInstruction::LoopStart {
|
||||
0 => ControlFlowInstruction::Nop,
|
||||
1 => exec(false),
|
||||
2 => exec(true),
|
||||
3 => exec(false),
|
||||
4 => exec(true),
|
||||
5 => exec(false),
|
||||
6 => exec(true),
|
||||
7 => ControlFlowInstruction::LoopStart {
|
||||
address: (payload & 0x3FF) as u32,
|
||||
loop_id: ((payload >> 16) & 0x1F) as u32,
|
||||
},
|
||||
7 => ControlFlowInstruction::LoopEnd {
|
||||
8 => ControlFlowInstruction::LoopEnd {
|
||||
address: (payload & 0x3FF) as u32,
|
||||
loop_id: ((payload >> 16) & 0x1F) as u32,
|
||||
},
|
||||
8 => ControlFlowInstruction::CondCall {
|
||||
9 => ControlFlowInstruction::CondCall {
|
||||
target: (payload & 0x3FF) as u32,
|
||||
},
|
||||
9 => ControlFlowInstruction::Return,
|
||||
10 => ControlFlowInstruction::CondJmp {
|
||||
10 => ControlFlowInstruction::Return,
|
||||
11 => ControlFlowInstruction::CondJmp {
|
||||
target: (payload & 0x3FF) as u32,
|
||||
predicated,
|
||||
predicate_condition,
|
||||
@@ -132,6 +148,9 @@ fn decode_single(payload: u64) -> ControlFlowInstruction {
|
||||
size: (payload & 0x7) as u32,
|
||||
kind: AllocKind::from_bits(((payload >> 4) & 0x7) as u32),
|
||||
},
|
||||
13 => exec(false),
|
||||
14 => exec(true),
|
||||
15 => ControlFlowInstruction::MarkVsFetchDone,
|
||||
other => ControlFlowInstruction::Unknown { opcode: other },
|
||||
}
|
||||
}
|
||||
@@ -141,12 +160,49 @@ mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn opcode_exit_decodes() {
|
||||
// opcode 1 (Exit) in bits 44..47 of A's 48-bit payload.
|
||||
fn opcode_nop_and_exec_decode() {
|
||||
// Xenos opcode 0 = kNop (non-terminating padding).
|
||||
let payload: u64 = 0u64 << 44;
|
||||
let (hi, lo) = ((payload & 0xFFFF_FFFF) as u32, ((payload >> 32) & 0xFFFF) as u32);
|
||||
assert_eq!(decode_cf_pair(hi, lo, 0).0, ControlFlowInstruction::Nop);
|
||||
// Xenos opcode 1 = kExec (executes instructions; NOT a terminal exit).
|
||||
let payload: u64 = 1u64 << 44;
|
||||
let (hi, lo) = ((payload & 0xFFFF_FFFF) as u32, ((payload >> 32) & 0xFFFF) as u32);
|
||||
let cf = decode_cf_pair(hi, lo, 0).0;
|
||||
assert_eq!(cf, ControlFlowInstruction::Exit);
|
||||
match decode_cf_pair(hi, lo, 0).0 {
|
||||
ControlFlowInstruction::Exec { is_end, .. } => assert!(!is_end),
|
||||
other => panic!("opcode 1 should be non-end Exec, got {other:?}"),
|
||||
}
|
||||
// Xenos opcode 15 = kMarkVsFetchDone (non-terminating hint).
|
||||
let payload: u64 = 15u64 << 44;
|
||||
let (hi, lo) = ((payload & 0xFFFF_FFFF) as u32, ((payload >> 32) & 0xFFFF) as u32);
|
||||
assert_eq!(
|
||||
decode_cf_pair(hi, lo, 0).0,
|
||||
ControlFlowInstruction::MarkVsFetchDone
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
fn real_logo_shader_has_tfetch_clauses() {
    // Regression for iterate-3M, using the publisher-logo pixel shader
    // E59B2B3DA4AA9008 (captured from the canary oracle; byte-identical to
    // the microcode the guest IM_LOADs). With the old off-by-one opcode
    // table its leading `kExec` (opcode 1) decoded as a terminal `Exit`,
    // the CF block was truncated, and the `tfetch2D` was never seen —
    // which rendered a flat splash.
    let ucode: [u32; 24] = [
        0x00011002, 0x00001200, 0xC4000000, 0x00004003, 0x00002200, 0x00000000,
        0x10082021, 0x1F1FF688, 0x00004000, 0xC8080001, 0x001B1B00, 0xC1020000,
        0xC8070000, 0x00C0C000, 0xC1020000, 0xC8070001, 0x00C01B00, 0xC1000100,
        0xC80F8000, 0x00000000, 0xC2010100, 0x00000000, 0x00000000, 0x00000000,
    ];
    let parsed = crate::ucode::parse_shader(&ucode);

    // The CF block must contain at least one exec clause ...
    let has_exec = parsed
        .cf
        .iter()
        .any(|clause| matches!(clause, ControlFlowInstruction::Exec { .. }));
    assert!(has_exec, "expected >=1 Exec clause, cf={:?}", parsed.cf);

    // ... and walking those clauses must surface the texture fetch.
    let texture_slots = crate::shader_metrics::tfetch_slots(&parsed);
    assert!(
        !texture_slots.is_empty(),
        "expected tfetch slots, got none; cf={:?}",
        parsed.cf
    );
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -54,23 +54,32 @@ pub mod op {
|
||||
}
|
||||
|
||||
pub fn decode_fetch(words: [u32; 3]) -> FetchInstruction {
|
||||
// Fetch dword0 bitfields (Xenos `ucode.h:740-749` vfetch / `844-845`
|
||||
// tfetch): opcode_value:5, src_reg:6, src_reg_am:1, dst_reg:6,
|
||||
// dst_reg_am:1, (fetch_valid_only|must_be_one):1, const_index:5 @ bit20,
|
||||
// ... The prior decoder read `const_index` from bit 5 (which is actually
|
||||
// `src_reg`), so every fetch reported the wrong fetch-constant slot — the
|
||||
// logo `tfetch2D ..., tf0` was read as `tf1`, and slot 1's empty constant
|
||||
// failed to decode → no texture. The texture-fetch `dimension` lives in
|
||||
// dword2 bits 14..15, not dword1.
|
||||
let w0 = words[0];
|
||||
let w1 = words[1];
|
||||
let w2 = words[2];
|
||||
let opcode = (w0 & 0x1F) as u8;
|
||||
match opcode {
|
||||
op::VERTEX_FETCH => FetchInstruction::Vertex(VertexFetch {
|
||||
fetch_const: ((w0 >> 5) & 0x1F) as u8,
|
||||
src_register: ((w0 >> 17) & 0x7F) as u8,
|
||||
dest_register: ((w0 >> 10) & 0x7F) as u8,
|
||||
dest_write_mask: ((w1 >> 23) & 0xF) as u8,
|
||||
fetch_const: ((w0 >> 20) & 0x1F) as u8,
|
||||
src_register: ((w0 >> 5) & 0x3F) as u8,
|
||||
dest_register: ((w0 >> 12) & 0x3F) as u8,
|
||||
dest_write_mask: (w1 & 0xF) as u8,
|
||||
raw: words,
|
||||
}),
|
||||
op::TEXTURE_FETCH => FetchInstruction::Texture(TextureFetch {
|
||||
fetch_const: ((w0 >> 5) & 0x1F) as u8,
|
||||
src_register: ((w0 >> 17) & 0x7F) as u8,
|
||||
dest_register: ((w0 >> 10) & 0x7F) as u8,
|
||||
dest_write_mask: ((w1 >> 23) & 0xF) as u8,
|
||||
dimension: ((w1 >> 29) & 0x3) as u8,
|
||||
fetch_const: ((w0 >> 20) & 0x1F) as u8,
|
||||
src_register: ((w0 >> 5) & 0x3F) as u8,
|
||||
dest_register: ((w0 >> 12) & 0x3F) as u8,
|
||||
dest_write_mask: (w1 & 0xF) as u8,
|
||||
dimension: ((w2 >> 14) & 0x3) as u8,
|
||||
raw: words,
|
||||
}),
|
||||
_ => FetchInstruction::Unknown { opcode, raw: words },
|
||||
@@ -83,8 +92,9 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn decode_vertex_fetch() {
|
||||
// opcode=0 (vertex), fetch_const=5, src=2, dest=7.
|
||||
let w0 = 0u32 | (5 << 5) | (7 << 10) | (2 << 17);
|
||||
// opcode=0 (vertex). Xenos dword0: src_reg@bit5, dst_reg@bit12,
|
||||
// const_index@bit20. fetch_const=5, src=2, dest=7.
|
||||
let w0 = 0u32 | (2 << 5) | (7 << 12) | (5 << 20);
|
||||
let v = decode_fetch([w0, 0, 0]);
|
||||
match v {
|
||||
FetchInstruction::Vertex(vf) => {
|
||||
@@ -98,11 +108,16 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn decode_texture_fetch() {
|
||||
let w0 = 1u32 | (3 << 5) | (4 << 10) | (1 << 17);
|
||||
let t = decode_fetch([w0, (2u32 << 29), 0]);
|
||||
// opcode=1 (texture). const_index@bit20=3, src@bit5=1, dst@bit12=4.
|
||||
// dimension lives in dword2 bits 14..15.
|
||||
let w0 = 1u32 | (1 << 5) | (4 << 12) | (3 << 20);
|
||||
let w2 = 2u32 << 14;
|
||||
let t = decode_fetch([w0, 0, w2]);
|
||||
match t {
|
||||
FetchInstruction::Texture(tf) => {
|
||||
assert_eq!(tf.fetch_const, 3);
|
||||
assert_eq!(tf.src_register, 1);
|
||||
assert_eq!(tf.dest_register, 4);
|
||||
assert_eq!(tf.dimension, 2);
|
||||
}
|
||||
other => panic!("expected Texture, got {other:?}"),
|
||||
|
||||
@@ -48,6 +48,9 @@ pub mod cf_kind {
|
||||
pub const COND_JMP: u32 = 6;
|
||||
pub const COND_CALL: u32 = 7;
|
||||
pub const RETURN: u32 = 8;
|
||||
/// Non-executing CF clause: `kNop` padding or `kMarkVsFetchDone` hint.
|
||||
/// The WGSL CF walker treats this as a no-op (advance, do not reject).
|
||||
pub const NOP: u32 = 9;
|
||||
pub const UNKNOWN: u32 = 15;
|
||||
}
|
||||
|
||||
@@ -136,6 +139,7 @@ fn encode_cf(c: ControlFlowInstruction) -> (u32, u32, u32) {
|
||||
}
|
||||
CondCall { target } => (cf_kind::COND_CALL, target, 0),
|
||||
Return => (cf_kind::RETURN, 0, 0),
|
||||
Nop | MarkVsFetchDone => (cf_kind::NOP, 0, 0),
|
||||
Unknown { opcode } => (cf_kind::UNKNOWN, opcode as u32, 0),
|
||||
}
|
||||
}
|
||||
@@ -164,9 +168,11 @@ pub struct ParsedShader {
|
||||
}
|
||||
|
||||
/// Decode a shader blob. `raw_dwords` is a host-endian slice of the entire
|
||||
/// microcode buffer (control flow + instructions). Heuristic: CF dword count
|
||||
/// is encoded in the first word's low 12 bits of the last exec clause —
|
||||
/// canary iterates until it hits a clause of kind `Exit`. We do the same.
|
||||
/// microcode buffer (control flow + instructions). The CF block is implicitly
|
||||
/// bounded: we walk clause-pair rows until one terminates the shader (an
|
||||
/// `Exec`/`CondExec` clause with the END bit set, per Xenos). Everything after
|
||||
/// that row is the instruction block; exec/loop addresses are then rebased to
|
||||
/// be relative to it.
|
||||
pub fn parse_shader(raw_dwords: &[u32]) -> ParsedShader {
|
||||
let mut cf = Vec::new();
|
||||
// CF clauses are 48-bit (word1 lo 16 + word0 = 48 or so per canary's
|
||||
@@ -175,22 +181,50 @@ pub fn parse_shader(raw_dwords: &[u32]) -> ParsedShader {
|
||||
while i + 2 < raw_dwords.len() {
|
||||
let a = decode_cf_pair(raw_dwords[i], raw_dwords[i + 1], raw_dwords[i + 2]);
|
||||
let (first, second) = a;
|
||||
let seen_exit = matches!(
|
||||
first,
|
||||
ControlFlowInstruction::Exit | ControlFlowInstruction::Unknown { .. }
|
||||
) || matches!(
|
||||
second,
|
||||
ControlFlowInstruction::Exit | ControlFlowInstruction::Unknown { .. }
|
||||
);
|
||||
// The CF block ends after the clause that terminates the shader: an
|
||||
// `Exec` with the END bit set (Xenos `kExecEnd`/`kCondExec*End`), a
|
||||
// synthetic `Exit`, or an `Unknown` opcode (decode ran off the CF
|
||||
// block into instruction data — stop defensively). `Nop` padding
|
||||
// does NOT terminate. (Previously this stopped on the first `Exit`,
|
||||
// but with the corrected opcode table opcode 1 is `kExec`, not exit,
|
||||
// so real exec clauses kept the parse going as intended.)
|
||||
let terminates = |cf: &ControlFlowInstruction| {
|
||||
matches!(
|
||||
cf,
|
||||
ControlFlowInstruction::Exec { is_end: true, .. }
|
||||
| ControlFlowInstruction::Exit
|
||||
| ControlFlowInstruction::Unknown { .. }
|
||||
)
|
||||
};
|
||||
let seen_end = terminates(&first) || terminates(&second);
|
||||
cf.push(first);
|
||||
cf.push(second);
|
||||
i += 3;
|
||||
if seen_exit {
|
||||
if seen_end {
|
||||
break;
|
||||
}
|
||||
}
|
||||
// Everything after `i` dwords is the instruction block.
|
||||
let instructions = raw_dwords[i..].to_vec();
|
||||
// Xenos exec/loop `address` fields are absolute instruction-triple indices
|
||||
// counted from shader dword 0, but `instructions` here begins *after* the
|
||||
// CF block. Rebase those addresses to be relative to the instruction block
|
||||
// (subtract the CF triple count) so `address * 3` indexes `instructions`
|
||||
// directly. (Without this, every exec read 3 dwords too far per CF triple —
|
||||
// the publisher-logo `tfetch` triple was skipped → flat splash.)
|
||||
let cf_triples = (i / 3) as u32;
|
||||
for clause in cf.iter_mut() {
|
||||
match clause {
|
||||
ControlFlowInstruction::Exec { address, .. } => {
|
||||
*address = address.saturating_sub(cf_triples);
|
||||
}
|
||||
ControlFlowInstruction::LoopStart { address, .. }
|
||||
| ControlFlowInstruction::LoopEnd { address, .. } => {
|
||||
*address = address.saturating_sub(cf_triples);
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
ParsedShader { cf, instructions }
|
||||
}
|
||||
|
||||
@@ -235,15 +269,19 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn trivial_exit_clause_stops_parsing() {
|
||||
// Two clauses: [NOP (kind=0), EXIT (kind=1)] encoded per canary.
|
||||
// Exit clause is opcode 1 in the top 4 bits of the upper 16 bits.
|
||||
let w0 = 0u32; // clause A body
|
||||
let w1 = (1u32 << 12) << 16; // upper 16 bits = 0x1000 → opcode=1 (EXIT) for clause A
|
||||
let w2 = 0u32;
|
||||
let p = parse_shader(&[w0, w1, w2, 0xDEAD_BEEF]);
|
||||
fn exec_end_clause_stops_parsing() {
|
||||
// Row: clause B = kExecEnd (opcode 2) terminates the CF block.
|
||||
// 48-bit payload of B occupies hi16(word1) + word2; opcode lives in
|
||||
// bits 44..47 of that payload. Put opcode 2 there: payload bit 44 set
|
||||
// for the `2` → (2 << 44). In B's framing, bits 16..47 come from
|
||||
// word2, so word2 bit (44-16)=28 region holds the opcode nibble.
|
||||
let b_payload: u64 = 2u64 << 44; // kExecEnd
|
||||
// B = lo16 from hi16(word1), hi from word2. Reconstruct word1/word2.
|
||||
let word1 = ((b_payload & 0xFFFF) as u32) << 16; // B's low 16 bits → hi16(word1)
|
||||
let word2 = ((b_payload >> 16) & 0xFFFF_FFFF) as u32;
|
||||
let p = parse_shader(&[0, word1, word2, 0xDEAD_BEEF]);
|
||||
assert!(!p.cf.is_empty());
|
||||
// Exit detected → remaining dword is instruction data.
|
||||
// ExecEnd detected in the first row → remaining dword is instruction data.
|
||||
assert_eq!(p.instructions, vec![0xDEAD_BEEF]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1652,6 +1652,79 @@ fn nt_set_information_file(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut
|
||||
return;
|
||||
}
|
||||
|
||||
// XFileRenameInformation (10): move the backing file to a new path.
|
||||
// Sylpheed's asset-cache decompresses each packed resource to a staging
|
||||
// `cache:\<hash><tail>.tmp` then renames it into its final nested path
|
||||
// `cache:\<hash>\<dir>\<file>`. Without an actual host-FS rename the
|
||||
// nested target stays empty, the later read-back of the decompressed
|
||||
// asset (e.g. the title logo texture `\69d8e45c\e\534ffea`) misses, and
|
||||
// the logo never loads. Mirror canary `xboxkrnl_io_info.cc:226`
|
||||
// (`X_FILE_RENAME_INFORMATION{ replace_existing@0, root_dir_handle@4,
|
||||
// ansi_string@8 }` → `file->Rename(TranslateAnsiPath(ansi_string))`).
|
||||
if info_class == 10 {
|
||||
// Read the target path from the embedded ANSI_STRING at info_ptr+8.
|
||||
let target_raw = match crate::path::read_ansi_string(mem, info_ptr + 8) {
|
||||
Some(s) if !s.is_empty() => s,
|
||||
_ => {
|
||||
const STATUS_OBJECT_NAME_INVALID: u64 = 0xC000_0033;
|
||||
ctx.gpr[3] = STATUS_OBJECT_NAME_INVALID;
|
||||
return;
|
||||
}
|
||||
};
|
||||
// Resolve the destination against the host cache backing dir. We only
|
||||
// support renames within the writable `cache:` mount (the only place
|
||||
// a guest can create files); disc/synth entries are read-only.
|
||||
let new_host = state.resolve_cache_path(&target_raw);
|
||||
// Current backing host path of the handle.
|
||||
let old_host = match state.objects.get(&handle) {
|
||||
Some(KernelObject::File { host_path: Some(hp), .. }) => Some(hp.clone()),
|
||||
Some(KernelObject::File { .. }) => None,
|
||||
_ => {
|
||||
ctx.gpr[3] = STATUS_INVALID_HANDLE;
|
||||
return;
|
||||
}
|
||||
};
|
||||
let status: u64 = match (old_host, new_host) {
|
||||
(Some(old), Some(new)) => {
|
||||
if let Some(parent) = new.parent() {
|
||||
let _ = std::fs::create_dir_all(parent);
|
||||
}
|
||||
match std::fs::rename(&old, &new) {
|
||||
Ok(()) => {
|
||||
// Update the handle so subsequent I/O targets the new
|
||||
// host path + guest path.
|
||||
if let Some(KernelObject::File { path, host_path, .. }) =
|
||||
state.objects.get_mut(&handle)
|
||||
{
|
||||
*path = crate::path::normalize_path(&target_raw);
|
||||
*host_path = Some(new.clone());
|
||||
}
|
||||
tracing::info!(
|
||||
"NtSetInformationFile rename cache {:?} -> {:?} ({:?})",
|
||||
old, new, target_raw
|
||||
);
|
||||
STATUS_SUCCESS
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::warn!(
|
||||
"NtSetInformationFile rename {:?} -> {:?} failed: {}",
|
||||
old, new, e
|
||||
);
|
||||
STATUS_UNSUCCESSFUL
|
||||
}
|
||||
}
|
||||
}
|
||||
// Non-cache (read-only VFS) source/target: acknowledge without a
|
||||
// host move, matching the prior permissive behaviour.
|
||||
_ => STATUS_SUCCESS,
|
||||
};
|
||||
if iosb_ptr != 0 {
|
||||
write_io_status_block(mem, iosb_ptr, status as u32, info_length);
|
||||
}
|
||||
ctx.gpr[3] = status;
|
||||
return;
|
||||
}
|
||||
|
||||
// Handle lookup.
|
||||
let Some(KernelObject::File { size, position, host_path, .. }) = state.objects.get_mut(&handle) else {
|
||||
ctx.gpr[3] = STATUS_INVALID_HANDLE;
|
||||
@@ -2883,10 +2956,12 @@ fn vd_initialize_ring_buffer(ctx: &mut PpcContext, _mem: &GuestMemory, state: &m
|
||||
// packets directly into ring memory at the current WPTR (the GPU
|
||||
// backend lives on a worker thread under `--gpu-thread` so we can't
|
||||
// read its `ring.base` from the kernel side without a channel hop).
|
||||
// Per canary: size_log2 is log2(size in BYTES), so size in dwords =
|
||||
// 2^size_log2 / 4 = 1 << (size_log2 - 2).
|
||||
// Per canary `CommandProcessor::InitializeRingBuffer`: the ring is
|
||||
// `1 << (size_log2 + 3)` bytes = `1 << (size_log2 + 1)` dwords (`r4` is
|
||||
// log2 of the size in quadwords). Kept in sync with
|
||||
// `GpuSystem::initialize_ring_buffer`. (Currently bookkeeping-only.)
|
||||
state.ring_base = ptr;
|
||||
state.ring_size_dwords = if size_log2 >= 2 { 1u32 << (size_log2 - 2) } else { 0 };
|
||||
state.ring_size_dwords = 1u32 << (size_log2 + 1);
|
||||
ctx.gpr[3] = 0;
|
||||
}
|
||||
|
||||
@@ -2997,53 +3072,87 @@ fn vd_swap(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
|
||||
// xboxkrnl_video.cc:479. Currently skipped (see below).
|
||||
let _ = fetch_dwords; // silence unused — will be live again under the deferred path
|
||||
|
||||
// The original M2b path zero-filled buffer_ptr (in the system command
|
||||
// buffer) and bumped WPTR by 64 to expose the game's own ring writes.
|
||||
// Keep that untouched — the game still expects buffer_ptr to be a
|
||||
// skippable scratch area, and the bump still exposes any game-batched
|
||||
// PM4 packets for the drain.
|
||||
// iterate-2V: mirror xenia-canary `VdSwap_entry` (xboxkrnl_video.cc:518-548)
|
||||
// FAITHFULLY. The game reserves 64 dwords (256 bytes) in the primary ring
|
||||
// at `buffer_ptr`; canary writes a `PM4_TYPE0(SHADER_CONSTANT_FETCH_00_0)`
|
||||
// fetch-constant patch followed by `PM4_TYPE3(PM4_XE_SWAP)`, then pads with
|
||||
// NOPs — and **NEVER touches `CP_RB_WPTR`**. The game advances the primary
|
||||
// ring write-pointer itself via its own doorbell once it has finished
|
||||
// populating the reserved slot, so VdSwap only fills the bytes.
|
||||
//
|
||||
// iterate-2V FIX (the bug this removes): a prior revision bumped the
|
||||
// primary ring `CP_RB_WPTR` out-of-band here (`extend_write_ptr_by(64)`).
|
||||
// But `buffer_ptr` (~0x4add6efc) is NOT inside the primary ring (base
|
||||
// ~0x4adcd000, 8192 dwords) — it lives ~10k dwords past it, in the
|
||||
// renderer indirect-buffer region. The bogus WPTR bump pushed the GPU
|
||||
// read-pointer PAST the guest's real write-pointer, the drain treated the
|
||||
// overshoot as a circular wrap, and **re-executed the splash's draw
|
||||
// indirect-buffers ~2×** — inflating draws to 78 (real splash ≈ 28; 12
|
||||
// INDIRECT_BUFFERs vs the real 6). Canary's `VdSwap_entry` writes the
|
||||
// block and returns; the swap-complete CP interrupt comes only from the
|
||||
// game's own in-stream `PM4_INTERRUPT` packets, never from VdSwap.
|
||||
if buffer_ptr != 0 {
|
||||
for i in 0..64u32 {
|
||||
mem.write_u32(buffer_ptr + i * 4, xenia_gpu::pm4::make_packet_type2());
|
||||
let mut off = 0u32;
|
||||
let mut put = |i: &mut u32, v: u32| {
|
||||
mem.write_u32(buffer_ptr + *i * 4, v);
|
||||
*i += 1;
|
||||
};
|
||||
// PM4_TYPE0 fetch-constant slot-0 patch (6 dwords payload). The
|
||||
// base_address field is patched to the physical frontbuffer so the
|
||||
// bloom/blur "sample frame N for frame N+1" path reads the right page.
|
||||
let mut patched = fetch_dwords;
|
||||
patched[1] = (patched[1] & 0x0000_0FFF) | ((frontbuffer_addr >> 12) << 12);
|
||||
put(
|
||||
&mut off,
|
||||
xenia_gpu::pm4::make_packet_type0(
|
||||
xenia_gpu::gpu_system::CONST_BASE_FETCH as u16,
|
||||
6,
|
||||
),
|
||||
);
|
||||
for d in patched {
|
||||
put(&mut off, d);
|
||||
}
|
||||
// PM4_TYPE3(PM4_XE_SWAP, 4 dwords): signature, frontbuffer_phys, w, h.
|
||||
put(
|
||||
&mut off,
|
||||
xenia_gpu::pm4::make_packet_type3(xenia_gpu::pm4::PM4_XE_SWAP, 4),
|
||||
);
|
||||
put(&mut off, xenia_gpu::pm4::SWAP_SIGNATURE);
|
||||
put(&mut off, frontbuffer_addr);
|
||||
put(&mut off, width);
|
||||
put(&mut off, height);
|
||||
// Pad the remainder with NOP (Type-2) packets.
|
||||
while off < 64 {
|
||||
put(&mut off, xenia_gpu::pm4::make_packet_type2());
|
||||
}
|
||||
}
|
||||
state.gpu.extend_write_ptr_by(64);
|
||||
// NOTE: We deliberately do NOT bump `CP_RB_WPTR` here (see the iterate-2V
|
||||
// comment above). The drain below consumes only the packets the game has
|
||||
// legitimately advanced the write-pointer over.
|
||||
|
||||
// GPUBUG-DRAIN-001: notify the swap directly.
|
||||
//
|
||||
// Per xenia-canary `VdSwap_entry` (xboxkrnl_video.cc:438-521), the
|
||||
// textbook approach is to inject `PM4_TYPE0(SHADER_CONSTANT_FETCH_00_0)`
|
||||
// (fetch-constant slot-0 patch for the Sylpheed bloom/blur "frame N+1"
|
||||
// sample) followed by `PM4_TYPE3(PM4_XE_SWAP)` directly into the
|
||||
// primary ring at WPTR, then let the natural drain consume them.
|
||||
//
|
||||
// That works in **pure lockstep** (drain runs at every kernel callback
|
||||
// boundary, ring has at most a few hundred packets pending). It
|
||||
// **does not** work under `--parallel` (CPU + GPU ring contention) —
|
||||
// observed empirically: vd_swap's `drain_to_current_wptr` consumes
|
||||
// 8-10 million game-batched IB packets in the 900 ms inline-deadline
|
||||
// window without reaching our tail-injected PM4_XE_SWAP. Under
|
||||
// threaded backend the worker has the same deadline. Either:
|
||||
// (a) the safety-net direct notify (below) fires and gets the swap
|
||||
// counted — but if the worker *eventually* drains past our
|
||||
// injected packet later it would double-count,
|
||||
// (b) we extend the deadline so far that vd_swap blocks for many
|
||||
// seconds — unreasonable for a kernel callback.
|
||||
//
|
||||
// Skip the ring injection unconditionally and post `notify_xe_swap`
|
||||
// directly. The drain still runs (game packets execute as normal).
|
||||
// **Trade-off**: the slot-0 fetch-constant patch is deferred —
|
||||
// tracked as GPUBUG-FETCH-PATCH-001. Sylpheed currently has draws=0,
|
||||
// so a stale slot 0 has no observable effect.
|
||||
// Drain the ring up to whatever the game has actually submitted; any
|
||||
// in-stream `PM4_INTERRUPT` / draw packets execute in order. The
|
||||
// reserved-slot PM4_XE_SWAP is consumed by the GPU only once the game
|
||||
// advances its own doorbell over it. The swap-counter safety net below
|
||||
// keeps host swap bookkeeping live in the meantime.
|
||||
let drained = state.gpu.drain_to_current_wptr(mem);
|
||||
tracing::debug!(drained, "VdSwap: drained PM4 packets");
|
||||
|
||||
// Direct swap notification. Inline mode bumps `swaps_seen`
|
||||
// synchronously; threaded mode posts a `GpuCommand::NotifyXeSwap`
|
||||
// and the worker bumps it asynchronously.
|
||||
// Safety net: if the drain did NOT reach our PM4_XE_SWAP during this call (e.g.
|
||||
// an undersized inline deadline left game-batched packets pending), still
|
||||
// bump the host swap counter so the UI present + swap stats stay live.
|
||||
// Skip when the in-stream PM4_XE_SWAP already recorded this frontbuffer
|
||||
// (avoids double-counting). This path does NOT raise a CP interrupt.
|
||||
if frontbuffer_addr != 0 && width > 0 && height > 0 {
|
||||
let already_swapped = state
|
||||
.gpu
|
||||
.as_inline_mut()
|
||||
.map(|g| g.last_swap.map(|s| s.frontbuffer_phys) == Some(frontbuffer_addr))
|
||||
.unwrap_or(false);
|
||||
if !already_swapped {
|
||||
state.gpu.notify_xe_swap(frontbuffer_addr, width, height);
|
||||
}
|
||||
}
|
||||
|
||||
// The remaining vd_swap work (UI publish: shader blobs, constants,
|
||||
// texture cache, frontbuffer detile, ui.notify_swap) reads
|
||||
@@ -3080,16 +3189,17 @@ fn vd_swap(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
|
||||
);
|
||||
ui.publish_assets(blobs, constants);
|
||||
|
||||
// P5: try to decode the primary texture (fetch constant slot 0).
|
||||
// Slot 0 is the convention most games use for their main bound
|
||||
// texture at draw time; full N-slot binding waits for P6+. If the
|
||||
// slot is unset or the format isn't supported (magenta stub kicks
|
||||
// in host-side), we skip.
|
||||
//
|
||||
// Texture fetch constants live at `CONST_BASE_FETCH + slot*6` in
|
||||
// the register file; we read the 6 dwords, decode the key, hit
|
||||
// the CPU cache (with page-version freshness), and clone the
|
||||
// decoded bytes across the bridge.
|
||||
// P5b: publish the texture the last draw's *active pixel shader*
|
||||
// actually sampled. The GPU draw handler decodes the PS's real
|
||||
// `tfetch` fetch-constant slots into `last_draw_textures`; we publish
|
||||
// the first (the UI binds a single texture today). When the last draw
|
||||
// used a flat (no-tfetch) shader the list is empty, so we fall back to
|
||||
// the legacy slot-0 probe to preserve behavior on flat-only frames.
|
||||
let published = gpu_inline.last_draw_textures.first().cloned().or_else(|| {
|
||||
// Fallback: probe fetch constant slot 0 directly. Texture fetch
|
||||
// constants live at `CONST_BASE_FETCH + slot*6` in the register
|
||||
// file; read 6 dwords, decode the key, hit the CPU cache with
|
||||
// page-version freshness, clone the bytes across the bridge.
|
||||
const TEX_SLOT: u32 = 0;
|
||||
let mut fetch6 = [0u32; 6];
|
||||
for (i, slot) in fetch6.iter_mut().enumerate() {
|
||||
@@ -3097,10 +3207,9 @@ fn vd_swap(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
|
||||
.register_file
|
||||
.read(xenia_gpu::gpu_system::CONST_BASE_FETCH + TEX_SLOT * 6 + i as u32);
|
||||
}
|
||||
let published = if let Some(key) = xenia_gpu::texture_cache::decode_fetch_constant(fetch6)
|
||||
{
|
||||
// Span over the entire tiled texture footprint to pick the
|
||||
// max page version covering it.
|
||||
let key = xenia_gpu::texture_cache::decode_fetch_constant(fetch6)?;
|
||||
// Span over the entire tiled texture footprint to pick the max
|
||||
// page version covering it.
|
||||
let bi = key.format.block_info();
|
||||
let span_bytes = (key.pitch_texels as u32)
|
||||
* (key.height as u32)
|
||||
@@ -3118,9 +3227,7 @@ fn vd_swap(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
|
||||
None
|
||||
}
|
||||
}
|
||||
} else {
|
||||
None
|
||||
};
|
||||
});
|
||||
metrics::gauge!("gpu.texture_cache.entries")
|
||||
.set(gpu_inline.texture_cache.len() as f64);
|
||||
ui.publish_texture(published);
|
||||
@@ -5547,6 +5654,67 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
/// `NtSetInformationFile` class 10 (`XFileRenameInformation`) must move
|
||||
/// the backing host file to the new `cache:` path and update the handle.
|
||||
/// Mirrors Sylpheed's asset-cache `.tmp` → `\<hash>\<dir>\<file>` move;
|
||||
/// without it the nested target stays empty and the decompressed asset
|
||||
/// (logo texture) never reads back. Faithful to canary `file->Rename`.
|
||||
#[test]
|
||||
fn nt_set_information_file_rename_moves_cache_file() {
|
||||
let (mut ctx, mut mem, mut state) = fresh();
|
||||
// Real temp cache root + a staging `.tmp` file with known bytes.
|
||||
let root = std::env::temp_dir().join(format!("xenia-rs-rename-test-{}", std::process::id()));
|
||||
let _ = std::fs::remove_dir_all(&root);
|
||||
std::fs::create_dir_all(&root).unwrap();
|
||||
let old_host = root.join("69d8e45ce534ffea.tmp");
|
||||
std::fs::write(&old_host, b"LOGOTEX!").unwrap();
|
||||
state.cache_root = Some(root.clone());
|
||||
// Open handle whose backing host_path is the staging file.
|
||||
let handle = state.alloc_handle_for(KernelObject::File {
|
||||
path: "69d8e45ce534ffea.tmp".to_string(),
|
||||
size: 8,
|
||||
position: 0,
|
||||
data: Arc::new(Vec::new()),
|
||||
dir_enum_pos: None,
|
||||
host_path: Some(old_host.clone()),
|
||||
});
|
||||
// X_FILE_RENAME_INFORMATION { replace@0, root_dir@4, ANSI_STRING@8 }.
|
||||
// ANSI_STRING { len u16, max u16, buf u32 } at info_ptr+8; buffer holds
|
||||
// the target path "cache:\69d8e45c\e\534ffea".
|
||||
let info_ptr = SCRATCH_BASE + 0x100;
|
||||
let str_buf = SCRATCH_BASE + 0x200;
|
||||
let target = b"cache:\\69d8e45c\\e\\534ffea";
|
||||
for (i, b) in target.iter().enumerate() {
|
||||
mem.write_u8(str_buf + i as u32, *b);
|
||||
}
|
||||
mem.write_u32(info_ptr, 0); // replace_existing
|
||||
mem.write_u32(info_ptr + 4, 0); // root_dir_handle
|
||||
mem.write_u16(info_ptr + 8, target.len() as u16); // ANSI_STRING.Length
|
||||
mem.write_u16(info_ptr + 10, target.len() as u16); // MaximumLength
|
||||
mem.write_u32(info_ptr + 12, str_buf); // Buffer
|
||||
let iosb_ptr = SCRATCH_BASE + 0x140;
|
||||
ctx.gpr[3] = handle as u64;
|
||||
ctx.gpr[4] = iosb_ptr as u64;
|
||||
ctx.gpr[5] = info_ptr as u64;
|
||||
ctx.gpr[6] = 16;
|
||||
ctx.gpr[7] = 10; // XFileRenameInformation
|
||||
nt_set_information_file(&mut ctx, &mut mem, &mut state);
|
||||
assert_eq!(ctx.gpr[3], STATUS_SUCCESS);
|
||||
// Staging file gone; nested target exists with the same bytes.
|
||||
let new_host = root.join("69d8e45c").join("e").join("534ffea");
|
||||
assert!(!old_host.exists(), "staging .tmp should be moved away");
|
||||
assert_eq!(std::fs::read(&new_host).unwrap(), b"LOGOTEX!");
|
||||
// Handle now points at the new host + guest path.
|
||||
match state.objects.get(&handle) {
|
||||
Some(KernelObject::File { host_path: Some(hp), path, .. }) => {
|
||||
assert_eq!(hp, &new_host);
|
||||
assert_eq!(path, "cache:/69d8e45c/e/534ffea");
|
||||
}
|
||||
_ => panic!("file handle lost or host_path missing"),
|
||||
}
|
||||
let _ = std::fs::remove_dir_all(&root);
|
||||
}
|
||||
|
||||
/// Read-only VFS — truncating to a different size must fail with
|
||||
/// `STATUS_UNSUCCESSFUL`, matching Canary's error path when
|
||||
/// `file->SetLength(...)` can't honour the request.
|
||||
|
||||
@@ -30,6 +30,12 @@ use xenia_cpu::ThreadRef;
|
||||
pub const INTERRUPT_SOURCE_VSYNC: u32 = 0;
|
||||
pub const INTERRUPT_SOURCE_CP: u32 = 1;
|
||||
|
||||
/// The processor the graphics ISR impersonates for a v-sync interrupt.
|
||||
/// Canary hard-codes this: `MarkVblank` → `DispatchInterruptCallback(0, 2)`
|
||||
/// (graphics_system.cc:478). CP interrupts instead use the bit index of the
|
||||
/// `PM4_INTERRUPT` `cpu_mask`.
|
||||
pub const VSYNC_TARGET_CPU: u8 = 2;
|
||||
|
||||
/// Guest-registered V-sync / graphics-interrupt callback (from
|
||||
/// `VdSetGraphicsInterruptCallback`).
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
@@ -145,9 +151,16 @@ pub type PendingLocalIrq = [std::sync::atomic::AtomicU8;
|
||||
pub struct InterruptState {
|
||||
/// Registered callback (set by `VdSetGraphicsInterruptCallback`).
|
||||
pub callback: Option<GraphicsInterruptCallback>,
|
||||
/// Bounded FIFO of pending interrupt sources awaiting injection.
|
||||
/// Push-back on queue, pop-front on inject. Over-cap pushes drop.
|
||||
pub pending: VecDeque<u32>,
|
||||
/// Bounded FIFO of pending interrupts awaiting injection, as
|
||||
/// `(source, target_cpu)`. Push-back on queue, pop-front on inject.
|
||||
/// Over-cap pushes are dropped. `target_cpu` is the processor the graphics
|
||||
/// ISR must impersonate (canary `XThread::SetActiveCpu` / the
|
||||
/// `DispatchInterruptCallback(source, cpu)` argument): the bit index
|
||||
/// of the CP `PM4_INTERRUPT` `cpu_mask` for source=1, and a fixed `2`
|
||||
/// for vsync (canary `DispatchInterruptCallback(0, 2)`). The ISR reads
|
||||
/// it from the PCR (`[r13+268]`) to clear the matching per-CPU bit of
|
||||
/// the swap-acknowledge fence.
|
||||
pub pending: VecDeque<(u32, u8)>,
|
||||
/// When `Some`, some HW thread is currently running a callback; on
|
||||
/// return-to-sentinel we restore this and clear the flag.
|
||||
pub saved: Option<SavedCallbackCtx>,
|
||||
@@ -211,8 +224,9 @@ impl InterruptState {
|
||||
});
|
||||
}
|
||||
|
||||
/// Queue an interrupt for the next safe injection point.
|
||||
pub fn queue_interrupt(&mut self, source: u32) {
|
||||
/// Queue an interrupt for the next safe injection point. `cpu` is the
|
||||
/// processor the ISR must impersonate (see `pending`).
|
||||
pub fn queue_interrupt(&mut self, source: u32, cpu: u8) {
|
||||
if self.callback.is_none() {
|
||||
self.dropped += 1;
|
||||
return;
|
||||
@@ -221,18 +235,23 @@ impl InterruptState {
|
||||
self.dropped += 1;
|
||||
return;
|
||||
}
|
||||
self.pending.push_back(source);
|
||||
self.pending.push_back((source, cpu));
|
||||
}
|
||||
|
||||
/// Peek at the next pending source without removing it.
|
||||
pub fn peek_next(&self) -> Option<u32> {
|
||||
self.pending.front().copied()
|
||||
self.pending.front().map(|&(source, _)| source)
|
||||
}
|
||||
|
||||
/// Peek at the target CPU of the next pending interrupt.
|
||||
pub fn peek_next_cpu(&self) -> Option<u8> {
|
||||
self.pending.front().map(|&(_, cpu)| cpu)
|
||||
}
|
||||
|
||||
/// Pop the next pending source (called by the injector after it has
|
||||
/// committed to dispatching it).
|
||||
pub fn take_next(&mut self) -> Option<u32> {
|
||||
self.pending.pop_front()
|
||||
self.pending.pop_front().map(|(source, _)| source)
|
||||
}
|
||||
|
||||
/// **Legacy** — instruction-count v-sync ticker. Kept for unit tests
|
||||
@@ -249,7 +268,7 @@ impl InterruptState {
|
||||
let periods = self.vsync_accumulator / VSYNC_INSTR_PERIOD;
|
||||
self.vsync_accumulator %= VSYNC_INSTR_PERIOD;
|
||||
for _ in 0..periods {
|
||||
self.queue_interrupt(INTERRUPT_SOURCE_VSYNC);
|
||||
self.queue_interrupt(INTERRUPT_SOURCE_VSYNC, VSYNC_TARGET_CPU);
|
||||
}
|
||||
true
|
||||
}
|
||||
@@ -288,7 +307,7 @@ impl InterruptState {
|
||||
self.last_vsync_instant = Some(anchor + advance);
|
||||
let to_queue = (periods as usize).min(INTERRUPT_QUEUE_CAP);
|
||||
for _ in 0..to_queue {
|
||||
self.queue_interrupt(INTERRUPT_SOURCE_VSYNC);
|
||||
self.queue_interrupt(INTERRUPT_SOURCE_VSYNC, VSYNC_TARGET_CPU);
|
||||
}
|
||||
true
|
||||
}
|
||||
@@ -306,7 +325,7 @@ mod tests {
|
||||
#[test]
|
||||
fn queue_interrupt_drops_without_callback() {
|
||||
let mut s = InterruptState::default();
|
||||
s.queue_interrupt(INTERRUPT_SOURCE_VSYNC);
|
||||
s.queue_interrupt(INTERRUPT_SOURCE_VSYNC, VSYNC_TARGET_CPU);
|
||||
assert_eq!(s.dropped, 1);
|
||||
assert!(s.pending.is_empty());
|
||||
}
|
||||
@@ -315,9 +334,9 @@ mod tests {
|
||||
fn queue_interrupt_fifo_preserves_order() {
|
||||
let mut s = InterruptState::default();
|
||||
s.set_callback(0x1000, 0xAB);
|
||||
s.queue_interrupt(INTERRUPT_SOURCE_VSYNC);
|
||||
s.queue_interrupt(INTERRUPT_SOURCE_CP);
|
||||
s.queue_interrupt(INTERRUPT_SOURCE_VSYNC);
|
||||
s.queue_interrupt(INTERRUPT_SOURCE_VSYNC, VSYNC_TARGET_CPU);
|
||||
s.queue_interrupt(INTERRUPT_SOURCE_CP, 2);
|
||||
s.queue_interrupt(INTERRUPT_SOURCE_VSYNC, VSYNC_TARGET_CPU);
|
||||
assert_eq!(s.dropped, 0);
|
||||
// FIFO: take_next hands them out in push order.
|
||||
assert_eq!(s.take_next(), Some(INTERRUPT_SOURCE_VSYNC));
|
||||
@@ -331,11 +350,11 @@ mod tests {
|
||||
let mut s = InterruptState::default();
|
||||
s.set_callback(0x1000, 0xAB);
|
||||
for _ in 0..INTERRUPT_QUEUE_CAP {
|
||||
s.queue_interrupt(INTERRUPT_SOURCE_VSYNC);
|
||||
s.queue_interrupt(INTERRUPT_SOURCE_VSYNC, VSYNC_TARGET_CPU);
|
||||
}
|
||||
// Over-cap: drops rather than evicting the oldest.
|
||||
s.queue_interrupt(INTERRUPT_SOURCE_VSYNC);
|
||||
s.queue_interrupt(INTERRUPT_SOURCE_VSYNC);
|
||||
s.queue_interrupt(INTERRUPT_SOURCE_VSYNC, VSYNC_TARGET_CPU);
|
||||
s.queue_interrupt(INTERRUPT_SOURCE_VSYNC, VSYNC_TARGET_CPU);
|
||||
assert_eq!(s.dropped, 2);
|
||||
assert_eq!(s.pending.len(), INTERRUPT_QUEUE_CAP);
|
||||
}
|
||||
|
||||
@@ -13,7 +13,7 @@ use xenia_memory::{GuestMemory, MemoryAccess};
|
||||
/// u16 Length
|
||||
/// u16 MaximumLength
|
||||
/// u32 Buffer (guest pointer)
|
||||
fn read_ansi_string(mem: &GuestMemory, ptr: u32) -> Option<String> {
|
||||
pub fn read_ansi_string(mem: &GuestMemory, ptr: u32) -> Option<String> {
|
||||
if ptr == 0 {
|
||||
return None;
|
||||
}
|
||||
|
||||
@@ -17,6 +17,16 @@ impl PcrWriter for GuestMemoryPcr<'_> {
|
||||
// `GuestMemory::write_u32` takes `&self` post-M2 trait flip; the
|
||||
// wrapping `&'a GuestMemory` is sufficient.
|
||||
self.0.write_u32(pcr_base + 0x2C, hw_id as u32);
|
||||
// PRCB.current_cpu byte at PCR+0x10C (prcb_data@0x100 + current_cpu@0xC).
|
||||
// Canary writes `GetFakeCpuNumber(affinity)` here (xthread.cc:847
|
||||
// `pcr->prcb_data.current_cpu = cpu_index`), which equals the HW thread
|
||||
// id we already compute. Guest spin-barriers (e.g. sub_824D1328, used by
|
||||
// the audio/update pump threads at entries 0x824D2878/0x824D2940) index a
|
||||
// per-HW-thread occupancy array by `lbz r11, 268(r13)` = this byte. Left
|
||||
// unwritten it stayed 0 for every thread, so all threads collided on
|
||||
// slot 0 and the multi-thread rendezvous signature never assembled —
|
||||
// the pump threads spun forever and never fired their KeSetEvent loops.
|
||||
self.0.write_u8(pcr_base + 0x10C, hw_id);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -57,6 +57,11 @@ pub fn allocate_thread_image(
|
||||
mem.write_u32(pcr_base, tls_base);
|
||||
mem.write_u32(pcr_base + 0x2C, hw_thread_id as u32);
|
||||
mem.write_u32(pcr_base + 0x100, 0x1000);
|
||||
// +0x10C prcb_data.current_cpu — canary `pcr->prcb_data.current_cpu`
|
||||
// (PRCB@0x100 + current_cpu@0xC). Guest spin-barriers index a
|
||||
// per-HW-thread slot array by `lbz r11, 268(r13)` = this byte; it
|
||||
// must equal the HW thread id (== PCR+0x2C). See state.rs PcrWriter.
|
||||
mem.write_u8(pcr_base + 0x10C, hw_thread_id);
|
||||
mem.write_u32(pcr_base + 0x150, 0);
|
||||
|
||||
Some(ThreadImage {
|
||||
|
||||
Reference in New Issue
Block a user