From a91f4c550b1099d835577e17a596dffcacdf7ef6 Mon Sep 17 00:00:00 2001 From: MechaCat02 Date: Sun, 14 Jun 2026 20:49:32 +0200 Subject: [PATCH] [iterate-2W] Sustain the title present loop: viewport-size register + ISR CPU impersonation The title's per-frame loop (sub_822F1AA8) is clock-B-paced and only re-fires when the swap count [controller+88] changes; that count advances only on source=1 CP swap-complete interrupts. Each present batch the guest submits (via the sub_824CE348 -> sub_824BF4D0 builder) ends with a WAIT_REG_MEM on a per-CPU swap-acknowledge fence [GCTX+0] (GCTX = [device+10772]); the GPU parks there until the graphics ISR (sub_824BE9A0) clears that CPU's bit. Two coupled gaps kept our implementation emitting only ONE source=1 interrupt and then deadlocking (draws plateaued at 28, run halted ~19.27M): 1. GPU MMIO register 0x1961 (AVIVO_D1MODE_VIEWPORT_SIZE) read as 0. The swap callback sub_824CE2B8 divides by its low 12 bits (display height) as a refresh-pacing term, so a 0 read tripped its `twi` divide-by-zero guard and aborted the ISR before it reached the fence-clear. Mirror canary GraphicsSystem::ReadRegister (graphics_system.cc:311): return 0x050002D0 (1280x720). 2. The ISR ran on an arbitrary borrowed thread, so [r13+268] (the PCR processor number) did not match the interrupt's target CPU. The ISR clears `1 << current_cpu` from the fence; running on the wrong CPU cleared the wrong bit and the fence (bit 2, from cpu_mask 0x4) never reached 0. Carry the target CPU through the interrupt queue (bit index of the PM4_INTERRUPT cpu_mask for CP, 2 for vsync per canary DispatchInterruptCallback(0, 2)) and impersonate it on the borrowed thread's PCR around the ISR, mirroring canary EmulateCPInterruptDPC -> XThread::SetActiveCpu. With both fixes the fence clears, the GPU drains each present batch, source=1 sustains per-present, clock B advances, and the loop runs continuously. Draws climb linearly with the budget (no re-stall): 50M 28->718, 200M ->3411, 1B ->18734; swaps 2->147/950/6060. 
No "Unanticipated CPU_INTERRUPT" trap. Inline-deterministic (--stable-digest byte-identical x2); n50m golden re-baselined. 675 tests green. Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/xenia-app/src/main.rs | 40 +++++++++++++- .../xenia-app/tests/golden/sylpheed_n50m.json | 10 ++-- crates/xenia-gpu/src/gpu_system.rs | 9 ++++ crates/xenia-gpu/src/mmio_region.rs | 9 ++++ crates/xenia-kernel/src/interrupts.rs | 53 +++++++++++++------ 5 files changed, 97 insertions(+), 24 deletions(-) diff --git a/crates/xenia-app/src/main.rs b/crates/xenia-app/src/main.rs index d3009e9..78f62c6 100644 --- a/crates/xenia-app/src/main.rs +++ b/crates/xenia-app/src/main.rs @@ -2338,10 +2338,22 @@ fn coord_post_round( } if kernel.gpu.has_pending_interrupts() { - for _pi in kernel.gpu.take_pending_interrupts() { + for pi in kernel.gpu.take_pending_interrupts() { + // Canary `ExecutePacketType3_INTERRUPT` dispatches the callback + // once per set bit of `cpu_mask` with that bit's index as the + // target CPU (`DispatchInterruptCallback(1, n)`). The guest's + // swap-acknowledge fence stores `cpu_mask`, and the ISR clears + // `1 << current_cpu` from it — so the ISR must run impersonating + // the masked CPU or the fence never reaches 0. Sylpheed uses a + // single-bit mask (`0x4` → CPU 2); take the lowest set bit. + let cpu = if pi.cpu_mask == 0 { + xenia_kernel::interrupts::VSYNC_TARGET_CPU + } else { + pi.cpu_mask.trailing_zeros().min(5) as u8 + }; kernel .interrupts - .queue_interrupt(xenia_kernel::INTERRUPT_SOURCE_CP); + .queue_interrupt(xenia_kernel::INTERRUPT_SOURCE_CP, cpu); } } @@ -3545,7 +3557,17 @@ fn dispatch_graphics_interrupts( None }; + /// X_KPCR offset of `prcb_data.current_cpu` (canary `xthread.cc` + /// `SetActiveCpu` → `pcr.prcb_data.current_cpu`). The guest graphics + /// ISR reads it via `lbz r10, 268(r13)` to decide which per-CPU bit of + /// the swap-acknowledge fence to clear. 
+ const PCR_CURRENT_CPU_OFF: u32 = 268; + while let Some(source) = kernel.interrupts.peek_next() { + let target_cpu = kernel + .interrupts + .peek_next_cpu() + .unwrap_or(xenia_kernel::interrupts::VSYNC_TARGET_CPU); // Victim selection: Ready first, then Blocked (canary's // `XThread::GetCurrentThread()` analog — any live thread will // do for borrowing context). Skip Idle/Exited/ServicingIrq. @@ -3615,6 +3637,19 @@ fn dispatch_graphics_interrupts( saved }; + // Impersonate the interrupt's target CPU on the borrowed thread's + // PCR, mirroring canary `EmulateCPInterruptDPC` → + // `XThread::SetActiveCpu(cpu)`. The guest swap-complete ISR clears + // `1 << [pcr.current_cpu]` from the per-present swap-acknowledge + // fence; if it runs on the wrong CPU it clears the wrong bit and + // the GPU's trailing `WAIT_REG_MEM` on that fence never releases — + // stranding the present/title loop. Save/restore so borrowing a + // thread doesn't permanently rewrite its processor number. + let pcr_addr = (kernel.scheduler.ctx_mut_ref(target_ref).gpr[13] as u32) + .wrapping_add(PCR_CURRENT_CPU_OFF); + let saved_cpu = mem.read_u8(pcr_addr); + mem.write_u8(pcr_addr, target_cpu); + // Stash the previous `scheduler.current` (call_export reaches // it; imports the ISR calls must dispatch on the borrowed // thread). Restore on the way out. @@ -3707,6 +3742,7 @@ fn dispatch_graphics_interrupts( // Restore the borrowed context. 
saved.restore(kernel.scheduler.ctx_mut_ref(target_ref)); + mem.write_u8(pcr_addr, saved_cpu); kernel.scheduler.current = prev_current; kernel.interrupts.delivered += 1; diff --git a/crates/xenia-app/tests/golden/sylpheed_n50m.json b/crates/xenia-app/tests/golden/sylpheed_n50m.json index 0c2f40e..457bac7 100644 --- a/crates/xenia-app/tests/golden/sylpheed_n50m.json +++ b/crates/xenia-app/tests/golden/sylpheed_n50m.json @@ -1,10 +1,10 @@ { - "instructions": 19274336, - "imports": 72513, + "instructions": 50000014, + "imports": 352251, "unimpl": 0, - "draws": 28, - "swaps": 2, + "draws": 718, + "swaps": 147, "unique_render_targets": 2, - "shader_blobs_live": 3, + "shader_blobs_live": 6, "texture_cache_entries": 0 } diff --git a/crates/xenia-gpu/src/gpu_system.rs b/crates/xenia-gpu/src/gpu_system.rs index 18b0e70..e3b77e7 100644 --- a/crates/xenia-gpu/src/gpu_system.rs +++ b/crates/xenia-gpu/src/gpu_system.rs @@ -1544,6 +1544,15 @@ pub mod reg { /// `XE_GPU_REG_D1MODE_VBLANK_VLINE_STATUS` (Canary register_table.inc:1126). /// Bit 0 = VBLANK_INT_OCCURRED. pub const D1MODE_VBLANK_VLINE_STATUS: u32 = 0x1951; + /// `XE_GPU_REG_D1MODE_VIEWPORT_SIZE` / `AVIVO_D1MODE_VIEWPORT_SIZE` + /// (Canary `register_table.inc:1134`). Packs the active display resolution + /// as `(width << 16) | height` with 12-bit fields. The guest's + /// swap-complete interrupt callback (`sub_824CE2B8`) divides by the low + /// 12 bits (`height`) as a refresh-pacing term, so a 0 read makes its + /// `twi` divide-by-zero guard trap and abort the ISR before it clears the + /// swap-acknowledge fence. Canary returns the constant below from + /// `GraphicsSystem::ReadRegister` (graphics_system.cc:311). + pub const D1MODE_VIEWPORT_SIZE: u32 = 0x1961; /// `XE_GPU_REG_VGT_EVENT_INITIATOR` — set by EVENT_WRITE. 
pub const VGT_EVENT_INITIATOR: u32 = 0x21F9; /// `XE_GPU_REG_COHER_STATUS_HOST` — coherency bits diff --git a/crates/xenia-gpu/src/mmio_region.rs b/crates/xenia-gpu/src/mmio_region.rs index fe32c62..7d3e76b 100644 --- a/crates/xenia-gpu/src/mmio_region.rs +++ b/crates/xenia-gpu/src/mmio_region.rs @@ -58,6 +58,15 @@ pub fn build_region(mmio: &GpuMmio) -> MmioRegion { reg::D1MODE_VBLANK_VLINE_STATUS => { read_vblank_status.load(Ordering::Relaxed) } + // AVIVO_D1MODE_VIEWPORT_SIZE: the active display resolution + // (1280x720) packed as `(width << 16) | height`. Canary + // serves this constant from `GraphicsSystem::ReadRegister` + // (graphics_system.cc:311). The guest swap-complete interrupt + // callback divides by the low 12 bits (`height = 0x2D0`); a 0 + // read trips its `twi` divide-guard and aborts the ISR before + // it acknowledges the per-present swap fence — which strands + // the present/title loop. Mirror canary exactly. + reg::D1MODE_VIEWPORT_SIZE => 0x0500_02D0, _ => { tracing::trace!( reg = format_args!("{reg_index:#x}"), diff --git a/crates/xenia-kernel/src/interrupts.rs b/crates/xenia-kernel/src/interrupts.rs index e336834..aa7cbf7 100644 --- a/crates/xenia-kernel/src/interrupts.rs +++ b/crates/xenia-kernel/src/interrupts.rs @@ -30,6 +30,12 @@ use xenia_cpu::ThreadRef; pub const INTERRUPT_SOURCE_VSYNC: u32 = 0; pub const INTERRUPT_SOURCE_CP: u32 = 1; +/// The processor the graphics ISR impersonates for a v-sync interrupt. +/// Canary hard-codes this: `MarkVblank` → `DispatchInterruptCallback(0, 2)` +/// (graphics_system.cc:478). CP interrupts instead use the bit index of the +/// `PM4_INTERRUPT` `cpu_mask`. +pub const VSYNC_TARGET_CPU: u8 = 2; + /// Guest-registered V-sync / graphics-interrupt callback (from /// `VdSetGraphicsInterruptCallback`). #[derive(Debug, Clone, Copy)] @@ -145,9 +151,16 @@ pub type PendingLocalIrq = [std::sync::atomic::AtomicU8; pub struct InterruptState { /// Registered callback (set by `VdSetGraphicsInterruptCallback`). 
pub callback: Option, - /// Bounded FIFO of pending interrupt sources awaiting injection. - /// Push-back on queue, pop-front on inject. Over-cap pushes drop. - pub pending: VecDeque, + /// Bounded FIFO of pending interrupts awaiting injection, as + /// `(source, target_cpu)`. Push-back on queue, pop-front on inject. + /// Over-cap pushes drop. `target_cpu` is the processor the graphics + /// ISR must impersonate (canary `XThread::SetActiveCpu` / the + /// `DispatchInterruptCallback(source, cpu)` argument): the bit index + /// of the CP `PM4_INTERRUPT` `cpu_mask` for source=1, and a fixed `2` + /// for vsync (canary `DispatchInterruptCallback(0, 2)`). The ISR reads + /// it from the PCR (`[r13+268]`) to clear the matching per-CPU bit of + /// the swap-acknowledge fence. + pub pending: VecDeque<(u32, u8)>, /// When `Some`, some HW thread is currently running a callback; on /// return-to-sentinel we restore this and clear the flag. pub saved: Option, @@ -211,8 +224,9 @@ impl InterruptState { }); } - /// Queue an interrupt for the next safe injection point. - pub fn queue_interrupt(&mut self, source: u32) { + /// Queue an interrupt for the next safe injection point. `cpu` is the + /// processor the ISR must impersonate (see `pending`). + pub fn queue_interrupt(&mut self, source: u32, cpu: u8) { if self.callback.is_none() { self.dropped += 1; return; @@ -221,18 +235,23 @@ impl InterruptState { self.dropped += 1; return; } - self.pending.push_back(source); + self.pending.push_back((source, cpu)); } /// Peek at the next pending source without removing it. pub fn peek_next(&self) -> Option { - self.pending.front().copied() + self.pending.front().map(|&(source, _)| source) + } + + /// Peek at the target CPU of the next pending interrupt. + pub fn peek_next_cpu(&self) -> Option { + self.pending.front().map(|&(_, cpu)| cpu) } /// Pop the next pending source (called by the injector after it has /// committed to dispatching it). 
pub fn take_next(&mut self) -> Option { - self.pending.pop_front() + self.pending.pop_front().map(|(source, _)| source) } /// **Legacy** — instruction-count v-sync ticker. Kept for unit tests @@ -249,7 +268,7 @@ impl InterruptState { let periods = self.vsync_accumulator / VSYNC_INSTR_PERIOD; self.vsync_accumulator %= VSYNC_INSTR_PERIOD; for _ in 0..periods { - self.queue_interrupt(INTERRUPT_SOURCE_VSYNC); + self.queue_interrupt(INTERRUPT_SOURCE_VSYNC, VSYNC_TARGET_CPU); } true } @@ -288,7 +307,7 @@ impl InterruptState { self.last_vsync_instant = Some(anchor + advance); let to_queue = (periods as usize).min(INTERRUPT_QUEUE_CAP); for _ in 0..to_queue { - self.queue_interrupt(INTERRUPT_SOURCE_VSYNC); + self.queue_interrupt(INTERRUPT_SOURCE_VSYNC, VSYNC_TARGET_CPU); } true } @@ -306,7 +325,7 @@ mod tests { #[test] fn queue_interrupt_drops_without_callback() { let mut s = InterruptState::default(); - s.queue_interrupt(INTERRUPT_SOURCE_VSYNC); + s.queue_interrupt(INTERRUPT_SOURCE_VSYNC, VSYNC_TARGET_CPU); assert_eq!(s.dropped, 1); assert!(s.pending.is_empty()); } @@ -315,9 +334,9 @@ mod tests { fn queue_interrupt_fifo_preserves_order() { let mut s = InterruptState::default(); s.set_callback(0x1000, 0xAB); - s.queue_interrupt(INTERRUPT_SOURCE_VSYNC); - s.queue_interrupt(INTERRUPT_SOURCE_CP); - s.queue_interrupt(INTERRUPT_SOURCE_VSYNC); + s.queue_interrupt(INTERRUPT_SOURCE_VSYNC, VSYNC_TARGET_CPU); + s.queue_interrupt(INTERRUPT_SOURCE_CP, 2); + s.queue_interrupt(INTERRUPT_SOURCE_VSYNC, VSYNC_TARGET_CPU); assert_eq!(s.dropped, 0); // FIFO: take_next hands them out in push order. assert_eq!(s.take_next(), Some(INTERRUPT_SOURCE_VSYNC)); @@ -331,11 +350,11 @@ mod tests { let mut s = InterruptState::default(); s.set_callback(0x1000, 0xAB); for _ in 0..INTERRUPT_QUEUE_CAP { - s.queue_interrupt(INTERRUPT_SOURCE_VSYNC); + s.queue_interrupt(INTERRUPT_SOURCE_VSYNC, VSYNC_TARGET_CPU); } // Over-cap: drops rather than evicting the oldest. 
- s.queue_interrupt(INTERRUPT_SOURCE_VSYNC); - s.queue_interrupt(INTERRUPT_SOURCE_VSYNC); + s.queue_interrupt(INTERRUPT_SOURCE_VSYNC, VSYNC_TARGET_CPU); + s.queue_interrupt(INTERRUPT_SOURCE_VSYNC, VSYNC_TARGET_CPU); assert_eq!(s.dropped, 2); assert_eq!(s.pending.len(), INTERRUPT_QUEUE_CAP); }