From a91f4c550b1099d835577e17a596dffcacdf7ef6 Mon Sep 17 00:00:00 2001
From: MechaCat02 <fabian@diekaulbachs.de>
Date: Sun, 14 Jun 2026 20:49:32 +0200
Subject: [PATCH] [iterate-2W] Sustain the title present loop: viewport-size
 register + ISR CPU impersonation

The title's per-frame loop (sub_822F1AA8) is clock-B-paced and only re-fires
when the swap count [controller+88] changes, which advances only on source=1
CP swap-complete interrupts. Each present batch the guest submits (via the
sub_824CE348 -> sub_824BF4D0 builder) ends with a WAIT_REG_MEM on a per-CPU
swap-acknowledge fence [GCTX+0] (GCTX = [device+10772]); the GPU parks there
until the graphics ISR (sub_824BE9A0) clears that CPU's bit. Two coupled gaps
left our implementation emitting only ONE source=1 interrupt and then
deadlocking (draws plateaued at 28, run halted at ~19.27M instructions):

1. GPU MMIO register 0x1961 (AVIVO_D1MODE_VIEWPORT_SIZE) read as 0. The swap
   callback sub_824CE2B8 divides by its low 12 bits (display height) as a
   refresh-pacing term, so a 0 read tripped its `twi` divide-by-zero guard and
   aborted the ISR before it reached the fence-clear. Mirror canary
   GraphicsSystem::ReadRegister (graphics_system.cc:311): return 0x050002D0
   (1280x720).

2. The ISR ran on an arbitrary borrowed thread, so [r13+268] (the PCR
   processor number) did not match the interrupt's target CPU. The ISR clears
   `1 << current_cpu` from the fence; running on the wrong CPU cleared the
   wrong bit and the fence (bit 2, from cpu_mask 0x4) never reached 0. Carry
   the target CPU through the interrupt queue (bit index of the PM4_INTERRUPT
   cpu_mask for CP, 2 for vsync per canary DispatchInterruptCallback(0, 2)) and
   impersonate it on the borrowed thread's PCR around the ISR, mirroring canary
   EmulateCPInterruptDPC -> XThread::SetActiveCpu.

With both fixes the fence clears, the GPU drains each present batch, source=1
sustains per-present, clock B advances, and the loop runs continuously. Draws
climb linearly with the instruction budget (no re-stall): 28 -> 718 at 50M,
3411 at 200M, 18734 at 1B; swaps go 2 -> 147 / 950 / 6060 at the same
budgets. No "Unanticipated CPU_INTERRUPT" trap.
Inline-deterministic (--stable-digest byte-identical x2); n50m golden
re-baselined. 675 tests green.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 crates/xenia-app/src/main.rs                  | 40 +++++++++++++-
 .../xenia-app/tests/golden/sylpheed_n50m.json | 10 ++--
 crates/xenia-gpu/src/gpu_system.rs            |  9 ++++
 crates/xenia-gpu/src/mmio_region.rs           |  9 ++++
 crates/xenia-kernel/src/interrupts.rs         | 53 +++++++++++++------
 5 files changed, 97 insertions(+), 24 deletions(-)

diff --git a/crates/xenia-app/src/main.rs b/crates/xenia-app/src/main.rs
index d3009e9..78f62c6 100644
--- a/crates/xenia-app/src/main.rs
+++ b/crates/xenia-app/src/main.rs
@@ -2338,10 +2338,22 @@ fn coord_post_round(
     }
 
     if kernel.gpu.has_pending_interrupts() {
-        for _pi in kernel.gpu.take_pending_interrupts() {
+        for pi in kernel.gpu.take_pending_interrupts() {
+            // Canary `ExecutePacketType3_INTERRUPT` dispatches the callback
+            // once per set bit of `cpu_mask` with that bit's index as the
+            // target CPU (`DispatchInterruptCallback(1, n)`). The guest's
+            // swap-acknowledge fence stores `cpu_mask`, and the ISR clears
+            // `1 << current_cpu` from it — so the ISR must run impersonating
+            // the masked CPU or the fence never reaches 0. Sylpheed uses a
+            // single-bit mask (`0x4` → CPU 2); take the lowest set bit.
+            let cpu = if pi.cpu_mask == 0 {
+                xenia_kernel::interrupts::VSYNC_TARGET_CPU
+            } else {
+                pi.cpu_mask.trailing_zeros().min(5) as u8
+            };
             kernel
                 .interrupts
-                .queue_interrupt(xenia_kernel::INTERRUPT_SOURCE_CP);
+                .queue_interrupt(xenia_kernel::INTERRUPT_SOURCE_CP, cpu);
         }
     }
 
@@ -3545,7 +3557,17 @@ fn dispatch_graphics_interrupts(
         None
     };
 
+    /// X_KPCR offset of `prcb_data.current_cpu` (canary `xthread.cc`
+    /// `SetActiveCpu` → `pcr.prcb_data.current_cpu`). The guest graphics
+    /// ISR reads it via `lbz r10, 268(r13)` to decide which per-CPU bit of
+    /// the swap-acknowledge fence to clear.
+    const PCR_CURRENT_CPU_OFF: u32 = 268;
+
     while let Some(source) = kernel.interrupts.peek_next() {
+        let target_cpu = kernel
+            .interrupts
+            .peek_next_cpu()
+            .unwrap_or(xenia_kernel::interrupts::VSYNC_TARGET_CPU);
         // Victim selection: Ready first, then Blocked (canary's
         // `XThread::GetCurrentThread()` analog — any live thread will
         // do for borrowing context). Skip Idle/Exited/ServicingIrq.
@@ -3615,6 +3637,19 @@ fn dispatch_graphics_interrupts(
             saved
         };
 
+        // Impersonate the interrupt's target CPU on the borrowed thread's
+        // PCR, mirroring canary `EmulateCPInterruptDPC` →
+        // `XThread::SetActiveCpu(cpu)`. The guest swap-complete ISR clears
+        // `1 << [pcr.current_cpu]` from the per-present swap-acknowledge
+        // fence; if it runs on the wrong CPU it clears the wrong bit and
+        // the GPU's trailing `WAIT_REG_MEM` on that fence never releases —
+        // stranding the present/title loop. Save/restore so borrowing a
+        // thread doesn't permanently rewrite its processor number.
+        let pcr_addr = (kernel.scheduler.ctx_mut_ref(target_ref).gpr[13] as u32)
+            .wrapping_add(PCR_CURRENT_CPU_OFF);
+        let saved_cpu = mem.read_u8(pcr_addr);
+        mem.write_u8(pcr_addr, target_cpu);
+
         // Stash the previous `scheduler.current` (call_export reaches
         // it; imports the ISR calls must dispatch on the borrowed
         // thread). Restore on the way out.
@@ -3707,6 +3742,7 @@ fn dispatch_graphics_interrupts(
 
         // Restore the borrowed context.
         saved.restore(kernel.scheduler.ctx_mut_ref(target_ref));
+        mem.write_u8(pcr_addr, saved_cpu);
         kernel.scheduler.current = prev_current;
         kernel.interrupts.delivered += 1;
 
diff --git a/crates/xenia-app/tests/golden/sylpheed_n50m.json b/crates/xenia-app/tests/golden/sylpheed_n50m.json
index 0c2f40e..457bac7 100644
--- a/crates/xenia-app/tests/golden/sylpheed_n50m.json
+++ b/crates/xenia-app/tests/golden/sylpheed_n50m.json
@@ -1,10 +1,10 @@
 {
-  "instructions": 19274336,
-  "imports": 72513,
+  "instructions": 50000014,
+  "imports": 352251,
   "unimpl": 0,
-  "draws": 28,
-  "swaps": 2,
+  "draws": 718,
+  "swaps": 147,
   "unique_render_targets": 2,
-  "shader_blobs_live": 3,
+  "shader_blobs_live": 6,
   "texture_cache_entries": 0
 }
diff --git a/crates/xenia-gpu/src/gpu_system.rs b/crates/xenia-gpu/src/gpu_system.rs
index 18b0e70..e3b77e7 100644
--- a/crates/xenia-gpu/src/gpu_system.rs
+++ b/crates/xenia-gpu/src/gpu_system.rs
@@ -1544,6 +1544,15 @@ pub mod reg {
     /// `XE_GPU_REG_D1MODE_VBLANK_VLINE_STATUS` (Canary register_table.inc:1126).
     /// Bit 0 = VBLANK_INT_OCCURRED.
     pub const D1MODE_VBLANK_VLINE_STATUS: u32 = 0x1951;
+    /// `XE_GPU_REG_D1MODE_VIEWPORT_SIZE` / `AVIVO_D1MODE_VIEWPORT_SIZE`
+    /// (Canary `register_table.inc:1134`). Packs the active display resolution
+    /// as `(width << 16) | height` with 12-bit fields. The guest's
+    /// swap-complete interrupt callback (`sub_824CE2B8`) divides by the low
+    /// 12 bits (`height`) as a refresh-pacing term, so a 0 read makes its
+    /// `twi` divide-by-zero guard trap and abort the ISR before it clears the
+    /// swap-acknowledge fence. Canary returns the constant below from
+    /// `GraphicsSystem::ReadRegister` (graphics_system.cc:311).
+    pub const D1MODE_VIEWPORT_SIZE: u32 = 0x1961;
     /// `XE_GPU_REG_VGT_EVENT_INITIATOR` — set by EVENT_WRITE.
     pub const VGT_EVENT_INITIATOR: u32 = 0x21F9;
     /// `XE_GPU_REG_COHER_STATUS_HOST` — coherency bits
diff --git a/crates/xenia-gpu/src/mmio_region.rs b/crates/xenia-gpu/src/mmio_region.rs
index fe32c62..7d3e76b 100644
--- a/crates/xenia-gpu/src/mmio_region.rs
+++ b/crates/xenia-gpu/src/mmio_region.rs
@@ -58,6 +58,15 @@ pub fn build_region(mmio: &GpuMmio) -> MmioRegion {
                 reg::D1MODE_VBLANK_VLINE_STATUS => {
                     read_vblank_status.load(Ordering::Relaxed)
                 }
+                // AVIVO_D1MODE_VIEWPORT_SIZE: the active display resolution
+                // (1280x720) packed as `(width << 16) | height`. Canary
+                // serves this constant from `GraphicsSystem::ReadRegister`
+                // (graphics_system.cc:311). The guest swap-complete interrupt
+                // callback divides by the low 12 bits (`height = 0x2D0`); a 0
+                // read trips its `twi` divide-guard and aborts the ISR before
+                // it acknowledges the per-present swap fence — which strands
+                // the present/title loop. Mirror canary exactly.
+                reg::D1MODE_VIEWPORT_SIZE => 0x0500_02D0,
                 _ => {
                     tracing::trace!(
                         reg = format_args!("{reg_index:#x}"),
diff --git a/crates/xenia-kernel/src/interrupts.rs b/crates/xenia-kernel/src/interrupts.rs
index e336834..aa7cbf7 100644
--- a/crates/xenia-kernel/src/interrupts.rs
+++ b/crates/xenia-kernel/src/interrupts.rs
@@ -30,6 +30,12 @@ use xenia_cpu::ThreadRef;
 pub const INTERRUPT_SOURCE_VSYNC: u32 = 0;
 pub const INTERRUPT_SOURCE_CP: u32 = 1;
 
+/// The processor the graphics ISR impersonates for a v-sync interrupt.
+/// Canary hard-codes this: `MarkVblank` → `DispatchInterruptCallback(0, 2)`
+/// (graphics_system.cc:478). CP interrupts instead use the bit index of the
+/// `PM4_INTERRUPT` `cpu_mask`.
+pub const VSYNC_TARGET_CPU: u8 = 2;
+
 /// Guest-registered V-sync / graphics-interrupt callback (from
 /// `VdSetGraphicsInterruptCallback`).
 #[derive(Debug, Clone, Copy)]
@@ -145,9 +151,16 @@ pub type PendingLocalIrq = [std::sync::atomic::AtomicU8;
 pub struct InterruptState {
     /// Registered callback (set by `VdSetGraphicsInterruptCallback`).
     pub callback: Option<GraphicsInterruptCallback>,
-    /// Bounded FIFO of pending interrupt sources awaiting injection.
-    /// Push-back on queue, pop-front on inject. Over-cap pushes drop.
-    pub pending: VecDeque<u32>,
+    /// Bounded FIFO of pending interrupts awaiting injection, as
+    /// `(source, target_cpu)`. Push-back on queue, pop-front on inject.
+    /// Over-cap pushes drop. `target_cpu` is the processor the graphics
+    /// ISR must impersonate (canary `XThread::SetActiveCpu` / the
+    /// `DispatchInterruptCallback(source, cpu)` argument): the bit index
+    /// of the CP `PM4_INTERRUPT` `cpu_mask` for source=1, and a fixed `2`
+    /// for vsync (canary `DispatchInterruptCallback(0, 2)`). The ISR reads
+    /// it from the PCR (`[r13+268]`) to clear the matching per-CPU bit of
+    /// the swap-acknowledge fence.
+    pub pending: VecDeque<(u32, u8)>,
     /// When `Some`, some HW thread is currently running a callback; on
     /// return-to-sentinel we restore this and clear the flag.
     pub saved: Option<SavedCallbackCtx>,
@@ -211,8 +224,9 @@ impl InterruptState {
         });
     }
 
-    /// Queue an interrupt for the next safe injection point.
-    pub fn queue_interrupt(&mut self, source: u32) {
+    /// Queue an interrupt for the next safe injection point. `cpu` is the
+    /// processor the ISR must impersonate (see `pending`).
+    pub fn queue_interrupt(&mut self, source: u32, cpu: u8) {
         if self.callback.is_none() {
             self.dropped += 1;
             return;
@@ -221,18 +235,23 @@ impl InterruptState {
             self.dropped += 1;
             return;
         }
-        self.pending.push_back(source);
+        self.pending.push_back((source, cpu));
     }
 
     /// Peek at the next pending source without removing it.
     pub fn peek_next(&self) -> Option<u32> {
-        self.pending.front().copied()
+        self.pending.front().map(|&(source, _)| source)
+    }
+
+    /// Peek at the target CPU of the next pending interrupt.
+    pub fn peek_next_cpu(&self) -> Option<u8> {
+        self.pending.front().map(|&(_, cpu)| cpu)
     }
 
     /// Pop the next pending source (called by the injector after it has
     /// committed to dispatching it).
     pub fn take_next(&mut self) -> Option<u32> {
-        self.pending.pop_front()
+        self.pending.pop_front().map(|(source, _)| source)
     }
 
     /// **Legacy** — instruction-count v-sync ticker. Kept for unit tests
@@ -249,7 +268,7 @@ impl InterruptState {
         let periods = self.vsync_accumulator / VSYNC_INSTR_PERIOD;
         self.vsync_accumulator %= VSYNC_INSTR_PERIOD;
         for _ in 0..periods {
-            self.queue_interrupt(INTERRUPT_SOURCE_VSYNC);
+            self.queue_interrupt(INTERRUPT_SOURCE_VSYNC, VSYNC_TARGET_CPU);
         }
         true
     }
@@ -288,7 +307,7 @@ impl InterruptState {
         self.last_vsync_instant = Some(anchor + advance);
         let to_queue = (periods as usize).min(INTERRUPT_QUEUE_CAP);
         for _ in 0..to_queue {
-            self.queue_interrupt(INTERRUPT_SOURCE_VSYNC);
+            self.queue_interrupt(INTERRUPT_SOURCE_VSYNC, VSYNC_TARGET_CPU);
         }
         true
     }
@@ -306,7 +325,7 @@ mod tests {
     #[test]
     fn queue_interrupt_drops_without_callback() {
         let mut s = InterruptState::default();
-        s.queue_interrupt(INTERRUPT_SOURCE_VSYNC);
+        s.queue_interrupt(INTERRUPT_SOURCE_VSYNC, VSYNC_TARGET_CPU);
         assert_eq!(s.dropped, 1);
         assert!(s.pending.is_empty());
     }
@@ -315,9 +334,9 @@ mod tests {
     fn queue_interrupt_fifo_preserves_order() {
         let mut s = InterruptState::default();
         s.set_callback(0x1000, 0xAB);
-        s.queue_interrupt(INTERRUPT_SOURCE_VSYNC);
-        s.queue_interrupt(INTERRUPT_SOURCE_CP);
-        s.queue_interrupt(INTERRUPT_SOURCE_VSYNC);
+        s.queue_interrupt(INTERRUPT_SOURCE_VSYNC, VSYNC_TARGET_CPU);
+        s.queue_interrupt(INTERRUPT_SOURCE_CP, 2);
+        s.queue_interrupt(INTERRUPT_SOURCE_VSYNC, VSYNC_TARGET_CPU);
         assert_eq!(s.dropped, 0);
         // FIFO: take_next hands them out in push order.
         assert_eq!(s.take_next(), Some(INTERRUPT_SOURCE_VSYNC));
@@ -331,11 +350,11 @@ mod tests {
         let mut s = InterruptState::default();
         s.set_callback(0x1000, 0xAB);
         for _ in 0..INTERRUPT_QUEUE_CAP {
-            s.queue_interrupt(INTERRUPT_SOURCE_VSYNC);
+            s.queue_interrupt(INTERRUPT_SOURCE_VSYNC, VSYNC_TARGET_CPU);
         }
         // Over-cap: drops rather than evicting the oldest.
-        s.queue_interrupt(INTERRUPT_SOURCE_VSYNC);
-        s.queue_interrupt(INTERRUPT_SOURCE_VSYNC);
+        s.queue_interrupt(INTERRUPT_SOURCE_VSYNC, VSYNC_TARGET_CPU);
+        s.queue_interrupt(INTERRUPT_SOURCE_VSYNC, VSYNC_TARGET_CPU);
         assert_eq!(s.dropped, 2);
         assert_eq!(s.pending.len(), INTERRUPT_QUEUE_CAP);
     }