diff --git a/crates/xenia-app/src/main.rs b/crates/xenia-app/src/main.rs
index 0dee537..0e2378d 100644
--- a/crates/xenia-app/src/main.rs
+++ b/crates/xenia-app/src/main.rs
@@ -2326,8 +2326,19 @@ fn coord_post_round(
     let mut gpu_runs = (executed_this_round
         / xenia_cpu::scheduler::HW_THREAD_COUNT as u64)
         .max(1);
-    if gpu_runs > 64 {
-        gpu_runs = 64;
+    // Fairness cap on GPU commands drained per round. Must scale with the
+    // per-round instruction volume: with the superblock runner a single
+    // round legitimately retires up to ~SUPERBLOCK_INSTR_BUDGET per slot
+    // (vs ~6 for the old one-block path), so the rate `executed/6` is much
+    // higher and a flat cap of 64 throttled GPU command processing ~17×
+    // (packets 50279→1861 @50M) — collapsing the present loop / splash.
+    // Cap at the budget so the GPU keeps pace with the CPU at the same
+    // per-instruction rate the one-block path had. The inner loop already
+    // early-breaks on `!gpu.is_ready`, so this only bounds a pathological
+    // backlog, never busy-spins.
+    let gpu_cap = superblock_budget().max(64);
+    if gpu_runs > gpu_cap {
+        gpu_runs = gpu_cap;
     }
     if let Some(gpu) = kernel.gpu.as_inline_mut() {
         gpu.sync_with_mmio();
@@ -2812,6 +2823,160 @@ fn worker_epilogue(
     SlotOutcome::Continue
 }
 
+/// Hard cap on the number of guest instructions a single superblock
+/// runner invocation executes before returning to the round-robin
+/// scheduler. Bounds how coarse the lockstep interleaving can get: a
+/// larger budget amortizes more per-round/per-slot tax (faster) but
+/// runs one HW thread for longer between scheduler returns (coarser
+/// cross-thread interleaving). 1024 keeps a slot-visit ~170× longer
+/// than the old single-block (~6 instr) granularity while still
+/// returning to the round well inside a single 50k quantum. Purely an
+/// instruction count → deterministic, schedule reproduces byte-identically.
+///
+/// Tuned empirically on the Sylpheed boot-to-splash workload (iterate-3AL):
+/// budgets up to 256 keep boot progression byte-for-byte healthy (draws /
+/// swaps / packets track the one-block baseline), then a sharp cliff at
+/// ~384 collapses the present loop (a producer/consumer boot handoff
+/// starves when one slot runs too long without returning to the round).
+/// 128 sits 3× below that cliff with ~1.65× boot-to-splash speedup — a
+/// deliberately conservative pick (correctness over the last few %). The
+/// `XENIA_SUPERBLOCK_BUDGET` env var overrides it for further tuning.
+const SUPERBLOCK_INSTR_BUDGET: u64 = 128;
+
+/// Effective superblock budget. Defaults to [`SUPERBLOCK_INSTR_BUDGET`];
+/// `XENIA_SUPERBLOCK_BUDGET` overrides it (A/B tuning without a rebuild).
+/// A budget of 1 reproduces the old one-block-per-slot-visit behaviour
+/// (the chain always stops after the first block). Read once and cached.
+fn superblock_budget() -> u64 {
+    use std::sync::OnceLock;
+    static BUDGET: OnceLock<u64> = OnceLock::new();
+    *BUDGET.get_or_init(|| {
+        std::env::var("XENIA_SUPERBLOCK_BUDGET")
+            .ok()
+            .and_then(|v| v.parse::<u64>().ok())
+            .filter(|&v| v >= 1)
+            .unwrap_or(SUPERBLOCK_INSTR_BUDGET)
+    })
+}
+
+/// Superblock runner (iterate-3AL). Executes a *chain* of basic blocks
+/// for one slot-visit — following each block's terminating branch into
+/// the next block — instead of a single block, amortizing the per-round
+/// (timebase / coord / `round_schedule`) and per-slot (`worker_prologue`)
+/// dispatch tax over up to [`SUPERBLOCK_INSTR_BUDGET`] guest instructions.
+///
+/// Determinism + cross-thread correctness: the chain ENDS (returns to the
+/// round) at exactly the points where lockstep granularity matters, all
+/// pure functions of guest state (never wall-clock):
+///   - a non-`Continue` step result (Yield / SystemCall / Trap / Unimpl /
+///     Halted) — `step_block` already bails on these; `Yield` in
+///     particular is the db16cyc spin-wait hand-off that prevents a
+///     spinner from starving its producer.
+///   - the just-run block was `sync_sensitive` (reserved load/store or a
+///     memory barrier) — the guest's own ordering points.
+///   - the block touched MMIO (the `mem.mmio_access_count()` watermark
+///     advanced) — GPU/register ordering vs other HW threads stays at the
+///     same fine granularity as the old one-block path.
+///   - the next PC leaves ordinary guest code: an import thunk, the halt
+///     sentinel, or unmapped memory — those need the full `worker_prologue`
+///     dispatch, so we stop and let the next round's prologue handle them.
+///   - the instruction budget is reached.
+///
+/// Instruction-count / clock accounting stays exact: `executed` is summed
+/// from the per-block `cycle_count` delta across every chained block and
+/// handed to `worker_epilogue` once, which advances `stats.instruction_count`
+/// and `decrement_quantum` by precisely the retired count — identical to
+/// dispatching each block separately.
+#[allow(clippy::too_many_arguments)]
+fn run_superblock(
+    wc: &mut WorkerCtx,
+    kernel: &mut xenia_kernel::KernelState,
+    mem: &xenia_memory::GuestMemory,
+    debugger: &mut xenia_debugger::Debugger,
+    thunk_map: &HashMap<u32, (ModuleId, u16, String)>,
+    stats: &mut ExecStats,
+    tid: Option<u32>,
+    thread_ref: xenia_cpu::ThreadRef,
+    first_block_ptr: *const xenia_cpu::block_cache::DecodedBlock,
+    first_pc_before: u32,
+) -> SlotOutcome {
+    use xenia_cpu::interpreter::{step_block, StepResult};
+    const LR_HALT: u32 = xenia_cpu::context::LR_HALT_SENTINEL as u32;
+
+    let budget = superblock_budget();
+
+    // Probe / mem-watch / debugger-hook modes need per-block-entry
+    // observability; in those modes never chain (run exactly one block,
+    // identical to the pre-superblock behaviour). The block-cache fast
+    // path is only entered when hooks/DB are off anyway, but a probe or
+    // mem-watch can be armed alongside it.
+    let chain_allowed = !kernel.any_probe_active() && !mem.has_mem_watch();
+
+    let mut block_ptr = first_block_ptr;
+    let mut pc_before = first_pc_before;
+    let mut total_executed: u64 = 0;
+
+    let (result, last_block_ptr, last_pc_before) = loop {
+        let cycle_before = kernel.scheduler.ctx_mut_ref(thread_ref).cycle_count;
+        let mmio_before = mem.mmio_access_count();
+        let block = unsafe { &*block_ptr };
+        let result = {
+            let ctx = kernel.scheduler.ctx_mut_ref(thread_ref);
+            step_block(ctx, mem, block)
+        };
+        let executed = kernel
+            .scheduler
+            .ctx_mut_ref(thread_ref)
+            .cycle_count
+            .saturating_sub(cycle_before);
+        total_executed = total_executed.saturating_add(executed);
+
+        // STOP conditions (any → end the superblock, hand to epilogue):
+        // non-Continue result (let the epilogue apply it), chaining
+        // disabled, a sync-sensitive block just ran, MMIO was touched,
+        // or the budget is spent.
+        if !chain_allowed
+            || !matches!(result, StepResult::Continue)
+            || block.sync_sensitive
+            || mem.mmio_access_count() != mmio_before
+            || total_executed >= budget
+        {
+            break (result, block_ptr, pc_before);
+        }
+
+        // Decide whether the NEXT PC is an ordinary guest block we can
+        // chain into. Anything else (thunk / halt sentinel / unmapped)
+        // needs the full prologue dispatch next round.
+        let next_pc = kernel.scheduler.ctx(wc.hw_id).pc;
+        if next_pc == LR_HALT
+            || (kernel.pc_in_thunk_band(next_pc) && thunk_map.contains_key(&next_pc))
+            || !mem.is_mapped(next_pc)
+        {
+            break (result, block_ptr, pc_before);
+        }
+
+        // Chain: build/fetch the next block. Re-borrows `wc.block_cache`,
+        // which invalidates the previous `block_ptr` — but we've already
+        // finished using it (only `sync_sensitive`/diagnostics were read,
+        // above), so the raw-pointer aliasing rule is respected.
+        pc_before = next_pc;
+        block_ptr = wc.block_cache.lookup_or_build(next_pc, mem) as *const _;
+    };
+
+    worker_epilogue(
+        wc,
+        kernel,
+        debugger,
+        stats,
+        tid,
+        thread_ref,
+        last_block_ptr,
+        last_pc_before,
+        result,
+        total_executed,
+    )
+}
+
 #[instrument(skip_all, fields(max = ?max_instructions, ips = ?ips_limit))]
 fn run_execution(
     mem: &xenia_memory::GuestMemory,
@@ -2825,8 +2990,6 @@ fn run_execution(
     halt_on_deadlock: bool,
     shutdown: Option<std::sync::Arc<std::sync::atomic::AtomicBool>>,
 ) -> ExecStats {
-    use xenia_cpu::interpreter::step_block;
-
     let mut stats = ExecStats::default();
     let _ = quiet; // retained for future per-kind suppression
 
@@ -2974,34 +3137,25 @@ fn run_execution(
                     block_ptr,
                     pc_before,
                 } => {
-                    // Block-cache step. The lockstep path keeps the
-                    // kernel state borrowed straight through (single
-                    // host thread, no contention). Step 03 of the
-                    // M3 real-parallelism plan introduces a
-                    // drop-and-reacquire window around `step_block`
-                    // for the parallel branch.
-                    let cycle_before = kernel.scheduler.ctx_mut_ref(thread_ref).cycle_count;
-                    let block = unsafe { &*block_ptr };
-                    let result = {
-                        let ctx = kernel.scheduler.ctx_mut_ref(thread_ref);
-                        step_block(ctx, mem, block)
-                    };
-                    let executed = kernel
-                        .scheduler
-                        .ctx_mut_ref(thread_ref)
-                        .cycle_count
-                        .saturating_sub(cycle_before);
-                    match worker_epilogue(
+                    // SUPERBLOCK runner (iterate-3AL). Instead of one
+                    // basic block per slot-visit, chain straight-line
+                    // blocks through their branches up to a deterministic
+                    // instruction budget, yielding back to the round only
+                    // at cross-thread synchronization points. Amortizes
+                    // the per-round (timebase / coord / round_schedule)
+                    // and per-slot (prologue) tax over hundreds of
+                    // instructions instead of ~6. See `run_superblock`.
+                    match run_superblock(
                         wc,
                         kernel,
+                        mem,
                         debugger,
+                        thunk_map,
                         &mut stats,
                         tid,
                         thread_ref,
                         block_ptr,
                         pc_before,
-                        result,
-                        executed,
                     ) {
                         SlotOutcome::Continue => continue,
                         SlotOutcome::BreakOuter => break 'outer,
diff --git a/crates/xenia-app/tests/golden/sylpheed_n2m.json b/crates/xenia-app/tests/golden/sylpheed_n2m.json
index 4beef5f..a1ec7fc 100644
--- a/crates/xenia-app/tests/golden/sylpheed_n2m.json
+++ b/crates/xenia-app/tests/golden/sylpheed_n2m.json
@@ -1,5 +1,5 @@
 {
-  "instructions": 2000005,
+  "instructions": 2000073,
   "imports": 5635,
   "unimpl": 0,
   "draws": 0,
diff --git a/crates/xenia-app/tests/golden/sylpheed_n50m.json b/crates/xenia-app/tests/golden/sylpheed_n50m.json
index a6e7f75..08f9714 100644
--- a/crates/xenia-app/tests/golden/sylpheed_n50m.json
+++ b/crates/xenia-app/tests/golden/sylpheed_n50m.json
@@ -1,9 +1,9 @@
 {
-  "instructions": 50000007,
-  "imports": 333453,
+  "instructions": 50000110,
+  "imports": 243387,
   "unimpl": 0,
-  "draws": 1274,
-  "swaps": 259,
+  "draws": 1279,
+  "swaps": 260,
   "unique_render_targets": 2,
   "shader_blobs_live": 6,
   "texture_cache_entries": 1
diff --git a/crates/xenia-cpu/src/block_cache.rs b/crates/xenia-cpu/src/block_cache.rs
index c4dd5a5..5b4a892 100644
--- a/crates/xenia-cpu/src/block_cache.rs
+++ b/crates/xenia-cpu/src/block_cache.rs
@@ -79,6 +79,14 @@ pub struct DecodedBlock {
     /// a successful build (`MAX_BLOCK_INSTRS >= 1` and the build walk
     /// pushes the first decoded word unconditionally).
     pub instrs: Vec<DecodedInstr>,
+    /// True if this block contains a cross-thread synchronization point
+    /// (`PpcOpcode::is_sync_sensitive`: reserved load/store or a memory
+    /// barrier). Computed once at build time. The superblock runner ends
+    /// the run after executing a sync-sensitive block so the lockstep
+    /// interleaving stays fine-grained at exactly those points (preserving
+    /// the cross-thread ordering the 2E/2F/2J boot work depends on),
+    /// while chaining freely through ordinary straight-line blocks.
+    pub sync_sensitive: bool,
 }
 
 /// Per-slot status from a `lookup_or_build` probe. Internal only.
@@ -187,11 +195,13 @@ fn build_block(start_pc: u32, mem: &dyn MemoryAccess, page_version: u64) -> Deco
     let mut instrs: Vec<DecodedInstr> = Vec::with_capacity(8);
     let page_base = start_pc & GUEST_PAGE_MASK;
     let mut cur = start_pc;
+    let mut sync_sensitive = false;
 
     loop {
         let raw = mem.read_u32(cur);
         let decoded = decode(raw, cur);
         let terminates = decoded.opcode.terminates_block();
+        sync_sensitive |= decoded.opcode.is_sync_sensitive();
         instrs.push(decoded);
 
         if terminates {
@@ -215,6 +225,7 @@ fn build_block(start_pc: u32, mem: &dyn MemoryAccess, page_version: u64) -> Deco
         end_pc,
         page_version,
         instrs,
+        sync_sensitive,
     }
 }
 
@@ -335,6 +346,40 @@ mod tests {
         assert_eq!(b.end_pc, 0x110);
     }
 
+    #[test]
+    fn sync_sensitive_flag_set_for_barrier_block() {
+        // A block containing `sync` (0x7C0004AC) must flag sync_sensitive
+        // so the superblock runner ends the chain there (cross-thread
+        // ordering point). `sync` does NOT terminate a block, so it sits
+        // mid-block followed by straight-line code up to a terminator.
+        let mem = BlockTestMem::new();
+        mem.put(0x100, enc_addi(3, 3, 1));
+        mem.put(0x104, 0x7C00_04AC); // sync
+        mem.put(0x108, enc_addi(3, 3, 1));
+        mem.put(0x10C, enc_b_self()); // terminator
+        let mut bc = BlockCache::new();
+        let b = bc.lookup_or_build(0x100, &mem);
+        assert!(
+            b.sync_sensitive,
+            "block containing `sync` must flag sync_sensitive; decoded last={:?}",
+            b.instrs.iter().map(|i| i.opcode).collect::<Vec<_>>()
+        );
+    }
+
+    #[test]
+    fn sync_sensitive_flag_clear_for_plain_block() {
+        // A straight-line ALU block with no reserved-op / barrier must
+        // NOT flag sync_sensitive (so the superblock runner is free to
+        // chain through it).
+        let mem = BlockTestMem::new();
+        mem.put(0x100, enc_addi(3, 3, 1));
+        mem.put(0x104, enc_addi(3, 3, 1));
+        mem.put(0x108, enc_b_self());
+        let mut bc = BlockCache::new();
+        let b = bc.lookup_or_build(0x100, &mem);
+        assert!(!b.sync_sensitive, "plain ALU block must not flag sync_sensitive");
+    }
+
     #[test]
     fn block_stops_at_page_boundary() {
         // Build from 0x1FFC. The next PC (0x2000) is in a different
diff --git a/crates/xenia-cpu/src/opcode.rs b/crates/xenia-cpu/src/opcode.rs
index e3361ec..7ccf120 100644
--- a/crates/xenia-cpu/src/opcode.rs
+++ b/crates/xenia-cpu/src/opcode.rs
@@ -204,6 +204,34 @@ impl PpcOpcode {
         )
     }
 
+    /// Returns true if this opcode is a cross-thread synchronization
+    /// point at which the superblock runner MUST yield back to the
+    /// round-robin scheduler so the lockstep interleaving stays
+    /// fine-grained enough to preserve correct cross-thread ordering:
+    ///
+    ///   - reserved load/store (`lwarx`/`ldarx`/`stwcx.`/`stdcx.`): the
+    ///     atomic primitive other threads race on. Running past one
+    ///     without returning to the scheduler would let a single slot
+    ///     win/lose a reservation across many blocks before any peer
+    ///     observes it.
+    ///   - memory barriers (`sync`/`eieio`/`isync`): the guest explicitly
+    ///     demands a global ordering point here; honour it by ending the
+    ///     superblock so the scheduler re-interleaves.
+    ///
+    /// Purely a function of the opcode (no guest data), so the yield
+    /// decision is deterministic and the schedule reproduces byte-identically.
+    /// Note: `sc` (syscall) and traps already `terminates_block`, and
+    /// import-thunk / halt-sentinel PCs are handled by the per-block
+    /// prologue re-check in the superblock loop — they are not listed here.
+    #[inline]
+    pub fn is_sync_sensitive(&self) -> bool {
+        matches!(
+            self,
+            Self::lwarx | Self::ldarx | Self::stwcx | Self::stdcx
+                | Self::sync | Self::eieio | Self::isync
+        )
+    }
+
     pub fn name(&self) -> &'static str {
         match self {
             Self::Invalid => "invalid",
diff --git a/crates/xenia-memory/src/heap.rs b/crates/xenia-memory/src/heap.rs
index 7e107cb..e500916 100644
--- a/crates/xenia-memory/src/heap.rs
+++ b/crates/xenia-memory/src/heap.rs
@@ -89,6 +89,14 @@ pub struct GuestMemory {
     mem_watch_addrs: Vec<u32>,
     /// Count of fires observed (for tests / hand-off telemetry).
     mem_watch_count: AtomicU64,
+    /// Monotonic count of MMIO accesses (every scalar load/store that
+    /// resolves to a registered MMIO region bumps this by 1). A pure,
+    /// deterministic function of guest execution — the superblock runner
+    /// samples it before/after each block to detect an MMIO touch and
+    /// end the run there (so MMIO ordering vs other HW threads stays at
+    /// the same fine lockstep granularity as before). Relaxed because the
+    /// lockstep path is single-threaded and only needs monotonicity.
+    mmio_access_count: AtomicU64,
 }
 
 /// Greatest common bit-mask such that `(a & m) == (b & m)` for every bit
@@ -133,9 +141,26 @@ impl GuestMemory {
             writes_total: AtomicU64::new(0),
             mem_watch_addrs: Vec::new(),
             mem_watch_count: AtomicU64::new(0),
+            mmio_access_count: AtomicU64::new(0),
         })
     }
 
+    /// Monotonic count of MMIO accesses since boot. Used by the superblock
+    /// runner to detect that a just-executed block touched MMIO (so it can
+    /// end the superblock there and keep MMIO ordering at lockstep
+    /// granularity). Deterministic function of guest execution.
+    #[inline]
+    pub fn mmio_access_count(&self) -> u64 {
+        self.mmio_access_count
+            .load(std::sync::atomic::Ordering::Relaxed)
+    }
+
+    #[inline]
+    fn bump_mmio_access(&self) {
+        self.mmio_access_count
+            .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
+    }
+
     /// Current version watermark for the page containing `addr`. Bumped by
     /// any write through `write_u8/16/32/64`. Not affected by MMIO writes
     /// (those don't touch the backing texture memory).
@@ -488,6 +513,7 @@ impl MemoryAccess for GuestMemory {
         // MMIO dispatch must come first — a byte read at an MMIO-mapped
         // address should invoke the callback, not the backing memory.
         if let Some(mmio) = self.find_mmio(addr) {
+            self.bump_mmio_access();
             return (mmio.read_callback)(addr) as u8;
         }
         if !self.is_mapped(addr) { return 0; }
@@ -498,6 +524,7 @@ impl MemoryAccess for GuestMemory {
     #[inline]
     fn read_u16(&self, addr: u32) -> u16 {
         if let Some(mmio) = self.find_mmio(addr) {
+            self.bump_mmio_access();
             (mmio.read_callback)(addr) as u16
         } else if !self.is_mapped(addr) {
             0
@@ -510,6 +537,7 @@ impl MemoryAccess for GuestMemory {
     #[inline]
     fn read_u32(&self, addr: u32) -> u32 {
         if let Some(mmio) = self.find_mmio(addr) {
+            self.bump_mmio_access();
             (mmio.read_callback)(addr)
         } else if !self.is_mapped(addr) {
             0
@@ -522,6 +550,7 @@ impl MemoryAccess for GuestMemory {
     #[inline]
     fn read_u64(&self, addr: u32) -> u64 {
         if let Some(mmio) = self.find_mmio(addr) {
+            self.bump_mmio_access();
             let hi = (mmio.read_callback)(addr) as u64;
             let lo = (mmio.read_callback)(addr.wrapping_add(4)) as u64;
             (hi << 32) | lo
@@ -537,6 +566,7 @@ impl MemoryAccess for GuestMemory {
         // MMIO dispatch first — a byte write at an MMIO-mapped address
         // must invoke the callback, not the backing memory.
         if let Some(mmio) = self.find_mmio(addr) {
+            self.bump_mmio_access();
             (mmio.write_callback)(addr, val as u32);
             return;
         }
@@ -555,6 +585,7 @@ impl MemoryAccess for GuestMemory {
 
     fn write_u16(&self, addr: u32, val: u16) {
         if let Some(mmio) = self.find_mmio(addr) {
+            self.bump_mmio_access();
             (mmio.write_callback)(addr, val as u32);
         } else if !self.is_mapped(addr) {
         } else {
@@ -577,6 +608,7 @@ impl MemoryAccess for GuestMemory {
 
     fn write_u32(&self, addr: u32, val: u32) {
         if let Some(mmio) = self.find_mmio(addr) {
+            self.bump_mmio_access();
             (mmio.write_callback)(addr, val);
         } else if !self.is_mapped(addr) {
         } else {
@@ -596,6 +628,7 @@ impl MemoryAccess for GuestMemory {
 
     fn write_u64(&self, addr: u32, val: u64) {
         if let Some(mmio) = self.find_mmio(addr) {
+            self.bump_mmio_access();
             (mmio.write_callback)(addr, (val >> 32) as u32);
             (mmio.write_callback)(addr.wrapping_add(4), val as u32);
         } else if !self.is_mapped(addr) {