ITERATE-2.V: scheduler priority aging closes 18-day AUDIT-049 wedge

Priority aging in xenia-cpu/scheduler.rs:pick_runnable (effective_priority = base + age_bonus(now_round - last_run_round), capped at +31, AGING_ROUNDS_PER_BONUS=1). Strict-priority scheduling was parking priority=0 threads behind the CPU-bound priority=15 audio mixer (sub_824D1328 guest spinwait at PC=0x824d1404 on CPU5). Aging eventually picks the starved thread, breaking the producer-consumer cycle that had caused the 5-tid wedge at PC=0x824ac578 since AUDIT-049 (10 May).

Cascade observed: tid=13 clean exit; events 121K -> 13M (107x); last host_ns 767ms -> 51,011ms (66x); 8 new threads spawn; VdSwap 1 -> 2.

Complete two-day iterate sequence (2026-05-27 -> 2026-05-28):
- 2.F: VdSwap drain timeout 900ms -> 1ms (xenia-gpu/handle.rs); 876x perf win on VdSwap kernel callback
- 2.H: vA0000000 physical heap bucket added (state.rs, exports.rs); ctx_ptrs now in 0xA0000000-0xBFFFFFFF range, matching canary
- 2.L: Phase-A diff harness now emits categorized [return_value mismatch], [status mismatch], [args_resolved.path mismatch] tags (tools/diff-events/diff_events.py); closes reading-error #41 (silent test-harness state leak invalidating trace diffs)
- 2.M: always-on exit-thread-state.json sibling to Phase-A JSONL (event_log.rs + xenia-app/main.rs); closes reading-error #42 (Phase-A blind to blocked-forever waits)
- 2.Q: signal.match kernel instrumentation in NtSetEvent / NtReleaseSemaphore / KeSetEvent / KeReleaseSemaphore (exports.rs); emits target_handle + waiter_count + waiter_tids
- 2.T: wake.requested kernel instrumentation in wake_eligible_waiters (exports.rs); emits target_tid + transition + new_state
- 2.V: scheduler priority aging (xenia-cpu/scheduler.rs) [keystone]

Plus accumulated WIP from earlier in May (contention_manifest, phase_b_snapshot, xam/xaudio enhancements, analysis db, xex loader, xenia-app main loop, etc.). Audit-runs/ artifacts remain untracked per project convention.

Tests: 300 xenia-cpu / 227 xenia-kernel / 5 xenia-app / 19 xenia-path / 30+ smaller suites -- all PASS, 0 regressions.
Determinism preserved (2x cold runs bit-identical at 13,003,881 events post-2.V).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-29 07:27:26 +02:00
parent e6d43a23ac
commit ad45873a1b
50 changed files with 14389 additions and 506 deletions
--- a/crates/xenia-cpu/src/interpreter.rs
+++ b/crates/xenia-cpu/src/interpreter.rs
@@ -117,17 +117,27 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -
            ctx.pc += 4;
        }
        PpcOpcode::addis => {
-            // Xbox 360 user mode is 32-bit ABI (MSR.SF=0), so addis must
-            // produce a value whose upper 32 bits don't pollute downstream
-            // 64-bit arithmetic. The PPC ISA in 64-bit mode sign-extends
-            // simm16 before the shift, producing 0xFFFFFFFF_xxxx0000 for
-            // negative simm16 (high bit set). When this value flows into
-            // a 64-bit subfc against a zero-extended lwz value, the unsigned
-            // 64-bit comparison yields wrong CA. Truncate to 32 bits to
-            // simulate 32-bit ABI behavior.
-            let ra_val = if instr.ra() == 0 { 0u64 } else { ctx.gpr[instr.ra()] };
-            let result = ra_val.wrapping_add((instr.simm16() as i64 as u64) << 16);
-            ctx.gpr[instr.rd()] = result as u32 as u64;
+            // Phase C+23: `addis` (and the `lis` simplified mnemonic) must
+            // sign-extend the shifted immediate to the full 64 bits before
+            // storing into the GPR, matching canary's HIR emitter
+            // (`InstrEmit_addis` in `ppc_emit_alu.cc`: `EXTS16(SI) << 16`
+            // as a 64-bit constant). Game code commonly builds a negative
+            // 32-bit value via `lis rN, 0xFFFB; ori rN, rN, 0x6C20`
+            // (yielding the i32 -300,000 for a 30ms `KeWait` timeout) and
+            // then stores it as a 64-bit doubleword via `std`. Without
+            // sign extension the high half on the wire was 0x00000000,
+            // turning the timeout into a positive ~4.3-billion-tick
+            // absolute deadline (~7 minutes) instead of a 30ms relative
+            // wait — surfacing as `wait.begin.timeout_ns=429466729600`
+            // on canary tid=12 → ours tid=7 idx=3 sister chain
+            // (cold-vs-cold C+22 baseline). Defensive 32-bit truncation
+            // for the arithmetic chain consumers (`subfcx`/`addex`/etc.)
+            // is already implemented at each consumer site (see PPCBUG-002/
+            // 007/etc.), so widening `addis` here does NOT regress them.
+            let ra_val = if instr.ra() == 0 { 0i64 } else { ctx.gpr[instr.ra()] as i64 };
+            let shifted = (instr.simm16() as i64) << 16;
+            let result = ra_val.wrapping_add(shifted);
+            ctx.gpr[instr.rd()] = result as u64;
            ctx.pc += 4;
        }
        PpcOpcode::addic => {
@@ -4934,6 +4944,92 @@ mod tests {
        assert_eq!(ctx.gpr[3], 0x10000);
    }

+    /// Phase C+23 regression: `addis rD, 0, neg_simm` (the `lis` form
+    /// with a negative immediate) must sign-extend the result to the
+    /// full 64 bits, matching canary's HIR emitter. Without this fix,
+    /// game code that builds a 32-bit negative value via
+    /// `lis r11, 0xFFFB; ori r11, r11, 0x6C20` and then stores the
+    /// result as a 64-bit doubleword via `std` would put 0x00000000
+    /// in the high half instead of the correct 0xFFFFFFFF, turning a
+    /// 30 ms relative `KeWaitForSingleObject` timeout into a positive
+    /// absolute deadline ~7 minutes away. Anchored by the cold-vs-cold
+    /// sister chain canary tid=12 → ours tid=7 idx=3 divergence.
+    #[test]
+    fn addis_with_negative_simm_sign_extends_to_64_bits() {
+        let mut ctx = PpcContext::new();
+        let mut mem = TestMem::new();
+        // addis r11, 0, 0xFFFB (lis r11, 0xFFFB); rA=0 means literal 0, not r0.
+        // op=15, rd=11, ra=0, simm=0xFFFB.
+        let raw = (15u32 << 26) | (11u32 << 21) | (0u32 << 16) | 0xFFFBu32;
+        write_instr(&mut mem, 0, raw);
+        ctx.pc = 0;
+        step(&mut ctx, &mut mem);
+        assert_eq!(
+            ctx.gpr[11], 0xFFFFFFFF_FFFB0000u64,
+            "addis with negative simm must sign-extend to 64 bits"
+        );
+    }
+
+    /// Phase C+23 regression: the full `lis + ori + std` sequence that
+    /// builds the −300,000 timeout tick count used by Sylpheed for its
+    /// 30 ms `KeWait` calls must produce 0xFFFFFFFFFFFB6C20 on the wire,
+    /// not 0x00000000FFFB6C20. This is the proximate cause of the
+    /// `wait.begin.timeout_ns = 429466729600` divergence on canary tid=12
+    /// → ours tid=7 idx=3 in the cold-vs-cold C+22 baseline.
+    #[test]
+    fn lis_ori_std_negative_timeout_writes_sign_extended_doubleword() {
+        let mut ctx = PpcContext::new();
+        let mut mem = TestMem::new();
+        // r1 = 0x100 (stack pointer surrogate). Storage slot at r1+8.
+        ctx.gpr[1] = 0x100;
+        // lis r11, 0xFFFB           ; r11 = 0xFFFFFFFFFFFB0000
+        let lis = (15u32 << 26) | (11u32 << 21) | (0u32 << 16) | 0xFFFBu32;
+        // ori r11, r11, 0x6C20      ; r11 = 0xFFFFFFFFFFFB6C20
+        // op=24 (ori): D-form encoding | rs(11) | ra(11) | uimm.
+        let ori = (24u32 << 26) | (11u32 << 21) | (11u32 << 16) | 0x6C20u32;
+        // std r11, 8(r1)            ; mem[0x108..0x110] = 0xFFFFFFFFFFFB6C20
+        // op=62, DS-form, ds_field=8>>2=2, xo=0.
+        let std_op = (62u32 << 26) | (11u32 << 21) | (1u32 << 16) | (8u32 & 0xFFFCu32);
+        write_instr(&mut mem, 0, lis);
+        write_instr(&mut mem, 4, ori);
+        write_instr(&mut mem, 8, std_op);
+        ctx.pc = 0;
+        step(&mut ctx, &mut mem); // lis
+        assert_eq!(ctx.gpr[11], 0xFFFFFFFF_FFFB0000u64);
+        step(&mut ctx, &mut mem); // ori
+        assert_eq!(ctx.gpr[11], 0xFFFFFFFF_FFFB6C20u64);
+        step(&mut ctx, &mut mem); // std
+        let stored = mem.read_u64(0x108);
+        assert_eq!(
+            stored, 0xFFFFFFFF_FFFB6C20u64,
+            "std must persist all 64 bits of the sign-extended GPR"
+        );
+        // Interpreting the stored doubleword as a 100ns NT TIMEOUT tick
+        // count: it must round-trip to −300,000 (30 ms relative wait),
+        // NOT to +4,294,667,296 (the C+22 broken value).
+        assert_eq!(stored as i64, -300_000i64);
+        assert_eq!((stored as i64).wrapping_mul(100), -30_000_000i64);
+    }
+
+    /// Phase C+23 regression: ensure `addis` against a non-zero rA still
+    /// performs the canonical Add with 64-bit semantics. Used by
+    /// arithmetic chains that combine a sign-extended `lis` high half
+    /// with a subsequent `addi` low half. Equivalent to canary's HIR
+    /// `Add(LoadGPR(rA), const_i64(simm << 16))`.
+    #[test]
+    fn addis_with_nonzero_ra_adds_in_64_bit() {
+        let mut ctx = PpcContext::new();
+        let mut mem = TestMem::new();
+        // r4 = 0x1234 already. addis r5, r4, 0xFFFE => r5 = r4 + (-2<<16)
+        //                                            = 0x1234 + 0xFFFFFFFFFFFE0000
+        ctx.gpr[4] = 0x1234;
+        let raw = (15u32 << 26) | (5u32 << 21) | (4u32 << 16) | 0xFFFEu32;
+        write_instr(&mut mem, 0, raw);
+        ctx.pc = 0;
+        step(&mut ctx, &mut mem);
+        assert_eq!(ctx.gpr[5], 0xFFFFFFFF_FFFE1234u64);
+    }
+
    #[test]
    fn test_lwz_stw() {
        let mut ctx = PpcContext::new();