ITERATE-2.V: scheduler priority aging closes 18-day AUDIT-049 wedge

Adds priority aging in xenia-cpu/scheduler.rs:pick_runnable (effective_priority = base + age_bonus(now_round - last_run_round), capped at +31, AGING_ROUNDS_PER_BONUS=1). Strict-priority scheduling was parking priority=0 threads behind the CPU-bound priority=15 audio mixer (sub_824D1328 guest spinwait at PC=0x824d1404 on CPU5). Aging eventually picks the starved thread, breaking the producer-consumer cycle that has caused the 5-tid wedge at PC=0x824ac578 since AUDIT-049 (10 May).

Cascade observed: tid=13 clean exit; events 121K -> 13M (107x); last host_ns 767ms -> 51,011ms (66x); 8 new threads spawn; VdSwap 1 -> 2.

Complete two-day iterate sequence (2026-05-27 -> 2026-05-28):
- 2.F: VdSwap drain timeout 900ms -> 1ms (xenia-gpu/handle.rs); 876x perf win on VdSwap kernel callback
- 2.H: vA0000000 physical heap bucket added (state.rs, exports.rs); ctx_ptrs now in 0xA0000000-0xBFFFFFFF range matching canary
- 2.L: Phase-A diff harness categorized [return_value mismatch], [status mismatch], [args_resolved.path mismatch] tags (tools/diff-events/diff_events.py); closes reading-error #41 (silent test-harness state leak invalidating trace diffs)
- 2.M: always-on exit-thread-state.json sibling to Phase-A JSONL (event_log.rs + xenia-app/main.rs); closes reading-error #42 (Phase-A blind to blocked-forever waits)
- 2.Q: signal.match kernel instrumentation in NtSetEvent / NtReleaseSemaphore / KeSetEvent / KeReleaseSemaphore (exports.rs); emits target_handle + waiter_count + waiter_tids
- 2.T: wake.requested kernel instrumentation in wake_eligible_waiters (exports.rs); emits target_tid + transition + new_state
- 2.V: scheduler priority aging (xenia-cpu/scheduler.rs) [keystone]

Plus accumulated WIP from earlier May (contention_manifest, phase_b_snapshot, xam/xaudio enhancements, analysis db, xex loader, xenia-app main loop, etc.). Audit-runs/ artifacts remain untracked per project convention.

Tests: 300 xenia-cpu / 227 xenia-kernel / 5 xenia-app / 19 xenia-path / 30+ smaller suites -- all PASS, 0 regressions.
Determinism preserved (2x cold runs bit-identical at 13,003,881 events post-2.V).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-29 07:27:26 +02:00
parent e6d43a23ac
commit ad45873a1b
50 changed files with 14389 additions and 506 deletions
--- a/crates/xenia-gpu/src/gpu_system.rs
+++ b/crates/xenia-gpu/src/gpu_system.rs
@@ -339,6 +339,23 @@ pub struct GpuSystem {
    /// `GpuSystem::new` and lives for the whole GPU lifetime — no
    /// per-frame churn.
    pub edram: crate::edram::ShadowEdram,
+    /// 256-entry `DC_LUT_30_COLOR` gamma ramp (10-bit BGR packed per entry).
+    /// Mirrors canary's `gamma_ramp_256_entry_table_` array on
+    /// `CommandProcessor` (`command_processor.cc:130-148`).  Pre-loaded
+    /// with the linear sRGB ramp at construction so any code path that
+    /// queries gamma before the guest writes its own ramp sees the same
+    /// initial values as canary.  MMIO read/write index handling for
+    /// `DC_LUT_RW_INDEX` is NOT yet wired in ours, so guests can't access
+    /// these bytes today; the field exists for state parity and to give
+    /// future MMIO handlers a populated buffer.
+    pub gamma_ramp_256: Vec<u32>,
+    /// 128-entry per-channel `DC_LUT_PWL_DATA` gamma ramp (base/delta pairs,
+    /// stored interleaved RGB → 384 u32 entries).  Layout matches
+    /// `gamma_ramp_pwl_rgb_[i][j]` in canary (`command_processor.cc:141-148`):
+    /// index = `i * 3 + j` where `i ∈ [0,128)` and `j ∈ {0,1,2}` for R/G/B.
+    /// Same status as `gamma_ramp_256`: state-parity only until MMIO
+    /// handlers are added.
+    pub gamma_ramp_pwl: Vec<u32>,
 }

 impl GpuSystem {
@@ -365,9 +382,47 @@ impl GpuSystem {
            last_resolve: None,
            texture_cache: crate::texture_cache::TextureCache::new(),
            edram: crate::edram::ShadowEdram::new(),
+            gamma_ramp_256: Self::default_gamma_ramp_256(),
+            gamma_ramp_pwl: Self::default_gamma_ramp_pwl(),
        }
    }

+    /// Build canary's default 256-entry sRGB linear ramp and return it as
+    /// a 256-element `Vec<u32>`.  Per `command_processor.cc:134-140`: for
+    /// each `i ∈ [0,256)` the 10-bit lane is `i * 0x3FF / 0xFF` (integer
+    /// division: 0 → 0, 255 → 0x3FF); the same lane fills all three
+    /// channels of one `DC_LUT_30_COLOR` u32, packed BGR-low to match
+    /// canary's `color_10_blue` / `green` / `red` field order.
+    fn default_gamma_ramp_256() -> Vec<u32> {
+        let mut v = Vec::with_capacity(256);
+        for i in 0..256u32 {
+            let lane = (i * 0x3FF) / 0xFF;
+            // Bit layout: blue[0..10] | green[10..20] | red[20..30]; bits 30..32 stay 0.
+            let entry = lane | (lane << 10) | (lane << 20);
+            v.push(entry);
+        }
+        v
+    }
+
+    /// Build canary's default 128-entry PWL ramp as 384 u32s, interleaved
+    /// RGB (index = `i * 3 + j`).  Per `command_processor.cc:141-148`: for
+    /// each `i ∈ [0,128)`, `base = (i * 0xFFFF / 0x7F) & ~0x3F` (low 6
+    /// bits cleared; 0 → 0, 127 → 0xFFC0) and `delta = 0x200`, except the
+    /// last entry (`i == 0x7F`) whose delta is 0.  Each `DC_LUT_PWL_DATA`
+    /// u32 holds `base` in the low 16 bits and `delta` in the high 16.
+    fn default_gamma_ramp_pwl() -> Vec<u32> {
+        let mut v = Vec::with_capacity(128 * 3);
+        for i in 0..128u32 {
+            let base = ((i * 0xFFFF) / 0x7F) & !0x3Fu32;
+            let delta: u32 = if i < 0x7F { 0x200 } else { 0 };
+            let entry = (base & 0xFFFF) | ((delta & 0xFFFF) << 16);
+            for _ in 0..3 {
+                v.push(entry);
+            }
+        }
+        v
+    }
+
    /// P8 — insert a shader blob + bump the FIFO so long-running games
    /// don't grow `shader_blobs` without bound. Caps at [`SHADER_BLOB_CAP`].
    /// Never evicts the currently-active VS/PS blobs (if they ended up at
--- a/crates/xenia-gpu/src/handle.rs
+++ b/crates/xenia-gpu/src/handle.rs
@@ -390,7 +390,17 @@ impl GpuBackend {
                // fires; the safety-net fallback warning fired twice for
                // each Sylpheed run.
                let target = s.mmio.cp_rb_wptr.load(Ordering::Acquire);
-                s.drain_until_wptr(mem, target, Duration::from_millis(900))
+                // GPUBUG-DRAIN-001 (iterate-2F, 2026-05-27): cap the inline
+                // drain at 1 ms so vd_swap does not block the main guest
+                // thread for ~900 ms per swap. Canary's `VdSwap_entry`
+                // returns in ~6.6 us — no synchronous drain. The 900 ms
+                // deadline parked tid=1 long enough to starve the post-swap
+                // worker fan-out at `sub_825070F0`, which in turn left
+                // tid=13's wait predicate unsatisfiable (wedge at
+                // PC=0x821CB1DC). Remaining packets stay queued in the
+                // ring; the next drain (next vd_swap or kernel-callback
+                // boundary) consumes them.
+                s.drain_until_wptr(mem, target, Duration::from_millis(1))
            }
            GpuBackend::Threaded(h) => {
                let target_wptr = h.mmio.cp_rb_wptr.load(Ordering::Acquire);
@@ -560,7 +570,12 @@ impl GpuWorker {
                        // empty (rptr == wptr after modulo) or a packet
                        // returns `Idle`/`Blocked`.
                        self.system.sync_with_mmio();
-                        let deadline = Instant::now() + Duration::from_millis(900);
+                        // GPUBUG-DRAIN-001 (iterate-2F, 2026-05-27): cap at
+                        // 1 ms so the CPU's `recv_timeout(1s)` returns
+                        // promptly. Canary doesn't synchronously drain in
+                        // VdSwap; mirroring that frees tid=1 to spawn
+                        // post-swap workers in time.
+                        let deadline = Instant::now() + Duration::from_millis(1);
                        while self.system.is_ready(&*memory) {
                            if Instant::now() >= deadline {
                                break;