xenia-app: observability subsystem, --parallel runtime, stress harness

observability.rs installs the tracing subscriber stack (env-filter +
JSON file appender + chrome trace + error layer) and the metrics
recorder shared by the workspace. main.rs grows the new CLI surface:
--parallel, --reservations-table, --trace-handles, --analyze=
{rust,sql,both}, xenia dis --json, --ui, plus the wiring that runs
the CPU through the new scheduler, drives the GPU's threaded backend,
and surfaces the framebuffer + HUD via xenia-ui.

Add tests/parallel_stress.rs (both forms are #[ignore]-gated: the
short form runs 20×@5M, the long form 100×@50M) and
tests/golden/sylpheed_n2m.json — the digest the lockstep/parallel
combos compare against.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
MechaCat02
2026-05-01 16:30:26 +02:00
parent b1285ba560
commit bae9305982
5 changed files with 3169 additions and 170 deletions

View File

@@ -0,0 +1,16 @@
{
"path": "/home/fabi/RE Project Sylpheed/Project Sylpheed - Arc of Deception (USA, Europe) (En,Ja).iso",
"instructions": 2000000,
"imports": 5634,
"unimpl": 0,
"packets": 0,
"draws": 0,
"swaps": 0,
"resolves": 0,
"unique_render_targets": 0,
"shader_blobs_live": 0,
"interrupts_delivered": 0,
"interrupts_dropped": 13,
"texture_cache_entries": 0,
"texture_decodes": 0
}

View File

@@ -0,0 +1,111 @@
//! M3 real-parallelism stress harness.
//!
//! Runs `xenia-rs exec <iso> -n <N> --parallel --halt-on-deadlock --quiet`
//! many times back-to-back to surface lost-wakeups, lock-order
//! inversions, and ABA hazards that a single run wouldn't reliably
//! reproduce. Failures dump per-run stdout/stderr to
//! `target/parallel-stress-<label>-NNN.{stdout,stderr}` for post-mortem.
//!
//! Two configurations:
//! - `parallel_stress_short`: 20 runs at -n 5_000_000. Quick smoke
//! check — runs in a few minutes on the current substrate.
//! - `parallel_stress_long` (ignored, opt-in): 100 runs at
//! -n 50_000_000. The full gate from the master plan; expected
//! runtime is hours until the perf gap (Step 05's deferred parking
//! fix) closes.
//!
//! Both tests are `#[ignore]`d, so neither runs in a plain `cargo test`.
//! Run the full 100x with `cargo test --release -p xenia-app --test
//! parallel_stress -- --ignored --nocapture parallel_stress_long`, or
//! the short variant with `cargo test --release -p xenia-app --test
//! parallel_stress -- --ignored --nocapture parallel_stress_short`.
use std::process::Command;
use std::time::Instant;
/// Fallback ISO location used when the `SYLPHEED_ISO` environment variable
/// cannot be read.
const ISO_DEFAULT: &str = "/home/fabi/RE Project Sylpheed/Project Sylpheed - Arc of Deception (USA, Europe) (En,Ja).iso";

/// Resolves the ISO the stress runs should execute.
///
/// Returns the value of `SYLPHEED_ISO` when `std::env::var` can read it;
/// on any error (variable unset or not valid Unicode) falls back to
/// `ISO_DEFAULT`.
fn iso_path() -> String {
    match std::env::var("SYLPHEED_ISO") {
        Ok(from_env) => from_env,
        Err(_) => ISO_DEFAULT.to_owned(),
    }
}
/// Runs the `xenia-rs` binary `runs` times back-to-back in `--parallel`
/// mode and asserts that every run exited successfully.
///
/// * `label` — tag used in every log line and in the dump file names.
/// * `runs` — number of sequential invocations.
/// * `max_instr` — value passed to the binary's `-n` flag.
///
/// The ISO is resolved via `iso_path()`. If that path does not exist the
/// function logs a "SKIPPING" message and returns without spawning anything,
/// so the calling test passes vacuously.
///
/// Pass/fail is decided solely by each run's exit status. For every failing
/// run, stdout and stderr are written (best effort) to
/// `target/parallel-stress-{label}-{run:03}.{stdout,stderr}`. After the last
/// run a p50/p95/max wall-time summary is printed to stderr.
///
/// # Panics
///
/// Panics if the binary cannot be spawned, or — via the final assertion — if
/// at least one run exited unsuccessfully.
fn run_stress(label: &str, runs: u32, max_instr: u64) {
    let bin = env!("CARGO_BIN_EXE_xenia-rs");
    let iso = iso_path();
    if !std::path::Path::new(&iso).exists() {
        eprintln!("{label}: iso not found at {iso}; set SYLPHEED_ISO to override. SKIPPING.");
        return;
    }

    // Failure dumps are best-effort: a filesystem problem must not mask the
    // stress result itself, but it should not go unnoticed either.
    if let Err(err) = std::fs::create_dir_all("target") {
        eprintln!("{label}: could not create target/ for failure dumps: {err}");
    }

    let mut failures: u32 = 0;
    let mut wall_ms: Vec<u128> = Vec::with_capacity(runs as usize);
    let max_instr_str = max_instr.to_string();

    for run in 1..=runs {
        let t0 = Instant::now();
        let out = Command::new(bin)
            .args([
                "exec",
                &iso,
                "-n",
                &max_instr_str,
                "--parallel",
                "--halt-on-deadlock",
                "--quiet",
            ])
            .output()
            .expect("failed to spawn xenia-rs");
        let dt = t0.elapsed().as_millis();
        wall_ms.push(dt);

        if out.status.success() {
            eprintln!("{label}: run {run}/{runs} ok (wall={dt}ms)");
            continue;
        }

        failures += 1;
        // Keep both captured streams of the failing run for post-mortem.
        for (ext, bytes) in [("stdout", &out.stdout), ("stderr", &out.stderr)] {
            let path = format!("target/parallel-stress-{label}-{run:03}.{ext}");
            if let Err(err) = std::fs::write(&path, bytes) {
                eprintln!("{label}: could not write {path}: {err}");
            }
        }
        eprintln!(
            "{label}: run {run}/{runs} FAILED (wall={}ms, exit={:?})",
            dt,
            out.status.code()
        );
    }

    wall_ms.sort_unstable();
    // `wall_ms` is empty when `runs == 0`; indexing it would panic, so only
    // compute percentiles when there is at least one sample.
    if let Some(&max) = wall_ms.last() {
        let p50 = wall_ms[wall_ms.len() / 2];
        let p95 = wall_ms[((wall_ms.len() - 1) * 95) / 100];
        eprintln!(
            "{label} summary: runs={runs} ok={} failed={failures} p50={p50}ms p95={p95}ms max={max}ms",
            runs - failures,
        );
    } else {
        eprintln!("{label} summary: runs=0 ok=0 failed=0 (no runs executed; no timing statistics)");
    }
    assert_eq!(failures, 0, "{label}: {failures} of {runs} stress runs failed");
}
/// Short stress configuration: 20 back-to-back runs at `-n 5_000_000`.
///
/// Intended as the session-feasible variant (roughly ten minutes at the
/// current perf level). Repetition is what exposes lost-wakeup, lock-order
/// and phaser-timeout bugs that a single run wouldn't reproduce. Ignored by
/// default; opt in with `--ignored`.
#[test]
#[ignore = "stress test; run via `cargo test ... -- --ignored parallel_stress_short`"]
fn parallel_stress_short() {
    let (runs, max_instr) = (20, 5_000_000);
    run_stress("short", runs, max_instr);
}
/// Long stress configuration: 100 back-to-back runs at `-n 50_000_000`.
///
/// This is the full M3 follow-up gate from the master plan. It is expected
/// to take hours until the perf gap closes. Ignored by default; opt in with
/// `--ignored`.
#[test]
#[ignore = "full stress test; run via `cargo test ... -- --ignored parallel_stress_long`"]
fn parallel_stress_long() {
    let (runs, max_instr) = (100, 50_000_000);
    run_stress("long", runs, max_instr);
}