Files
xenia-rs/crates/xenia-app/src/main.rs
MechaCat02 07068e7616 feat(audio): APUBUG-PRODUCER-001 — XAudio register driver client + opt-in callback ticker
Replace the three XAudio kernel-export stubs (Register/Unregister/SubmitFrame)
with canary-faithful implementations and add a periodic buffer-complete
callback ticker reusing the existing SavedCallbackCtx injection machinery.

Canary parity:
- xboxkrnl_audio.cc:56-93 — read callback_ptr[0..1], wrap callback_arg in a
  4-byte big-endian guest heap buffer (`wrapped_callback_arg`), write
  `0x4155_xxxx` to *driver_ptr.
- audio_system.cc:139-141 — guest callback receives r3 = wrapped pointer,
  not raw callback_arg.
- audio_driver.h:21-24 — frame rate 256 samples / 48 kHz ≈ 5.33 ms.

Implementation:
- New `crates/xenia-kernel/src/xaudio.rs` — `XAudioClient`, `XAudioState`
  (8-slot table, pending FIFO, dual-mode ticker), `XAUDIO_INSTR_PERIOD =
  48_000` (lockstep) and `XAUDIO_PERIOD = 5.333 ms` (--parallel), same
  pattern as KRNBUG-D08 v-sync.
- `try_inject_audio_callback` in xenia-app mirrors `try_inject_graphics_interrupt`,
  shares `interrupts.saved` slot for mutex with graphics callbacks.

Gating: ticker + injector run only when `--xaudio-tick` /
`XENIA_XAUDIO_TICK=1`. Default off because Sylpheed's audio callback
enters an infinite `KeWaitForSingleObject` loop on first invocation
(canary's host worker thread provides the buffer-completion fence we
don't model), which hijacks a guest HW thread and regresses
`swaps=2 → 1`. Default-off preserves the lockstep `sylpheed_n*m.json`
goldens exactly.

Producer hunt outcome (FALSIFIED for parked handles 0x1004/0x100c/0x15e4):
at `-n 500M --xaudio-tick` all 3 handles still show
`signal_attempts=0 (primary=0, ghost=0)`. Audio callback is not the
missing producer. Next candidate per audit-findings.md is Timer DPC
delivery (KeSetTimer / KeInsertQueueDpc).

Tests: 562 → 576 green (10 in `xaudio.rs`, 4 in `exports.rs`).
Lockstep `--stable-digest -n 100M` default-off: instructions=100000002,
swaps=2 (matches pre-change baseline byte-for-byte).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 19:50:22 +02:00

3690 lines
149 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
mod observability;
use std::collections::HashMap;
use std::path::PathBuf;
use std::time::Instant;
use anyhow::Result;
use clap::{Parser, Subcommand, ValueEnum};
use tracing::{debug, info, instrument, warn};
use xenia_kernel::ModuleId;
use xenia_memory::MemoryAccess;
#[derive(Parser)]
#[command(name = "xenia-rs")]
#[command(about = "Xbox 360 emulator for reverse engineering and preservation")]
struct Cli {
/// Force JSON formatting on the console log layer (default is compact text)
#[arg(long, global = true)]
log_json: bool,
/// Additionally write logs to this file. `.json` extension selects JSON;
/// anything else uses text. Uses a non-blocking appender.
#[arg(long, global = true, value_name = "PATH")]
log_file: Option<PathBuf>,
/// Override tracing filter (same syntax as `RUST_LOG`). Also settable via env.
#[arg(long, global = true, value_name = "DIRECTIVES")]
log_filter: Option<String>,
/// Emit a Chrome `about:tracing` JSON trace of all spans to this path.
#[arg(long, global = true, value_name = "PATH")]
trace_chrome: Option<PathBuf>,
/// Start the pprof sampling profiler at 100 Hz. Extension `.svg` writes a
/// flamegraph; `.pb` writes a pprof protobuf. Requires the `profiling`
/// Cargo feature (enabled by default).
#[arg(long, global = true, value_name = "PATH")]
profile: Option<PathBuf>,
#[command(subcommand)]
command: Commands,
}
/// `dis --analyze` modes. Rust passes are the default and always functional;
/// SQL views are an additive surface for relational queries.
#[derive(Copy, Clone, Debug, PartialEq, Eq, ValueEnum)]
enum AnalyzeMode {
/// Rust passes only (`func.rs`, `xref.rs`). No SQL views.
Rust,
/// Rust passes + additive SQL views (`v_branch_xrefs`, `v_call_graph`,
/// `v_reachability_from_entry`, `v_function_first_instruction`,
/// `v_imports_called`).
Sql,
/// Same as `sql`, plus a Rust-vs-SQL cross-check on branch xrefs at
/// the end. Disagreement is logged as a warning (non-fatal).
Both,
}
#[derive(Subcommand)]
enum Commands {
/// Disassemble a XEX file from its entry point (or an arbitrary address via `--at`)
Disasm {
/// Path to XEX file
path: String,
/// Number of instructions to disassemble
#[arg(short = 'n', default_value = "64")]
count: usize,
/// Start address (hex with or without `0x` prefix). Defaults to
/// the XEX entry point. Must fall inside the loaded image range.
///
/// Example: `--at 0x824be9a0` to inspect a graphics-interrupt callback.
#[arg(long, value_parser = parse_hex_u32)]
at: Option<u32>,
},
/// Load and execute a XEX file with tracing
Exec {
/// Path to XEX file
path: String,
/// Maximum instructions to execute before stopping (unlimited if omitted)
#[arg(short = 'n')]
max_instructions: Option<u64>,
/// Throttle execution to at most N instructions per second (unlimited if omitted)
#[arg(long)]
ips_limit: Option<u64>,
/// SQLite database to write to. Includes the full static analysis
/// that `dis --db` would produce, plus any opt-in trace tables.
#[arg(long)]
db: Option<String>,
/// Log each executed instruction to the `exec_trace` table
#[arg(long)]
trace_instructions: bool,
/// Log kernel/import calls to the `import_calls` table
#[arg(long)]
trace_imports: bool,
/// Log taken branches (calls, returns, jumps) to the `branch_trace` table
#[arg(long)]
trace_branches: bool,
/// Suppress banners, kernel-call logs, and final register dump
/// (only errors, faults, halts, and the summary line are printed)
#[arg(long)]
quiet: bool,
/// Open a window (winit + wgpu) for interactive dynamic analysis:
/// presents the guest frontbuffer on each VdSwap, polls a host
/// gamepad (gilrs) into XamInputGetState, and renders a live HUD
/// with swap/input stats. Runs the CPU interpreter on a worker
/// thread; the window hosts the event loop on the main thread.
#[arg(long)]
ui: bool,
/// Halt immediately when every live HW thread is blocked on a handle
/// wait and no timer is pending, dumping a per-thread diagnostic
/// (state / PC / LR / waiter handles) instead of force-waking the
/// waiters with STATUS_TIMEOUT. Default is to force-wake so probe
/// runs survive early deadlocks; flip this when investigating the
/// deadlock itself. Also settable via `XENIA_HALT_ON_DEADLOCK=1`.
#[arg(long)]
halt_on_deadlock: bool,
/// Enable per-handle signal/wait/wake audit trail. Diagnostic only:
/// at end-of-run, `dump_thread_diagnostic` prints, for every kernel
/// handle, the bounded ring of recent create/wait/signal/wake events
/// (who fired what, when, with what status). Used to identify
/// missing-signal deadlocks (a handle with waiters but zero signals
/// is the smoking gun for a missing kernel API). Also settable via
/// `XENIA_TRACE_HANDLES=1`.
#[arg(long)]
trace_handles: bool,
/// Comma-separated list of handles to capture **signal attempts**
/// for, even when no `record_create` exists. Distinguishes "guest
/// never called Nt/KeSetEvent on this handle" (signal_attempts=0)
/// from "signal landed but waiter wasn't woken" (>0). Implies
/// `--trace-handles`. Examples:
/// `--trace-handles-focus=0x1004,0x100c,0x15e4,0x42450b5c`
/// `--trace-handles-focus=4100` (decimal; 0x1004)
#[arg(long)]
trace_handles_focus: Option<String>,
/// Force the threaded GPU backend (default at M1.9). Kept for
/// explicit scripted runs; equivalent to omitting both
/// `--gpu-inline` and `--ui`. Settable via `XENIA_GPU_THREAD=1`.
#[arg(long)]
gpu_thread: bool,
/// **M1.9 rollback flag.** Forces the legacy inline GPU backend
/// (synchronous `kernel.gpu.execute_one` on the CPU thread).
/// Use this if the threaded backend regresses; it preserves
/// pre-M1 behavior bit-for-bit. Settable via
/// `XENIA_GPU_INLINE=1`. `--ui` implies `--gpu-inline` until the
/// UI worker is migrated to the Arc-shared mem model.
#[arg(long)]
gpu_inline: bool,
/// **M2.8 — enable the inter-thread reservation table for
/// `lwarx`/`stwcx.`.** Under M3's per-HW-thread parallelism this
/// is the substrate that mediates concurrent `lwarx` claims; in
/// M2's single-threaded mode it's a no-op (the legacy
/// per-`PpcContext` reservation fields drive observable
/// behavior). Settable via `XENIA_RESERVATIONS_TABLE=1`. Default
/// off; M3 flips to default-on for the parallel host-thread
/// path. Verifies the golden digest is unchanged under lockstep
/// mode with the table activated.
#[arg(long)]
reservations_table: bool,
/// **M3 — spawn HW-thread host workers.** When set, wraps
/// `KernelState` in `Arc<Mutex<...>>` and runs the interpreter
/// loop on a dedicated worker thread (the coarse-locked first
/// step toward per-HW-thread parallelism). Lockstep behavior
/// preserved: the worker holds the lock for the whole run, so
/// observable execution is identical. Settable via
/// `XENIA_PARALLEL=1`.
#[arg(long)]
parallel: bool,
/// **APUBUG-PRODUCER-001 diagnostic.** When set, the round
/// prologue ticks the XAudio buffer-complete callback at the
/// audio frame rate and injects via the same SavedCallbackCtx
/// machinery as graphics interrupts. Default off — firing
/// shifts the boot trajectory enough to regress `swaps=2→1`
/// under Sylpheed, which would break the lockstep
/// `sylpheed_n*m.json` goldens. Settable via
/// `XENIA_XAUDIO_TICK=1`.
#[arg(long)]
xaudio_tick: bool,
},
/// Browse XISO disc image contents
Browse {
/// Path to XISO file
path: String,
},
/// Display XEX header information
Info {
/// Path to XEX file
path: String,
},
/// Extract PE image and metadata from a XEX file
Extract {
/// Path to XEX or ISO file
path: String,
/// Output directory (default: same directory as input)
#[arg(short, long)]
output: Option<String>,
/// Write base tables (metadata, sections, imports) to a SQLite database
#[arg(long)]
db: Option<String>,
},
/// Full disassembly with function detection, cross-references, and optional database
Dis {
/// Path to XEX or ISO file
path: String,
/// Output .asm file (default: stdout)
#[arg(short, long)]
output: Option<String>,
/// Output SQLite database (also includes the base extract tables)
#[arg(long)]
db: Option<String>,
/// Output JSON Lines file: one structured row per instruction with
/// section/function/label/branch_target columns. Suitable for
/// `jq`, pandas, or DuckDB's `read_json_auto`.
#[arg(long)]
json: Option<String>,
/// Choose how analysis tables are produced when `--db` is set.
///
/// - `rust` (default): only the Rust passes (`func.rs`, `xref.rs`)
/// populate `functions`/`labels`/`xrefs`. No SQL views.
/// - `sql`: Rust passes still run (function detection and data-ref
/// resolution are Rust-only by design); additive SQL views
/// (`v_branch_xrefs`, `v_call_graph`, `v_reachability_from_entry`,
/// `v_function_first_instruction`, `v_imports_called`) are
/// created on top of the same tables.
/// - `both`: same as `sql`, plus a Rust-vs-SQL cross-check on
/// branch xrefs. Disagreement is logged as a warning (non-fatal).
#[arg(long, value_enum, default_value_t = AnalyzeMode::Rust)]
analyze: AnalyzeMode,
/// Suppress assembly text output (DB-only mode)
#[arg(long)]
quiet: bool,
},
/// P8 — deterministic run digest: execute `n` instructions and print a
/// JSON summary of stats (instructions, draws, swaps, resolves,
/// unique shaders/RTs, interrupts). Drop-in regression detector.
Check {
/// Path to XEX or ISO file
path: String,
/// Instructions to execute before computing the digest
#[arg(short = 'n', default_value = "2000000")]
max_instructions: u64,
/// Write the digest as JSON to this path (stdout if omitted)
#[arg(long)]
out: Option<String>,
/// Optional golden digest JSON; `check` exits non-zero on mismatch
#[arg(long)]
expect: Option<String>,
/// Emit/compare only stable fields (excludes timing-sensitive counters
/// like `packets`, `interrupts_delivered`, `resolves`). Required for any
/// golden captured under `--parallel`; recommended for lockstep goldens
/// at -n ≥50M because `packets` has empirical ±28% jitter from a GPU
/// thread race.
#[arg(long)]
stable_digest: bool,
/// Force the threaded GPU backend (default at M1.9).
#[arg(long)]
gpu_thread: bool,
/// **M1.9 rollback flag.** Forces the legacy inline GPU backend.
/// Use this if the threaded backend regresses.
#[arg(long)]
gpu_inline: bool,
/// M2.8 — enable the inter-thread reservation table.
#[arg(long)]
reservations_table: bool,
/// M3 — spawn HW-thread host workers under coarse Arc<Mutex<KernelState>>.
#[arg(long)]
parallel: bool,
/// APUBUG-PRODUCER-001 diagnostic — enable the XAudio buffer-complete
/// callback ticker. Default off; see the `Exec --xaudio-tick` doc for
/// rationale.
#[arg(long)]
xaudio_tick: bool,
},
}
fn main() -> Result<()> {
let cli = Cli::parse();
// Bump default log level to `warn` for quiet exec runs so kernel-call
// tracing::info! spam is filtered out. RUST_LOG / --log-filter still win.
let exec_quiet = matches!(&cli.command, Commands::Exec { quiet: true, .. });
// Default filter: keep xenia-rs logs, silence noisy third-party crates.
// `wgpu*` / `naga` emit many harmless WARNs (unrecognized present modes,
// GLES-backend config probes, etc.) — we push them to ERROR by default.
// Override via `--log-filter` or `RUST_LOG`, e.g.
// `--log-filter='info,wgpu_core=trace'` during bring-up.
let default_level: &'static str = if exec_quiet {
"warn,\
wgpu=off,wgpu_core=off,wgpu_hal=off,naga=off,\
winit=off,gilrs=off,gilrs_core=off,\
mio=off,polling=off,sctk=off,smithay_client_toolkit=off,\
calloop=off"
} else {
"info,\
wgpu=error,wgpu_core=error,wgpu_hal=error,naga=error,\
winit=warn,gilrs=warn,gilrs_core=warn,\
mio=warn,polling=warn,sctk=warn,smithay_client_toolkit=warn,\
calloop=warn"
};
let config = observability::ObservabilityConfig {
log_json: cli.log_json,
log_file: cli.log_file.clone(),
log_filter: cli.log_filter.clone(),
default_level,
trace_chrome: cli.trace_chrome.clone(),
profile: cli.profile.clone(),
};
// Hold the guards for the whole program; their Drop flushes appenders,
// finalises the Chrome trace, writes the pprof report, and prints the
// metrics summary.
let _obs = observability::init(&config)?;
let result = match cli.command {
Commands::Disasm { path, count, at } => cmd_disasm(&path, count, at),
Commands::Exec {
path,
max_instructions,
ips_limit,
db,
trace_instructions,
trace_imports,
trace_branches,
quiet,
ui,
halt_on_deadlock,
trace_handles,
trace_handles_focus,
gpu_thread,
gpu_inline,
reservations_table,
parallel,
xaudio_tick,
} => cmd_exec(
&path,
max_instructions,
ips_limit,
db.as_deref(),
trace_instructions,
trace_imports,
trace_branches,
quiet,
ui,
halt_on_deadlock,
trace_handles,
trace_handles_focus.as_deref(),
gpu_thread,
gpu_inline,
reservations_table,
parallel,
xaudio_tick,
),
Commands::Browse { path } => cmd_browse(&path),
Commands::Info { path } => cmd_info(&path),
Commands::Extract { path, output, db } => cmd_extract(&path, output.as_deref(), db.as_deref()),
Commands::Dis { path, output, db, json, analyze, quiet } => cmd_dis(&path, output.as_deref(), db.as_deref(), json.as_deref(), analyze, quiet),
Commands::Check {
path,
max_instructions,
out,
expect,
stable_digest,
gpu_thread,
gpu_inline,
reservations_table,
parallel,
xaudio_tick,
} => cmd_check(
&path,
max_instructions,
out.as_deref(),
expect.as_deref(),
stable_digest,
gpu_thread,
gpu_inline,
reservations_table,
parallel,
xaudio_tick,
),
};
match result {
Ok(()) => Ok(()),
Err(e) => Err(observability::attach_span_trace(e)),
}
}
/// Load XEX data from a path. If the path is an ISO, extract default.xex from it.
#[instrument(skip_all, fields(path = %path))]
fn load_xex_data(path: &str) -> Result<Vec<u8>> {
let lower = path.to_lowercase();
if lower.ends_with(".iso") || lower.ends_with(".xiso") {
use xenia_vfs::VfsDevice;
info!("detected disc image, extracting default.xex");
let disc = xenia_vfs::disc_image::DiscImageDevice::open("disc", std::path::Path::new(path))
.map_err(|e| anyhow::anyhow!("Failed to open disc image: {}", e))?;
disc.read_file("default.xex")
.map_err(|e| anyhow::anyhow!("Failed to extract default.xex from disc image: {}", e))
} else {
Ok(std::fs::read(path)?)
}
}
#[instrument(skip_all, fields(path = %path))]
fn cmd_info(path: &str) -> Result<()> {
let started = Instant::now();
let data = load_xex_data(path)?;
let header = xenia_xex::loader::parse_xex2_header(&data)?;
println!("=== XEX2 Header ===");
println!("Magic: {:#010x}", header.magic);
println!("Module Flags: {:#010x}", header.module_flags);
println!("Header Size: {:#x}", header.header_size);
println!("Headers: {}", header.header_count);
if let Some(entry) = xenia_xex::loader::get_entry_point(&header) {
println!("Entry Point: {:#010x}", entry);
}
if let Some(base) = xenia_xex::loader::get_image_base(&header) {
println!("Image Base: {:#010x}", base);
}
println!("\n=== Optional Headers ===");
for h in &header.optional_headers {
println!(" Key: {:#010x} Value: {:#010x}", h.key, h.value);
}
if let Some(ref sec) = header.security_info {
println!("\n=== Security Info ===");
println!("Image Size: {:#x}", sec.image_size);
println!("Load Address: {:#010x}", sec.load_address);
println!("Image Flags: {:#010x}", sec.image_flags);
println!("Page Descs: {}", sec.page_descriptors.len());
}
if let Some(ref ffi) = header.file_format_info {
println!("\n=== File Format ===");
println!("Encryption: {}", match ffi.encryption_type {
0 => "None", 1 => "Normal (AES)", _ => "Unknown"
});
println!("Compression: {}", match ffi.compression_type {
0 => "None", 1 => "Basic", 2 => "Normal (LZX)", _ => "Unknown"
});
if !ffi.basic_blocks.is_empty() {
println!("Basic blocks: {}", ffi.basic_blocks.len());
}
if ffi.normal_window_size != 0 {
println!("LZX Window: {:#x}", ffi.normal_window_size);
}
}
if let Some(ref name) = header.original_pe_name {
println!("\nOriginal PE: {}", name);
}
if let Some(ref ei) = header.execution_info {
println!("\n=== Execution Info ===");
println!("Title ID: {:#010x}", ei.title_id);
println!("Media ID: {:#010x}", ei.media_id);
println!("Disc: {} of {}", ei.disc_number, ei.disc_count);
}
if !header.import_libraries.is_empty() {
println!("\n=== Import Libraries ===");
for lib in &header.import_libraries {
println!(" {} (v{:#010x}, {} imports)", lib.name, lib.version_cur, lib.imports.len());
}
}
info!(wall_ms = started.elapsed().as_millis() as u64, "info complete");
Ok(())
}
/// Clap parser for `--at` — accepts decimal, 0x-prefixed hex, or bare hex.
fn parse_hex_u32(s: &str) -> Result<u32, String> {
let t = s.trim();
let (digits, radix) = if let Some(rest) = t.strip_prefix("0x").or_else(|| t.strip_prefix("0X")) {
(rest, 16)
} else if t.chars().all(|c| c.is_ascii_digit()) {
(t, 10)
} else {
(t, 16)
};
u32::from_str_radix(digits, radix)
.map_err(|e| format!("invalid u32 {:?}: {e} (try `0x824be9a0`)", t))
}
#[instrument(skip_all, fields(path = %path, count))]
fn cmd_disasm(path: &str, count: usize, at: Option<u32>) -> Result<()> {
let started = Instant::now();
let data = load_xex_data(path)?;
let header = xenia_xex::loader::parse_xex2_header(&data)?;
let entry = xenia_xex::loader::get_entry_point(&header)
.ok_or_else(|| anyhow::anyhow!("No entry point found in XEX2 header"))?;
let base = xenia_xex::loader::get_image_base(&header)
.ok_or_else(|| anyhow::anyhow!("No image base found in XEX2 header"))?;
info!(entry = format_args!("{:#010x}", entry), base = format_args!("{:#010x}", base), "XEX entry/base");
let image_data = xenia_xex::loader::load_image(&data, &header)?;
info!(bytes = image_data.len(), "image decompressed");
let start = at.unwrap_or(entry);
let label = if at.is_some() { "requested address" } else { "entry point" };
println!("Disassembly from {} {:#010x} ({} instructions):\n", label, start, count);
if start < base {
return Err(anyhow::anyhow!(
"address {:#x} is below image base {:#x}",
start,
base
));
}
let offset = (start - base) as usize;
if offset + count * 4 > image_data.len() {
return Err(anyhow::anyhow!(
"address {:#x} (offset {:#x}) + {} instructions extends past image end ({:#x} bytes)",
start,
offset,
count,
image_data.len()
));
}
let block = xenia_cpu::disasm::disassemble_block(&image_data[offset..], start, count);
for (addr, text) in block {
println!(" {:#010x}: {}", addr, text);
}
info!(wall_ms = started.elapsed().as_millis() as u64, "disasm complete");
Ok(())
}
#[instrument(skip_all, fields(path = %path, ui))]
fn cmd_exec(
path: &str,
max_instructions: Option<u64>,
ips_limit: Option<u64>,
db_path: Option<&str>,
trace_instructions: bool,
trace_imports: bool,
trace_branches: bool,
quiet: bool,
ui: bool,
halt_on_deadlock: bool,
trace_handles: bool,
trace_handles_focus: Option<&str>,
gpu_thread: bool,
gpu_inline: bool,
reservations_table: bool,
parallel: bool,
xaudio_tick: bool,
) -> Result<()> {
cmd_exec_inner(
path,
max_instructions,
ips_limit,
db_path,
trace_instructions,
trace_imports,
trace_branches,
quiet,
ui,
halt_on_deadlock,
trace_handles,
trace_handles_focus,
gpu_thread,
gpu_inline,
reservations_table,
parallel,
xaudio_tick,
None,
None,
false,
)
}
/// P8 — drive a deterministic run and emit a JSON digest of key counters.
/// Shares `cmd_exec`'s setup path so the digest reflects a real execution.
fn cmd_check(
path: &str,
max_instructions: u64,
out: Option<&str>,
expect: Option<&str>,
stable_digest: bool,
gpu_thread: bool,
gpu_inline: bool,
reservations_table: bool,
parallel: bool,
xaudio_tick: bool,
) -> Result<()> {
cmd_exec_inner(
path,
Some(max_instructions),
None,
None,
false,
false,
false,
true, // quiet — suppress banners
false, // ui — headless
false, // halt_on_deadlock — keep the force-wake default for probe/check runs
false, // trace_handles — diagnostic only, never wanted on golden runs
None, // trace_handles_focus — same
gpu_thread,
gpu_inline,
reservations_table,
parallel,
xaudio_tick,
out,
expect,
stable_digest,
)
}
fn cmd_exec_inner(
path: &str,
max_instructions: Option<u64>,
ips_limit: Option<u64>,
db_path: Option<&str>,
trace_instructions: bool,
trace_imports: bool,
trace_branches: bool,
quiet: bool,
ui: bool,
halt_on_deadlock: bool,
trace_handles: bool,
trace_handles_focus: Option<&str>,
gpu_thread: bool,
gpu_inline: bool,
reservations_table: bool,
parallel: bool,
xaudio_tick: bool,
digest_out: Option<&str>,
digest_expect: Option<&str>,
stable_digest: bool,
) -> Result<()> {
let started = Instant::now();
let data = load_xex_data(path)?;
let mut header = xenia_xex::loader::parse_xex2_header(&data)?;
let entry = xenia_xex::loader::get_entry_point(&header)
.ok_or_else(|| anyhow::anyhow!("No entry point found"))?;
let base = xenia_xex::loader::get_image_base(&header)
.ok_or_else(|| anyhow::anyhow!("No image base found"))?;
if let Some(ref ffi) = header.file_format_info {
info!(
compression = match ffi.compression_type {
0 => "none", 1 => "basic", 2 => "normal (LZX)", _ => "unknown"
},
encryption = match ffi.encryption_type {
0 => "none", 1 => "normal (AES)", _ => "unknown"
},
"XEX file format"
);
}
for lib in &header.import_libraries {
debug!(name = %lib.name, imports = lib.imports.len(), "import library");
}
info!(
entry = format_args!("{:#010x}", entry),
base = format_args!("{:#010x}", base),
"loading XEX"
);
// Allocate guest memory
let mut mem = xenia_memory::GuestMemory::new()
.map_err(|e| anyhow::anyhow!("Failed to allocate guest memory: {}", e))?;
// Load and decompress the XEX image
let image_data = xenia_xex::loader::load_image(&data, &header)?;
// Resolve import ordinals from PE image
xenia_xex::loader::resolve_imports(&mut header, &image_data);
let alloc_size = ((image_data.len() + 4095) & !4095) as u32;
let rw = xenia_memory::page_table::MemoryProtect::READ
| xenia_memory::page_table::MemoryProtect::WRITE;
mem.alloc(base, alloc_size, rw)
.map_err(|e| anyhow::anyhow!("Failed to allocate guest memory region: {}", e))?;
mem.write_bulk(base, &image_data);
// ── Phase 1: Build import thunk map ──────────────────────────────────
let mut thunk_map: HashMap<u32, (ModuleId, u16, String)> = HashMap::new();
// Reverse map for `XexGetProcedureAddress` (drained into kernel below).
let mut thunk_addr_map: Vec<(ModuleId, u16, u32)> = Vec::new();
for lib in &header.import_libraries {
let module = match lib.name.as_str() {
"xboxkrnl.exe" => ModuleId::Xboxkrnl,
"xam.xex" => ModuleId::Xam,
_ => continue,
};
for imp in &lib.imports {
if imp.record_type == 1 {
let name = xenia_analysis::resolve_ordinal(&lib.name, imp.ordinal)
.map(|s| s.to_string())
.unwrap_or_else(|| format!("ordinal_{:#06X}", imp.ordinal));
thunk_map.insert(imp.address, (module, imp.ordinal, name));
thunk_addr_map.push((module, imp.ordinal, imp.address));
}
}
}
info!(thunks = thunk_map.len(), "import thunks mapped");
// ── Phase 2: CPU initialization per xenia-canary ─────────────────────
// Allocate stack (1MB at 0x70000000)
let stack_base = 0x7000_0000u32;
let stack_size = 0x10_0000u32;
mem.alloc(stack_base, stack_size, rw)
.map_err(|e| anyhow::anyhow!("Failed to allocate stack: {}", e))?;
// Allocate PCR (Processor Control Region) and TLS
let pcr_addr = 0x7FFF_0000u32;
let tls_addr = 0x7FFE_0000u32;
mem.alloc(pcr_addr, 0x1000, rw)?;
mem.alloc(tls_addr, 0x1000, rw)?;
// Initialize PCR structure
mem.write_u32(pcr_addr, tls_addr); // PCR->tls_ptr
mem.write_u32(pcr_addr + 0x100, 0x1000); // PCR->current_thread (fake)
mem.write_u32(pcr_addr + 0x150, 0); // PCR->dpc_active
// Set up CPU context per xenia-canary/cpu/thread_state.cc.
//
// `PpcContext::new` already initializes LR = 0xBCBC_BCBC (halt sentinel),
// VSCR NJ bit, and VRSAVE = 0xFFFFFFFF to match canary.
let mut ctx = xenia_cpu::PpcContext::new();
ctx.pc = entry;
// Stack pointer: 16-byte aligned, leaving 256 bytes of reservation above
// the top so the entry's prolog has room for its linkage area without
// wandering off the allocated region.
let sp_top = (stack_base + stack_size) as u64;
ctx.gpr[1] = (sp_top - 0x100) & !0xFu64;
ctx.gpr[2] = 0x2000_0000; // RTOC (TOC base)
// r3..r7 carry the XEX "start_context" in canary; for a bare entry (no
// launch data from the kernel), canary passes 0. Explicitly zero them so
// we don't inherit whatever PpcContext::new happens to leave.
for r in 3..=7 { ctx.gpr[r] = 0; }
ctx.gpr[13] = pcr_addr as u64; // PCR/TLS pointer (r13)
ctx.msr = 0x9030; // Hardware-dumped MSR
// ── Phase 4: Set up kernel ───────────────────────────────────────────
//
// M1.9 — default GPU backend is now **threaded**. Rollback rules:
// 1. `--gpu-inline` / `XENIA_GPU_INLINE=1` → force inline.
// 2. `--ui` → force inline (the UI
// worker thread doesn't yet share `Arc<GuestMemory>` with the
// GPU worker; cohabitation panics in `cmd_exec_inner` below).
// 3. `--gpu-thread` / `XENIA_GPU_THREAD=1` → force threaded
// (explicit override).
// 4. Otherwise: threaded.
let env_inline = std::env::var("XENIA_GPU_INLINE")
.ok()
.is_some_and(|v| {
let v = v.trim().to_ascii_lowercase();
v == "1" || v == "true" || v == "yes"
});
let env_thread = std::env::var("XENIA_GPU_THREAD")
.ok()
.is_some_and(|v| {
let v = v.trim().to_ascii_lowercase();
v == "1" || v == "true" || v == "yes"
});
let force_inline = gpu_inline || env_inline || ui;
let force_thread = gpu_thread || env_thread;
let use_threaded = if force_inline {
false
} else if force_thread {
true
} else {
true // M1.9 default
};
let shutdown_arc =
std::sync::Arc::new(std::sync::atomic::AtomicBool::new(false));
let (mut maybe_gpu_worker, gpu_backend): (
Option<xenia_gpu::GpuWorker>,
xenia_gpu::GpuBackend,
) = if use_threaded {
let (worker, handle) = xenia_gpu::GpuSystem::new()
.into_handle_with_shutdown(shutdown_arc.clone());
(Some(worker), xenia_gpu::GpuBackend::Threaded(handle))
} else {
(
None,
xenia_gpu::GpuBackend::Inline(xenia_gpu::GpuSystem::new()),
)
};
let mut kernel = xenia_kernel::KernelState::with_gpu(gpu_backend);
kernel.image_base = base;
// Drain the reverse thunk map into the kernel so `XexGetProcedureAddress`
// can resolve ordinals back to callable thunk addresses.
for (module, ordinal, addr) in thunk_addr_map.drain(..) {
kernel.register_thunk(module, ordinal, addr);
}
// M2.8 — flip the reservation-table flag if requested. Either
// `--reservations-table` or `XENIA_RESERVATIONS_TABLE=1`. The flag
// is observable via `kernel.reservations_enabled.load(Acquire)`;
// M3 will hook it into the interpreter's `lwarx`/`stwcx.` arms.
let reservations_via_env = std::env::var("XENIA_RESERVATIONS_TABLE")
.ok()
.is_some_and(|v| {
let v = v.trim().to_ascii_lowercase();
v == "1" || v == "true" || v == "yes"
});
let parallel_via_env = std::env::var("XENIA_PARALLEL")
.ok()
.is_some_and(|v| {
let v = v.trim().to_ascii_lowercase();
v == "1" || v == "true" || v == "yes"
});
let parallel_active = parallel || parallel_via_env;
kernel.parallel_active = parallel_active;
let xaudio_tick_via_env = std::env::var("XENIA_XAUDIO_TICK")
.ok()
.is_some_and(|v| {
let v = v.trim().to_ascii_lowercase();
v == "1" || v == "true" || v == "yes"
});
kernel.xaudio_tick_enabled = xaudio_tick || xaudio_tick_via_env;
if kernel.xaudio_tick_enabled && !quiet {
tracing::info!(
"XAudio callback ticker enabled (--xaudio-tick / XENIA_XAUDIO_TICK=1)"
);
}
if reservations_table || reservations_via_env || parallel_active {
kernel.reservations.enable();
if !quiet {
tracing::info!("reservation table enabled (lwarx/stwcx route through it)");
}
}
// Diagnostic: flip audit on when --trace-handles or XENIA_TRACE_HANDLES.
// Behaviour-neutral when disabled (every record_* method is a no-op).
let audit_via_env = std::env::var("XENIA_TRACE_HANDLES")
.map(|v| matches!(v.as_str(), "1" | "true" | "TRUE" | "True"))
.unwrap_or(false);
let trace_handles_active = trace_handles || audit_via_env || trace_handles_focus.is_some();
if trace_handles_active {
kernel.audit.enabled = true;
if !quiet {
tracing::info!("handle audit enabled (--trace-handles)");
}
}
// KRNBUG-AUDIT-001: parse `--trace-handles-focus=0x1004,0x100c,...`
// into the audit focus set. When non-empty, signal attempts on these
// handles are captured into ghost trails even if no `record_create`
// exists for them — the data-driven hypothesis fail-fast for
// parked-waiter producer-trace.
if let Some(list) = trace_handles_focus {
for token in list.split(',').map(str::trim).filter(|s| !s.is_empty()) {
let parsed = if let Some(hex) = token.strip_prefix("0x").or_else(|| token.strip_prefix("0X")) {
u32::from_str_radix(hex, 16)
} else {
token.parse::<u32>()
};
match parsed {
Ok(h) => {
kernel.audit.focus.insert(h);
}
Err(_) => {
return Err(anyhow::anyhow!(
"invalid handle in --trace-handles-focus: {token:?}"
));
}
}
}
if !quiet && !kernel.audit.focus.is_empty() {
let focused: Vec<String> = kernel
.audit
.focus
.iter()
.map(|h| format!("{h:#010x}"))
.collect();
tracing::info!(
"handle audit focus: {} ({})",
kernel.audit.focus.len(),
focused.join(", ")
);
}
}
// Install the GPU register aperture MMIO region on the guest memory so
// any `0x7FC8xxxx` access routes to our atomic mailbox. Matches canary's
// `graphics_system.cc:141-144`. The callbacks capture Arc clones of the
// atoms that live inside `kernel.gpu.mmio`.
mem.add_mmio_region(xenia_gpu::build_mmio_region(kernel.gpu.mmio()));
// Install the initial guest thread on HW slot 0. The thread handle we
// hand the scheduler isn't visible to any guest API yet, but joiners
// (XThreadWait-style) will see it via `find_by_tid`.
let main_handle = kernel.alloc_handle_for(xenia_kernel::objects::KernelObject::Thread {
id: xenia_cpu::scheduler::INITIAL_GUEST_TID,
hw_id: Some(0),
exit_code: None,
waiters: Vec::new(),
});
kernel.next_thread_id.store(
xenia_cpu::scheduler::INITIAL_GUEST_TID + 1,
std::sync::atomic::Ordering::Relaxed,
);
// PCR[+0x2C] = processor number 0 for the main HW thread.
mem.write_u32(pcr_addr + 0x2C, 0);
ctx.thread_id = xenia_cpu::scheduler::INITIAL_GUEST_TID;
kernel.install_initial_thread(
ctx,
stack_base,
stack_size,
pcr_addr,
tls_addr,
main_handle,
&mut mem,
);
// If the input was a disc image, mount it so the kernel's file I/O
// handlers can serve the game's own assets via VFS.
if path.to_lowercase().ends_with(".iso") || path.to_lowercase().ends_with(".xiso") {
match xenia_vfs::disc_image::DiscImageDevice::open("d", std::path::Path::new(path)) {
Ok(disc) => kernel.vfs = Some(Box::new(disc)),
Err(e) => tracing::warn!("Could not mount disc image for VFS: {}", e),
}
}
// ── Phase 3: Data export patching (variable imports) ─────────────────
//
// Helper: lazily allocate a zero-filled block on the heap. Shared for all
// diagnostic data exports the game reads but never writes — a single
// block keeps allocator noise low.
let alloc_zero = |size: u32, mem: &xenia_memory::GuestMemory, kernel: &mut xenia_kernel::KernelState| -> u32 {
kernel.heap_alloc(size, mem).unwrap_or(0)
};
for lib in &header.import_libraries {
for imp in &lib.imports {
if imp.record_type != 0 { continue; } // Only variable entries
let addr = imp.address;
match (lib.name.as_str(), imp.ordinal) {
("xboxkrnl.exe", 0x001B) => {
// ExThreadObjectType — ptr to OBJECT_TYPE descriptor (0x40 bytes)
let block = alloc_zero(0x40, &mut mem, &mut kernel);
mem.write_u32(addr, block);
}
("xboxkrnl.exe", 0x0059) => {
// KeDebugMonitorData — ptr to diagnostic block
let block = alloc_zero(0x40, &mut mem, &mut kernel);
mem.write_u32(addr, block);
}
("xboxkrnl.exe", 0x00AD) => {
// KeTimeStampBundle — 0x18 block with FILETIME at +0 and
// interrupt-time u64 at +0x10. Mirrors the clock used by
// KeQuerySystemTime so fast-path readers see consistent values.
let block = alloc_zero(0x18, &mut mem, &mut kernel);
if block != 0 {
let fake_time: u64 = 132_500_000_000_000_000; // ~2021 FILETIME
mem.write_u32(block, (fake_time >> 32) as u32);
mem.write_u32(block + 4, fake_time as u32);
mem.write_u32(block + 0x10, (fake_time >> 32) as u32);
mem.write_u32(block + 0x14, fake_time as u32);
}
mem.write_u32(addr, block);
}
("xboxkrnl.exe", 0x0158) => {
// XboxKrnlVersion: {major=2, minor=0, build=20000, qfe=0}
mem.write_u16(addr, 2);
mem.write_u16(addr + 2, 0);
mem.write_u16(addr + 4, 20000);
mem.write_u16(addr + 6, 0);
}
("xboxkrnl.exe", 0x0193) => {
// XexExecutableModuleHandle -> image base
mem.write_u32(addr, base);
}
("xboxkrnl.exe", 0x01AE) => {
// ExLoadedCommandLine — ANSI empty string
let block = alloc_zero(0x10, &mut mem, &mut kernel);
// Block is already zero-initialized by heap_alloc -> empty string.
mem.write_u32(addr, block);
}
("xboxkrnl.exe", 0x01BE) => {
// VdGlobalDevice — passed through to Vd* shims. Write 0.
mem.write_u32(addr, 0);
}
("xboxkrnl.exe", 0x01C0) => {
// VdGpuClockInMHz
mem.write_u32(addr, 500);
}
("xboxkrnl.exe", 0x01C1) => {
// VdHSIOCalibrationLock — lock slot
mem.write_u32(addr, 0);
}
("xboxkrnl.exe", 0x0266) => {
// KeCertMonitorData — ptr to diagnostic block
let block = alloc_zero(0x100, &mut mem, &mut kernel);
mem.write_u32(addr, block);
}
_ => {
// All other variable exports: write 0
mem.write_u32(addr, 0);
}
}
}
}
// ── Phase 5: Set up SQLite DB with full static analysis + opt-in traces ──
let mut db_writer: Option<xenia_analysis::DbWriter> = None;
if let Some(db) = db_path {
use std::collections::HashMap;
let sections = xenia_xex::pe::parse_sections(&image_data)?;
// Build import address -> name map (for xref analysis)
let mut import_map: HashMap<u32, String> = HashMap::new();
for lib in &header.import_libraries {
for imp in &lib.imports {
let resolved = xenia_analysis::resolve_ordinal(&lib.name, imp.ordinal);
let name = match resolved {
Some(n) => format!("{}::{}", lib.name, n),
None => format!("{}::ordinal_{:#06X}", lib.name, imp.ordinal),
};
import_map.insert(imp.address, name);
}
}
// Function + xref analysis
let code_sections: Vec<(u32, u32, u32)> = sections.iter()
.filter(|s| s.is_code())
.map(|s| (s.virtual_address, s.virtual_size, s.flags))
.collect();
let func_analysis = xenia_analysis::func::analyze(&image_data, base, entry, &code_sections);
info!(functions = func_analysis.functions.len(), "function detection complete");
let xref_result = xenia_analysis::xref::analyze_xrefs(
&image_data, base, entry, &sections, &func_analysis, &import_map,
);
let total_xrefs: usize = xref_result.xrefs.values().map(|v| v.len()).sum();
info!(
labels = xref_result.labels.len(),
xrefs = total_xrefs,
"xref analysis complete"
);
let disasm_info = xenia_analysis::formatter::DisasmInfo {
image_base: base,
entry_point: entry,
original_pe_name: header.original_pe_name.as_deref(),
title_id: header.execution_info.as_ref().map(|e| e.title_id),
media_id: header.execution_info.as_ref().map(|e| e.media_id),
sections: &sections,
import_libraries: &header.import_libraries,
};
info!(db = %db, "writing database");
let mut w = xenia_analysis::DbWriter::open_fresh(std::path::Path::new(db))?;
w.write_base(&disasm_info)?;
w.write_disasm(&image_data, &disasm_info, &func_analysis, &xref_result.labels, &xref_result.xrefs)?;
w.prepare_trace_tables(trace_instructions, trace_imports, trace_branches)?;
db_writer = Some(w);
}
// Set up debugger.
//
// Tier-3 perf: `trace_enabled = false` by default. The in-memory
// `trace_log: Vec<TraceEntry>` was previously populated on every
// instruction; once it filled to 100k entries each subsequent step
// performed a `Vec::remove(0)` — O(n) shifts per instruction. At
// 2M+ instructions this pathological behavior dominated run time
// (~300s of the ~600s we observed for 5M instructions, consistent
// with ~10 GB/s memory bandwidth × n²/2 shift cost).
//
// Nothing outside the debugger REPL (not shipped) reads the log, so
// the default is safe to flip. `--trace-instructions` routes full
// instruction traces to the DB writer instead, which is the correct
// path for analysis; we keep that behaviour unchanged.
let mut debugger = xenia_debugger::Debugger::new();
debugger.paused = false;
debugger.step_mode = xenia_debugger::StepMode::Run;
debugger.trace_enabled = false;
// DB-based trace still needs the in-memory trace off — we only want
// one authoritative stream. `trace_instructions` implies the DB
// writer pipeline is the recorder; leave `trace_enabled` false.
let _ = trace_instructions;
match max_instructions {
Some(n) => info!(limit = n, "starting execution"),
None => info!("starting execution (unlimited)"),
}
if let Some(ips) = ips_limit {
info!(ips_limit = ips, "throttling execution to instructions/second");
}
// ── Phase 6: Execution ───────────────────────────────────────────────
// Either run on the main thread (headless), or spawn a worker and drive
// the winit event loop on the main thread (--ui).
// M1.4 — wrap `mem` in an `Arc<GuestMemory>` after all init mutations
// are complete. The worker thread (if spawned below) holds its own
// Arc clone for the duration of the run; the CPU side passes
// `&*mem_arc` (= `&GuestMemory`) into `run_execution`. The
// trait-level invariant carrying this is correctness: writes are
// `&self` post the M1.4(b) trait flip, and `bump_page_version` is
// atomic, so concurrent CPU/GPU accesses to disjoint byte ranges are
// sound. (Same-byte concurrent access remains the caller's
// responsibility per the trait contract.)
let mem_arc = std::sync::Arc::new(mem);
// Spawn the real GPU worker if the threaded backend was chosen at
// kernel-construction time. The handle the kernel already holds
// (`GpuBackend::Threaded`) is the CPU-side proxy; the worker owns
// the actual `GpuSystem` and runs its `run()` loop — see
// `xenia_gpu::handle::GpuWorker::run` for the concurrency model.
// M1.3's `spawn_noop_worker` is now superseded for the threaded
// path; the no-op helper is retained for unit tests.
let gpu_thread_resources = if let Some(worker) = maybe_gpu_worker.take() {
info!("gpu: threaded backend — spawning worker thread");
let join = xenia_gpu::spawn_gpu_worker(worker, mem_arc.clone());
Some((shutdown_arc.clone(), join))
} else {
None
};
// M1.7: capture the worker-thread handle before the dispatch so the
// shutdown path can unpark it without holding `kernel` (the UI path
// moves `kernel` into `run_with_ui`). In headless+threaded mode the
// kernel survives the dispatch, but extracting here also covers the
// hypothetical UI+threaded mix once that ergonomic landmine is
// resolved.
let worker_thread_handle = if gpu_thread_resources.is_some() {
Some(kernel.gpu.mmio().worker_thread.clone())
} else {
None
};
let result = if ui {
run_with_ui(
path,
// `run_with_ui` consumes `GuestMemory` by value today; M1.4
// keeps that path on the inline backend until the UI worker
// is migrated to the Arc-shared model. Recover ownership via
// `Arc::try_unwrap` — succeeds because the GPU worker is not
// spawned in inline mode (`maybe_gpu_worker` is `None`).
std::sync::Arc::try_unwrap(mem_arc).unwrap_or_else(|_| {
panic!(
"M1.4: --ui + --gpu-thread cohabitation not yet wired; \
choose one"
)
}),
kernel,
debugger,
thunk_map,
db_writer,
max_instructions,
ips_limit,
quiet,
halt_on_deadlock,
started,
)
} else {
// Wrap the headless body in a closure so any early returns
// (digest mismatch, db finalize error) flow through the GPU
// thread shutdown below rather than leaking the worker.
(|| -> Result<()> {
// M3 — opt-in `--parallel` flag spawns the interpreter on a
// dedicated worker thread under `Arc<Mutex<KernelState>>`.
// The first M3 milestone delivers the spawn substrate (N=1
// worker, kernel wrapped in Arc<Mutex<>>); the actual
// per-HW-thread parallel spawn is the follow-up. With N=1
// and the worker holding the lock for the run's duration,
// observable execution is bit-identical to lockstep mode —
// golden must still match.
let env_parallel = std::env::var("XENIA_PARALLEL")
.ok()
.is_some_and(|v| {
let v = v.trim().to_ascii_lowercase();
v == "1" || v == "true" || v == "yes"
});
let do_parallel = parallel || env_parallel;
// Step 04 gate: --parallel runs N=6 workers that release
// the kernel mutex around step_block, so the per-instruction
// observation path (debugger hooks, DB writer, force-per-instr)
// would either contend severely or deadlock. Reject the
// combination at the boundary with a clean error.
if do_parallel {
if debugger.wants_hooks() {
anyhow::bail!(
"--parallel is incompatible with debugger hooks; use lockstep mode"
);
}
if db_writer.is_some() {
anyhow::bail!(
"--parallel is incompatible with --db / DB writer; use lockstep mode"
);
}
if std::env::var("XENIA_FORCE_PER_INSTR")
.map(|v| v == "1" || v.eq_ignore_ascii_case("true"))
.unwrap_or(false)
{
anyhow::bail!(
"--parallel is incompatible with XENIA_FORCE_PER_INSTR=1; use lockstep mode"
);
}
}
let stats = if do_parallel {
if !quiet {
info!(
"M3 — spawning N=6 worker threads + main coordinator (7-party phaser)",
);
}
let kernel_arc = std::sync::Arc::new(std::sync::Mutex::new(kernel));
let mem_for_worker = mem_arc.clone();
let kernel_for_worker = kernel_arc.clone();
// Move debugger + db_writer + thunk_map into the worker;
// the worker returns them so the caller can resume
// post-run analysis (digest, summary, diagnostic dump).
let max_instructions_w = max_instructions;
let ips_limit_w = ips_limit;
let quiet_w = quiet;
let halt_w = halt_on_deadlock;
let mut debugger_w = debugger;
let mut db_writer_w = db_writer;
let thunk_map_w = thunk_map;
let join = std::thread::Builder::new()
.name("xenia-cpu-host".into())
.spawn(move || -> (
ExecStats,
xenia_debugger::Debugger,
Option<xenia_analysis::DbWriter>,
std::collections::HashMap<u32, (ModuleId, u16, String)>,
) {
// M3 real-par Step 03: per-round drop-and-reacquire
// around step_block. Worker no longer holds the
// mutex for the run's duration; `run_execution_parallel`
// owns the locking dance internally.
let stats = run_execution_parallel(
&*mem_for_worker,
&kernel_for_worker,
&mut debugger_w,
&thunk_map_w,
db_writer_w.as_mut(),
max_instructions_w,
ips_limit_w,
quiet_w,
halt_w,
None,
);
(stats, debugger_w, db_writer_w, thunk_map_w)
})
.expect("spawn xenia-cpu-host");
let (stats, debugger_back, db_writer_back, thunk_map_back) =
join.join().expect("xenia-cpu-host worker panicked");
debugger = debugger_back;
db_writer = db_writer_back;
let _ = thunk_map_back; // keep alive via shadowing already handled
// Recover the kernel from the Arc<Mutex<>>. The worker
// dropped its clone on return; only one strong ref
// remains, so `try_unwrap` succeeds.
kernel = std::sync::Arc::try_unwrap(kernel_arc)
.map_err(|_| anyhow::anyhow!("M3: kernel Arc still has clones"))?
.into_inner()
.map_err(|_| anyhow::anyhow!("M3: kernel mutex poisoned"))?;
stats
} else {
run_execution(
&*mem_arc,
&mut kernel,
&mut debugger,
&thunk_map,
db_writer.as_mut(),
max_instructions,
ips_limit,
quiet,
halt_on_deadlock,
None,
)
};
// After the conditional bound `stats`, `kernel`,
// `debugger`, `db_writer` above, post-run analysis continues
// unchanged: db finalize, print_summary, dump_thread_diagnostic,
// run digest emit + golden compare.
if let Some(ref mut db) = db_writer {
db.finalize_traces()?;
}
print_summary(kernel.scheduler.ctx(0), &debugger, &db_writer, quiet);
dump_thread_diagnostic(&kernel, quiet);
info!(
wall_ms = started.elapsed().as_millis() as u64,
instructions = stats.instruction_count,
import_calls = stats.import_count,
unimplemented = stats.unimpl_count,
"exec complete"
);
// P8 — optional run-digest emission. Built from the live
// `kernel.gpu.stats` + `ExecStats` after the run completes;
// stable ordering + stable field names so byte-level JSON diffs
// catch any drift between runs.
if digest_out.is_some() || digest_expect.is_some() {
let digest = RunDigest::capture(path, &kernel, &stats);
let json = if stable_digest {
digest.stable_fields_json()
} else {
digest.to_json()
};
if let Some(out_path) = digest_out {
std::fs::write(out_path, &json)?;
info!(out = out_path, stable = stable_digest, "run digest written");
} else {
println!("{json}");
}
if let Some(expect_path) = digest_expect {
let expected = std::fs::read_to_string(expect_path)
.map_err(|e| anyhow::anyhow!("failed to read expected digest: {e}"))?;
if json.trim() != expected.trim() {
eprintln!("DIGEST MISMATCH vs {expect_path}");
eprintln!("--- expected ---");
eprintln!("{}", expected.trim());
eprintln!("--- actual ---");
eprintln!("{}", json.trim());
return Err(anyhow::anyhow!(
"run digest differs from golden; see diff above"
));
}
info!("run digest matches golden");
}
}
Ok(())
})()
};
// M1.7 — signal the GPU worker thread to exit and join it with a
// 1 s defensive timeout. We also `unpark()` it via the MMIO
// worker-thread handle so it observes shutdown immediately rather
// than waiting up to `WORKER_PARK_TIMEOUT` (16 ms) on its own. The
// helper logs at `error!` if the join blows the budget; we emit a
// `warn!` here as a second-level breadcrumb tied to this specific
// run. We do NOT propagate the timeout as an error: process
// teardown is more important than blocking on a stuck thread.
if let Some((shutdown, join)) = gpu_thread_resources {
// Wake the parker so the worker observes `shutdown` on its next
// loop iteration without waiting for `park_timeout`.
if let Some(handle_arc) = worker_thread_handle {
if let Ok(g) = handle_arc.lock() {
if let Some(t) = g.as_ref() {
t.unpark();
}
}
}
if xenia_gpu::shutdown_and_join_with_timeout(
&shutdown,
join,
std::time::Duration::from_secs(1),
)
.is_err()
{
warn!("GPU worker thread did not exit in 1s; continuing teardown");
}
}
result
}
/// Result of one execution pass — used by both the headless path and the
/// worker thread behind `--ui`.
#[derive(Default)]
struct ExecStats {
instruction_count: u64,
import_count: u64,
unimpl_count: u64,
}
/// P8 — deterministic run digest. Emitted as stable-ordered JSON by
/// `xenia-rs check`. Every field is a u64 counter; the file is trivially
/// diffable via `diff` / `cmp`. New counters are appended at the bottom
/// so golden files authored today still match tomorrow's schema minus
/// any new lines. Semver-wise this is a "pre-1.0" API — regressions bail
/// on first mismatch, no field-wise tolerance.
struct RunDigest {
path: String,
instructions: u64,
imports: u64,
unimpl: u64,
packets: u64,
draws: u64,
swaps: u64,
resolves: u64,
unique_render_targets: u64,
shader_blobs_live: u64,
interrupts_delivered: u64,
interrupts_dropped: u64,
texture_cache_entries: u64,
texture_decodes: u64,
}
impl RunDigest {
fn capture(path: &str, kernel: &xenia_kernel::KernelState, stats: &ExecStats) -> Self {
// M1.4: route stats through the backend snapshot accessor. Inline
// mode reads directly from the live `GpuSystem`; threaded mode
// reads from the worker's published `Arc<Mutex<GpuDigestSnapshot>>`
// (refreshed each outer loop iteration on the worker thread).
let snap = kernel.gpu.digest_snapshot();
Self {
path: path.to_string(),
instructions: stats.instruction_count,
imports: stats.import_count,
unimpl: stats.unimpl_count,
packets: snap.stats.packets_executed,
draws: snap.stats.draws_seen,
swaps: snap.stats.swaps_seen,
resolves: snap.stats.resolves_total,
unique_render_targets: snap.stats.unique_render_targets,
shader_blobs_live: snap.shader_blobs_live,
interrupts_delivered: kernel.interrupts.delivered,
interrupts_dropped: kernel.interrupts.dropped,
texture_cache_entries: snap.texture_cache_entries,
texture_decodes: snap.texture_decodes,
}
}
fn to_json(&self) -> String {
// Hand-rolled deterministic ordering — alphabetical-ish + keys
// quoted. No external dep, no field ambiguity.
format!(
"{{\n \"path\": \"{}\",\n \
\"instructions\": {},\n \
\"imports\": {},\n \
\"unimpl\": {},\n \
\"packets\": {},\n \
\"draws\": {},\n \
\"swaps\": {},\n \
\"resolves\": {},\n \
\"unique_render_targets\": {},\n \
\"shader_blobs_live\": {},\n \
\"interrupts_delivered\": {},\n \
\"interrupts_dropped\": {},\n \
\"texture_cache_entries\": {},\n \
\"texture_decodes\": {}\n}}\n",
escape_json_string(&self.path),
self.instructions,
self.imports,
self.unimpl,
self.packets,
self.draws,
self.swaps,
self.resolves,
self.unique_render_targets,
self.shader_blobs_live,
self.interrupts_delivered,
self.interrupts_dropped,
self.texture_cache_entries,
self.texture_decodes,
)
}
/// Stable-fields-only digest for goldens that need to survive non-determinism.
/// Excludes timing-sensitive counters: `packets` has documented ±2.58% lockstep
/// noise from a GPU thread race; `resolves`, `interrupts_delivered`,
/// `interrupts_dropped`, and `texture_decodes` are scheduling-sensitive under
/// `--parallel`. Also omits `path` (cwd-dependent). The remaining fields are
/// deterministic in lockstep at a fixed instruction budget. Use via
/// `--stable-digest`.
fn stable_fields_json(&self) -> String {
format!(
"{{\n \"instructions\": {},\n \
\"imports\": {},\n \
\"unimpl\": {},\n \
\"draws\": {},\n \
\"swaps\": {},\n \
\"unique_render_targets\": {},\n \
\"shader_blobs_live\": {},\n \
\"texture_cache_entries\": {}\n}}\n",
self.instructions,
self.imports,
self.unimpl,
self.draws,
self.swaps,
self.unique_render_targets,
self.shader_blobs_live,
self.texture_cache_entries,
)
}
}
fn escape_json_string(s: &str) -> String {
s.replace('\\', r"\\").replace('"', r#"\""#)
}
/// Round-control signal returned by the per-round coordinator helpers.
/// `BreakOuter` halts the scheduler loop; `Continue` resumes the outer
/// loop (either to step the round or to retry after an idle-advance).
#[derive(Debug)]
enum RoundCtl {
BreakOuter,
Continue,
}
/// Per-round prologue: max-instruction budget check, IPS throttle, UI
/// shutdown poll, heartbeat log, vsync ticker (with d1mode bit set when
/// the vsync fires), pending-timer fire, graphics-interrupt injection.
/// Returns `BreakOuter` only when the run hits its budget or the UI
/// asks for shutdown.
fn coord_pre_round(
kernel: &mut xenia_kernel::KernelState,
stats: &ExecStats,
max_instructions: Option<u64>,
ips_limit: Option<u64>,
throttle_start: Instant,
shutdown: &Option<std::sync::Arc<std::sync::atomic::AtomicBool>>,
) -> RoundCtl {
const SHUTDOWN_CHECK_MASK: u64 = 0x1FFF;
const HEARTBEAT_MASK: u64 = 0xF_FFFF; // ~1M instructions
if let Some(limit) = max_instructions
&& stats.instruction_count >= limit
{
info!(limit, "reached max instruction count");
return RoundCtl::BreakOuter;
}
if let Some(ips) = ips_limit
&& ips > 0
&& stats.instruction_count > 0
{
let expected_ns = (stats.instruction_count as u128)
.saturating_mul(1_000_000_000u128)
/ ips as u128;
let actual_ns = throttle_start.elapsed().as_nanos();
if actual_ns < expected_ns {
let sleep_ns = (expected_ns - actual_ns).min(u64::MAX as u128) as u64;
std::thread::sleep(std::time::Duration::from_nanos(sleep_ns));
}
}
if (stats.instruction_count & SHUTDOWN_CHECK_MASK) == 0 {
if let Some(flag) = shutdown
&& flag.load(std::sync::atomic::Ordering::Relaxed)
{
info!(instructions = stats.instruction_count, "UI requested shutdown");
return RoundCtl::BreakOuter;
}
if let Some(ref ui) = kernel.ui {
ui.instructions_counter
.store(stats.instruction_count, std::sync::atomic::Ordering::Relaxed);
}
}
if (stats.instruction_count & HEARTBEAT_MASK) == 0
&& stats.instruction_count > 0
&& tracing::enabled!(tracing::Level::DEBUG)
{
let t = kernel.scheduler.ctx(0);
debug!(
cycle = stats.instruction_count,
pc = format_args!("{:#010x}", t.pc),
lr = format_args!("{:#010x}", t.lr as u32),
sp = format_args!("{:#010x}", t.gpr[1] as u32),
"cpu heartbeat"
);
}
// KRNBUG-D08: backend-aware v-sync ticker.
//
// **Lockstep**: instruction-count ticker (deterministic; one tick per
// PPC block boundary, predictable cadence). The cadence drifts a bit
// from real 60 Hz but is bit-stable across runs, which matters for
// the `sylpheed_n*m.json` golden oracles.
//
// **--parallel**: wall-clock ticker. The instruction-count proxy
// dropped from 629 v-syncs/100M lockstep to ~2 under `--parallel`
// (audit M11) because the dispatcher executes more PPC instructions
// per tick callback when 6 worker threads share the kernel mutex,
// so the accumulator never crosses the 150k threshold. Wall-clock
// restores the ~60 Hz rate at the cost of bit-exact run reproducibility,
// which is acceptable under `--parallel` (M11 already documented
// `--parallel` as non-deterministic by design).
let fired = if kernel.parallel_active {
kernel.interrupts.tick_vsync_wallclock()
} else {
kernel.interrupts.tick_vsync_instr(stats.instruction_count)
};
if fired {
use std::sync::atomic::Ordering;
let mmio = kernel.gpu.mmio();
let prev = mmio.d1mode_vblank_vline_status.load(Ordering::Relaxed);
mmio.d1mode_vblank_vline_status
.store(prev | 0x1, Ordering::Relaxed);
}
if kernel.xaudio_tick_enabled {
if kernel.parallel_active {
kernel.xaudio.tick_wallclock();
} else {
kernel.xaudio.tick_instr(stats.instruction_count);
}
}
kernel.fire_due_timers();
try_inject_graphics_interrupt(kernel);
if kernel.xaudio_tick_enabled {
try_inject_audio_callback(kernel);
}
RoundCtl::Continue
}
/// Per-round idle-advance: invoked when `round_schedule()` returns no
/// runnable threads. Advances time to the earliest pending timer or
/// timed-wait deadline, fires due timers, and handles deadline wakes.
/// On hard deadlock either halts (with diagnostics) or force-wakes
/// blocked waiters with `STATUS_TIMEOUT`. `Continue` restarts the
/// outer loop; `BreakOuter` halts the scheduler.
fn coord_idle_advance(
kernel: &mut xenia_kernel::KernelState,
halt_on_deadlock: bool,
shutdown: &Option<std::sync::Arc<std::sync::atomic::AtomicBool>>,
stats: &ExecStats,
) -> RoundCtl {
let next_timer = kernel.earliest_timer_deadline();
let next_wait = kernel.scheduler.earliest_wait_deadline();
let target = match (next_timer, next_wait) {
(Some(a), Some(b)) => Some(a.min(b)),
(Some(a), None) => Some(a),
(None, Some(b)) => Some(b),
(None, None) => None,
};
if let Some(target) = target {
kernel.scheduler.advance_all_timebases_to(target);
kernel.fire_due_timers();
if let Some((r, reason)) = kernel.scheduler.advance_to_next_wake_if_due(target) {
kernel.handle_timeout_wake(r, reason);
}
return RoundCtl::Continue;
}
if kernel.scheduler.has_live_thread() {
if halt_on_deadlock {
tracing::warn!(
instructions = stats.instruction_count,
"scheduler deadlock: halting (--halt-on-deadlock); dumping per-thread diagnostic"
);
for (hw_id, slot) in kernel.scheduler.slots.iter().enumerate() {
if slot.runqueue.is_empty() {
tracing::warn!(hw = hw_id, state = "empty", "deadlock slot");
continue;
}
for (idx, t) in slot.runqueue.iter().enumerate() {
tracing::warn!(
hw = hw_id,
idx = idx,
tid = t.tid,
state = ?t.state,
pc = format_args!("{:#010x}", t.ctx.pc),
lr = format_args!("{:#010x}", t.ctx.lr as u32),
sp = format_args!("{:#010x}", t.ctx.gpr[1] as u32),
"deadlock thread"
);
}
}
metrics::counter!("scheduler.deadlock_halts").increment(1);
if let Some(flag) = shutdown {
flag.store(true, std::sync::atomic::Ordering::Relaxed);
}
return RoundCtl::BreakOuter;
}
tracing::warn!(
snapshot = ?kernel.scheduler.diagnostic_snapshot(),
"scheduler deadlock: force-waking blocked waiters with STATUS_TIMEOUT"
);
let woken = kernel.scheduler.unblock_on_deadlock();
// STATUS_TIMEOUT = 0x00000102 per NT conventions.
const STATUS_TIMEOUT: u64 = 0x0000_0102;
for r in &woken {
kernel.scheduler.ctx_mut_ref(*r).gpr[3] = STATUS_TIMEOUT;
}
metrics::counter!("scheduler.deadlock_recoveries").increment(1);
if woken.is_empty() {
info!("scheduler: deadlock recovery woke zero threads — halting");
return RoundCtl::BreakOuter;
}
return RoundCtl::Continue;
}
info!("scheduler: no live HW threads remain — halting");
RoundCtl::BreakOuter
}
/// Per-round epilogue: clear the slot visit, drive the inline GPU
/// proportional to instructions executed in this round, drain any
/// GPU-side pending interrupts, and check liveness. Returns
/// `BreakOuter` when no live HW threads remain.
fn coord_post_round(
kernel: &mut xenia_kernel::KernelState,
mem: &xenia_memory::GuestMemory,
stats: &ExecStats,
instrs_at_round_start: u64,
) -> RoundCtl {
kernel.scheduler.end_slot_visit();
let executed_this_round = stats
.instruction_count
.saturating_sub(instrs_at_round_start);
let mut gpu_runs = (executed_this_round
/ xenia_cpu::scheduler::HW_THREAD_COUNT as u64)
.max(1);
if gpu_runs > 64 {
gpu_runs = 64;
}
if let Some(gpu) = kernel.gpu.as_inline_mut() {
gpu.sync_with_mmio();
for _ in 0..gpu_runs {
if gpu.is_ready(mem) {
let _ = gpu.execute_one(mem);
gpu.sync_with_mmio();
} else {
break;
}
}
} else {
let _ = gpu_runs;
}
if kernel.gpu.has_pending_interrupts() {
for _pi in kernel.gpu.take_pending_interrupts() {
kernel
.interrupts
.queue_interrupt(xenia_kernel::INTERRUPT_SOURCE_CP);
}
}
if !kernel.scheduler.has_live_thread() {
info!("scheduler: all threads exited — halting");
return RoundCtl::BreakOuter;
}
RoundCtl::Continue
}
/// Per-worker state for one HW slot. Owns its block cache + decode
/// cache so that under per-HW-thread parallelism (Step 04) the worker
/// thread can step its slot without sharing mutable cache state with
/// peers. In lockstep mode this is just six independent caches on the
/// same host thread — observable behavior identical, slightly higher
/// per-slot fill churn.
struct WorkerCtx {
hw_id: u8,
block_cache: xenia_cpu::block_cache::BlockCache,
decode_cache: xenia_cpu::decoder::DecodeCache,
force_per_instr: bool,
}
impl WorkerCtx {
fn new(hw_id: u8, force_per_instr: bool) -> Self {
Self {
hw_id,
block_cache: xenia_cpu::block_cache::BlockCache::new(),
decode_cache: xenia_cpu::decoder::DecodeCache::new(),
force_per_instr,
}
}
}
/// Outcome of `worker_prologue`. Either the slot was fully handled
/// inline (halt-sentinel, thunk dispatch, per-instruction step), or
/// the caller must run a block-cache `step_block` and then call
/// `worker_epilogue`, or the outer scheduler loop must break (fault,
/// debugger break, main-thread halt).
enum PrologueOutcome {
Continue,
BreakOuter,
StepBlock {
tid: Option<u32>,
thread_ref: xenia_cpu::ThreadRef,
block_ptr: *const xenia_cpu::block_cache::DecodedBlock,
pc_before: u32,
},
}
enum SlotOutcome {
Continue,
BreakOuter,
}
/// Per-slot prologue. Performs `begin_slot_visit`, halt-sentinel
/// detection + restore, import-thunk dispatch (under the kernel lock),
/// unmapped-PC fault check, and either kicks off a block-cache step
/// (returning `StepBlock`) or runs the per-instruction observation
/// path inline (debugger / DB writer). When this returns
/// `PrologueOutcome::Continue` the caller skips to the next slot;
/// `BreakOuter` exits the scheduler loop; `StepBlock` hands off to a
/// `step_block(ctx, mem, &*block_ptr)` call followed by
/// `worker_epilogue`.
fn worker_prologue(
wc: &mut WorkerCtx,
kernel: &mut xenia_kernel::KernelState,
mem: &xenia_memory::GuestMemory,
debugger: &mut xenia_debugger::Debugger,
db_writer: &mut Option<&mut xenia_analysis::DbWriter>,
thunk_map: &HashMap<u32, (ModuleId, u16, String)>,
stats: &mut ExecStats,
) -> PrologueOutcome {
use xenia_cpu::interpreter::{step_cached, StepResult};
use xenia_cpu::scheduler::{HwState, INITIAL_GUEST_TID};
use xenia_cpu::PpcOpcode;
const LR_HALT: u32 = xenia_cpu::context::LR_HALT_SENTINEL as u32;
let hw_id = wc.hw_id;
kernel.scheduler.begin_slot_visit(hw_id);
// Under per-HW-thread parallelism (run_execution_parallel) the
// coordinator's runnable-mask snapshot may be stale by the time
// this worker picks up the lock — a peer worker's call_export
// could have blocked the slot's last Ready thread. begin_slot_visit
// then sets running_idx = None and ctx(hw_id) returns the idle
// sentinel (pc = 0). Without this short-circuit the prologue would
// fall through to the unmapped-PC fault and break the outer loop.
if kernel.scheduler.current.is_none() {
return PrologueOutcome::Continue;
}
let pc = kernel.scheduler.ctx(hw_id).pc;
// 1) Halt-sentinel check (per HW thread).
if pc == LR_HALT {
let injected_here = kernel.interrupts.saved.is_some()
&& kernel
.interrupts
.injected_ref
.map(|r| r.hw_id == hw_id)
== Some(true);
if injected_here
&& let Some(saved) = kernel.interrupts.saved.take()
{
let target_ref = kernel
.interrupts
.injected_ref
.expect("injected_here guard");
saved.restore(kernel.scheduler.ctx_mut_ref(target_ref));
kernel.interrupts.delivered += 1;
let source = saved.source;
let mut restore_outcome = "ready";
let current = kernel.scheduler.thread(target_ref).state.clone();
if let HwState::ServicingIrq(reason) = current {
kernel.scheduler.thread_mut(target_ref).state =
HwState::Blocked(reason);
restore_outcome = "reblocked";
}
tracing::debug!(
source,
hw_id,
outcome = restore_outcome,
"graphics interrupt: callback returned"
);
return PrologueOutcome::Continue;
}
let tid = kernel.scheduler.tid(hw_id);
tracing::info!(
hw_id,
?tid,
is_main = tid == Some(INITIAL_GUEST_TID),
cycle = stats.instruction_count,
"HW thread returned to LR sentinel — marking exited"
);
let (_, _exited_tid, handle_opt) = kernel.scheduler.exit_current(0);
if let Some(h) = handle_opt
&& let Some(xenia_kernel::objects::KernelObject::Thread {
exit_code,
waiters,
..
}) = kernel.objects.get_mut(&h)
{
*exit_code = Some(0);
let to_wake: Vec<xenia_cpu::ThreadRef> = std::mem::take(waiters);
for w in to_wake {
kernel.scheduler.wake_ref(w);
}
}
return PrologueOutcome::Continue;
}
// 2) Import thunk intercept.
if let Some((module, ordinal, name)) = thunk_map.get(&pc) {
let module = *module;
let ordinal_u32 = *ordinal as u32;
let thunk_pc = pc;
let args = {
let c = kernel.scheduler.ctx(hw_id);
[c.gpr[3], c.gpr[4], c.gpr[5], c.gpr[6]]
};
kernel.call_export(module, ordinal_u32, mem);
let post_ref = kernel.scheduler.current;
let c = match post_ref {
Some(r) => kernel.scheduler.ctx_mut_ref(r),
None => kernel.scheduler.ctx_mut(hw_id),
};
let return_value = c.gpr[3];
c.pc = c.lr as u32;
c.cycle_count += 1;
c.timebase += 1;
if let Some(db) = db_writer.as_deref_mut() {
db.log_import_call(xenia_analysis::ImportCallEntry {
address: thunk_pc,
cycle: c.cycle_count,
module: match module {
ModuleId::Xboxkrnl => "xboxkrnl.exe".to_string(),
ModuleId::Xam => "xam.xex".to_string(),
ModuleId::Xbdm => "xbdm.xex".to_string(),
},
ordinal: *ordinal,
name: name.clone(),
arg_r3: args[0],
arg_r4: args[1],
arg_r5: args[2],
arg_r6: args[3],
return_value,
});
}
stats.instruction_count += 1;
stats.import_count += 1;
kernel.scheduler.decrement_quantum();
return PrologueOutcome::Continue;
}
// 3) Unmapped PC.
if !mem.is_mapped(pc) {
tracing::error!(
cycle = stats.instruction_count,
pc = format_args!("{:#010x}", pc),
hw_id,
"FAULT: PC in unmapped memory"
);
return PrologueOutcome::BreakOuter;
}
// 4) Decide block-cache fast path vs per-instruction observation.
let hooks = debugger.wants_hooks();
let observe_per_instruction = hooks || db_writer.is_some() || wc.force_per_instr;
if !observe_per_instruction {
// Block-cache fast path: look up the block under the kernel
// lock and hand it to the caller via raw pointer. The pointer
// is sound for the duration of this slot's step_block call
// because `wc.block_cache` is owned by this worker (no peer
// mutation possible) and the cache slot is not touched again
// until the next prologue iteration.
let tid = kernel.scheduler.tid(hw_id);
let thread_ref = kernel
.scheduler
.current
.expect("begin_slot_visit set scheduler.current to Some when slot has runnable thread");
let block_ptr: *const xenia_cpu::block_cache::DecodedBlock = {
let pc_for_lookup = kernel.scheduler.ctx(hw_id).pc;
let b: &xenia_cpu::block_cache::DecodedBlock =
wc.block_cache.lookup_or_build(pc_for_lookup, mem);
b
};
return PrologueOutcome::StepBlock {
tid,
thread_ref,
block_ptr,
pc_before: pc,
};
}
// 5) Per-instruction observation path (debugger / DB).
let result = {
let ctx = kernel.scheduler.ctx_mut(hw_id);
if hooks {
debugger.pre_step(ctx, mem);
}
let page_ver = mem.page_version(ctx.pc);
let r = step_cached(ctx, mem, &mut wc.decode_cache, page_ver);
stats.instruction_count += 1;
if let Some(db) = db_writer.as_deref_mut() {
let raw_before = mem.read_u32(pc);
let opcode_before = xenia_cpu::decode(raw_before, pc).opcode;
db.log_instruction(xenia_analysis::ExecTraceEntry {
address: pc,
cycle: ctx.cycle_count,
r3: ctx.gpr[3],
r4: ctx.gpr[4],
lr: ctx.lr,
sp: ctx.gpr[1],
});
if opcode_before.is_branch() && ctx.pc != pc.wrapping_add(4) {
let lk = (raw_before & 1) == 1;
let kind: &'static str = if lk {
"call"
} else if opcode_before == PpcOpcode::bclrx {
"return"
} else if opcode_before == PpcOpcode::bcctrx {
"jump"
} else {
"branch"
};
db.log_branch(xenia_analysis::BranchTraceEntry {
cycle: ctx.cycle_count,
source: pc,
target: ctx.pc,
kind,
lr: ctx.lr,
});
}
}
if hooks {
debugger.post_step(ctx, mem);
}
r
};
match result {
StepResult::Continue => {}
StepResult::SystemCall => {
tracing::warn!("SYSCALL at {:#010x} (hw={})", pc, hw_id);
}
StepResult::Unimplemented(op) => {
stats.unimpl_count += 1;
metrics::counter!("cpu.unimplemented").increment(1);
if stats.unimpl_count <= 50 {
warn!(
cycle = stats.instruction_count,
?op,
pc = format_args!("{:#010x}", pc),
hw_id,
"UNIMPL"
);
} else if stats.unimpl_count == 51 {
info!("suppressing further UNIMPL messages");
}
}
StepResult::Trap => {
tracing::warn!(pc = format_args!("{:#010x}", pc), hw_id, "TRAP");
}
StepResult::Halted => {
info!(cycle = stats.instruction_count, hw_id, "HALTED");
let tid = kernel.scheduler.tid(hw_id);
if tid == Some(INITIAL_GUEST_TID) {
return PrologueOutcome::BreakOuter;
}
kernel.scheduler.exit_current(0);
}
}
kernel.scheduler.decrement_quantum();
if debugger.should_break() {
let pc_now = kernel.scheduler.ctx(hw_id).pc;
info!(
cycle = stats.instruction_count,
pc = format_args!("{:#010x}", pc_now),
hw_id,
"BREAK"
);
return PrologueOutcome::BreakOuter;
}
PrologueOutcome::Continue
}
/// Per-slot epilogue for the block-cache fast path. Charges
/// quantum decrements, applies the StepResult, and returns whether
/// the outer scheduler loop should continue or break. SAFETY: caller
/// must guarantee `block_ptr` still points at a live `DecodedBlock`
/// (it does, because the worker owns `wc.block_cache` and didn't
/// re-enter `lookup_or_build` between prologue and epilogue).
fn worker_epilogue(
wc: &mut WorkerCtx,
kernel: &mut xenia_kernel::KernelState,
debugger: &mut xenia_debugger::Debugger,
stats: &mut ExecStats,
_tid: Option<u32>,
_thread_ref: xenia_cpu::ThreadRef,
block_ptr: *const xenia_cpu::block_cache::DecodedBlock,
pc_before: u32,
result: xenia_cpu::interpreter::StepResult,
executed: u64,
) -> SlotOutcome {
use xenia_cpu::interpreter::StepResult;
use xenia_cpu::scheduler::INITIAL_GUEST_TID;
let hw_id = wc.hw_id;
let block = unsafe { &*block_ptr };
stats.instruction_count = stats.instruction_count.wrapping_add(executed);
for _ in 0..executed {
kernel.scheduler.decrement_quantum();
}
match result {
StepResult::Continue => {}
StepResult::SystemCall => {
let last_pc = block.instrs.last().map(|i| i.addr).unwrap_or(pc_before);
tracing::warn!("SYSCALL at {:#010x} (hw={})", last_pc, hw_id);
}
StepResult::Unimplemented(op) => {
stats.unimpl_count += 1;
metrics::counter!("cpu.unimplemented").increment(1);
let unimpl_pc = kernel.scheduler.ctx(hw_id).pc.wrapping_sub(4);
if stats.unimpl_count <= 50 {
warn!(
cycle = stats.instruction_count,
?op,
pc = format_args!("{:#010x}", unimpl_pc),
hw_id,
"UNIMPL"
);
} else if stats.unimpl_count == 51 {
info!("suppressing further UNIMPL messages");
}
}
StepResult::Trap => {
let last_pc = block.instrs.last().map(|i| i.addr).unwrap_or(pc_before);
tracing::warn!(pc = format_args!("{:#010x}", last_pc), hw_id, "TRAP");
}
StepResult::Halted => {
info!(cycle = stats.instruction_count, hw_id, "HALTED");
let tid = kernel.scheduler.tid(hw_id);
if tid == Some(INITIAL_GUEST_TID) {
return SlotOutcome::BreakOuter;
}
kernel.scheduler.exit_current(0);
}
}
if debugger.should_break() {
let pc_now = kernel.scheduler.ctx(hw_id).pc;
info!(
cycle = stats.instruction_count,
pc = format_args!("{:#010x}", pc_now),
hw_id,
"BREAK"
);
return SlotOutcome::BreakOuter;
}
SlotOutcome::Continue
}
#[instrument(skip_all, fields(max = ?max_instructions, ips = ?ips_limit))]
fn run_execution(
mem: &xenia_memory::GuestMemory,
kernel: &mut xenia_kernel::KernelState,
debugger: &mut xenia_debugger::Debugger,
thunk_map: &HashMap<u32, (ModuleId, u16, String)>,
mut db_writer: Option<&mut xenia_analysis::DbWriter>,
max_instructions: Option<u64>,
ips_limit: Option<u64>,
quiet: bool,
halt_on_deadlock: bool,
shutdown: Option<std::sync::Arc<std::sync::atomic::AtomicBool>>,
) -> ExecStats {
use xenia_cpu::interpreter::step_block;
let mut stats = ExecStats::default();
let _ = quiet; // retained for future per-kind suppression
// `--halt-on-deadlock` CLI flag OR `XENIA_HALT_ON_DEADLOCK=1|true` env var:
// when the scheduler next hits a hard deadlock (every live HW thread
// blocked on a handle wait with no pending timer) we bail out with a
// per-thread diagnostic instead of force-waking waiters with
// STATUS_TIMEOUT. Env-var fallback is cheap and lets `just` recipes /
// one-off shells flip the behaviour without rewiring the subcommand.
let halt_on_deadlock = halt_on_deadlock
|| std::env::var("XENIA_HALT_ON_DEADLOCK")
.map(|v| v == "1" || v.eq_ignore_ascii_case("true"))
.unwrap_or(false);
// Throttle anchor: wall-clock at which instruction 0 was retired. Used to
// pace execution when `ips_limit` is set — each round we compute the
// expected elapsed time for the current retired-count and sleep the
// difference if we're running ahead of schedule.
let throttle_start = Instant::now();
// Escape hatch for A/B testing: when set, every dispatch goes
// through `step_cached` even outside the debugger/DB trace modes.
// Useful for pinpointing whether a behaviour regression is the
// block-cache fast path's fault or something else.
let force_per_instr = std::env::var("XENIA_FORCE_PER_INSTR")
.map(|v| v == "1" || v.eq_ignore_ascii_case("true"))
.unwrap_or(false);
// M3.2 / M3-realpar Step 02 — six per-HW-slot worker contexts, each
// owning its own block cache + decode cache. Under per-HW-thread
// parallelism (Step 04) each WorkerCtx will be moved into its own
// worker thread; in lockstep it's six independent caches on the
// same host thread. The block cache (1.5 MiB-ish per slot) lives
// here so the prologue/epilogue split can borrow it directly.
let mut workers: [WorkerCtx; xenia_cpu::scheduler::HW_THREAD_COUNT] =
std::array::from_fn(|i| WorkerCtx::new(i as u8, force_per_instr));
'outer: loop {
// Per-round prologue: budget / shutdown / heartbeat / vsync /
// timers / graphics-interrupt injection. Carved into
// `coord_pre_round` so the parallel scheduler (Step 03+) can
// call the same coordination logic between phaser barriers
// without duplicating it from the lockstep path.
match coord_pre_round(
kernel,
&stats,
max_instructions,
ips_limit,
throttle_start,
&shutdown,
) {
RoundCtl::BreakOuter => break,
RoundCtl::Continue => {}
}
// Snapshot round schedule. `round_schedule` also advances rng state
// when seeded; mutation is intentional.
kernel.scheduler.begin_round();
let order = kernel.scheduler.round_schedule();
if order.is_empty() {
// No Ready threads — advance time to the earliest pending
// deadline, fire timers, handle deadline wakes, and on hard
// deadlock either halt (with diagnostics) or force-wake.
// Logic carved into `coord_idle_advance`.
match coord_idle_advance(kernel, halt_on_deadlock, &shutdown, &stats) {
RoundCtl::BreakOuter => break,
RoundCtl::Continue => continue,
}
}
// Tier-4 perf: track guest-instruction progress within this
// outer round so the post-round GPU pacer can run the GPU at a
// rate proportional to executed instructions. With the
// block-cache fast path executing up to MAX_BLOCK_INSTRS guest
// instructions per inner iteration vs the per-instruction
// path's 1, a fixed "once per round" pace would starve the
// GPU when block dispatch engages.
let instrs_at_round_start = stats.instruction_count;
for hw_id in order {
let wc = &mut workers[hw_id as usize];
match worker_prologue(
wc,
kernel,
mem,
debugger,
&mut db_writer,
thunk_map,
&mut stats,
) {
PrologueOutcome::Continue => continue,
PrologueOutcome::BreakOuter => break 'outer,
PrologueOutcome::StepBlock {
tid,
thread_ref,
block_ptr,
pc_before,
} => {
// Block-cache step. The lockstep path keeps the
// kernel state borrowed straight through (single
// host thread, no contention). Step 03 of the
// M3 real-parallelism plan introduces a
// drop-and-reacquire window around `step_block`
// for the parallel branch.
let cycle_before = kernel.scheduler.ctx_mut_ref(thread_ref).cycle_count;
let block = unsafe { &*block_ptr };
let result = {
let ctx = kernel.scheduler.ctx_mut_ref(thread_ref);
step_block(ctx, mem, block)
};
let executed = kernel
.scheduler
.ctx_mut_ref(thread_ref)
.cycle_count
.saturating_sub(cycle_before);
match worker_epilogue(
wc,
kernel,
debugger,
&mut stats,
tid,
thread_ref,
block_ptr,
pc_before,
result,
executed,
) {
SlotOutcome::Continue => continue,
SlotOutcome::BreakOuter => break 'outer,
}
}
}
}
// Per-round epilogue: end the slot visit, drive the inline
// GPU proportional to instructions executed, drain GPU-side
// pending interrupts, and break the outer loop if no live HW
// threads remain. Carved into `coord_post_round`.
match coord_post_round(kernel, mem, &stats, instrs_at_round_start) {
RoundCtl::BreakOuter => break,
RoundCtl::Continue => {}
}
}
stats
}
/// M3 real-parallelism (Step 04) — N=6 worker threads + main-thread
/// coordinator + 7-party phaser. Workers each own a `WorkerCtx`
/// (per-slot block + decode caches), call `worker_prologue` /
/// `step_block` / `worker_epilogue` on their slot, releasing the
/// kernel mutex around the lock-free `step_block`. Coordinator runs
/// `coord_pre_round` / `coord_idle_advance` / `coord_post_round`
/// between two phaser barriers per round.
///
/// Phaser (party_count = 7):
/// - B1 (round-start gate): coordinator publishes runnable mask + drops
/// kernel lock; workers wait. After B1 releases, workers see the
/// mask and either (a) acquire the lock and process their slot,
/// or (b) `phaser.skip(hw_id)` to skip B2.
/// - B2 (round-end gate): workers signal completion. Coordinator
/// reacquires the kernel lock and runs `coord_post_round`.
///
/// Per-worker locking dance (when runnable):
/// 1. Acquire kernel lock (stats lock briefly nested).
/// 2. `worker_prologue` → `Continue` | `BreakOuter` | `StepBlock`.
/// 3. On `StepBlock`: `mem::replace` ctx out; clear `scheduler.current`
/// via `end_slot_visit`; drop kernel lock.
/// 4. `step_block` on the local ctx (no lock held — actual parallelism).
/// 5. Reacquire lock; `find_by_tid(tid)` to resolve post-migration
/// ThreadRef; write ctx back; set `scheduler.current = Some(target_ref)`
/// for the epilogue's `exit_current` path.
/// 6. `worker_epilogue` (under lock + stats lock briefly).
/// 7. Clear `scheduler.current = None`; drop kernel lock.
/// 8. `phaser.arrive_and_wait(hw_id)` for B2.
///
/// Lock order: kernel lock first, stats mutex second. No inversion.
/// Workers never take stats without kernel; coordinator follows the
/// same order.
///
/// The parallel path is incompatible with debugger hooks, the DB
/// writer, and `XENIA_FORCE_PER_INSTR=1` (these all force the
/// per-instruction path which serializes the per-instruction
/// `step_cached` under the kernel lock — no parallelism gain). We
/// panic at entry rather than degrade silently.
fn run_execution_parallel(
mem: &xenia_memory::GuestMemory,
kernel_arc: &std::sync::Arc<std::sync::Mutex<xenia_kernel::KernelState>>,
debugger: &mut xenia_debugger::Debugger,
thunk_map: &HashMap<u32, (ModuleId, u16, String)>,
mut db_writer: Option<&mut xenia_analysis::DbWriter>,
max_instructions: Option<u64>,
ips_limit: Option<u64>,
quiet: bool,
halt_on_deadlock: bool,
shutdown_outer: Option<std::sync::Arc<std::sync::atomic::AtomicBool>>,
) -> ExecStats {
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::{Arc, Mutex};
use xenia_cpu::interpreter::step_block;
use xenia_cpu::{Phaser, PhaserOutcome};
let _ = quiet;
let _ = debugger; // Verified incompatible just below; kept in signature for symmetry with run_execution.
// Defense-in-depth: cmd_exec_inner should already gate these out
// with a friendly error, but if we got here with hooks attached,
// the parallel path can't honor them.
assert!(
!debugger.wants_hooks(),
"--parallel mode is incompatible with debugger hooks (per-instruction path required)"
);
assert!(
db_writer.is_none(),
"--parallel mode is incompatible with --db / DB writer"
);
assert!(
!std::env::var("XENIA_FORCE_PER_INSTR")
.map(|v| v == "1" || v.eq_ignore_ascii_case("true"))
.unwrap_or(false),
"--parallel mode is incompatible with XENIA_FORCE_PER_INSTR=1"
);
let _ = (debugger, db_writer.take()); // suppress 'unused mut' / 'unused' on the bound param
let halt_on_deadlock = halt_on_deadlock
|| std::env::var("XENIA_HALT_ON_DEADLOCK")
.map(|v| v == "1" || v.eq_ignore_ascii_case("true"))
.unwrap_or(false);
let throttle_start = Instant::now();
const COORD_ID: u8 = xenia_cpu::scheduler::HW_THREAD_COUNT as u8; // = 6
const PARTY_COUNT: u32 = xenia_cpu::scheduler::HW_THREAD_COUNT as u32 + 1;
let phaser: Arc<Phaser> = Arc::new(Phaser::new(PARTY_COUNT));
let internal_shutdown: Arc<AtomicBool> = Arc::new(AtomicBool::new(false));
let runnable_mask: [Arc<AtomicBool>; xenia_cpu::scheduler::HW_THREAD_COUNT] =
std::array::from_fn(|_| Arc::new(AtomicBool::new(false)));
// (Reserved for the race-free parking work deferred from Step 05;
// see the Step 05 memo for the analysis. For now the per-worker
// handle table is unused — workers always arrive at B1 and the
// coordinator never calls unpark.)
// Stats lives behind a Mutex shared between coordinator and workers.
// Lock order: kernel mutex first, stats mutex second. Stats is
// accessed only briefly (the duration of a helper invocation).
let stats_mtx: Mutex<ExecStats> = Mutex::new(ExecStats::default());
std::thread::scope(|scope| {
for hw_id in 0..xenia_cpu::scheduler::HW_THREAD_COUNT as u8 {
let phaser_w: Arc<Phaser> = phaser.clone();
let kernel_w = kernel_arc.clone();
let shutdown_w: Arc<AtomicBool> = internal_shutdown.clone();
let runnable_w: Arc<AtomicBool> = runnable_mask[hw_id as usize].clone();
let stats_ref: &Mutex<ExecStats> = &stats_mtx;
let mem_ref: &xenia_memory::GuestMemory = mem;
let thunk_map_ref: &HashMap<u32, (ModuleId, u16, String)> = thunk_map;
scope
.spawn(move || {
// Workers run with no debugger hooks and no DB
// writer (asserted in run_execution_parallel
// entry). The local debugger is constructed in
// the same "cold-run" shape as the main-thread
// one (paused=false, step_mode=Run,
// trace_enabled=false) so wants_hooks() is false
// and worker_prologue takes the block-cache fast
// path. Without this `Debugger::new()` defaults
// to paused/trace-enabled and worker_prologue's
// per-instruction path returns BreakOuter on the
// first should_break check.
let mut wc = WorkerCtx::new(hw_id, /*force_per_instr=*/false);
let mut local_debugger = xenia_debugger::Debugger::new();
local_debugger.paused = false;
local_debugger.step_mode = xenia_debugger::StepMode::Run;
local_debugger.trace_enabled = false;
let mut local_db_writer: Option<&mut xenia_analysis::DbWriter> = None;
'worker: loop {
// B1: round-start gate. Workers ALWAYS arrive
// at B1 — the phaser advance is the only race-
// free synchronization with the coordinator's
// mask publish. Reading runnable_w only after
// B1's Release-Acquire pair guarantees we see
// the current round's mask, not a stale one.
match phaser_w.arrive_and_wait(hw_id) {
PhaserOutcome::Advanced => {}
PhaserOutcome::Shutdown => break 'worker,
PhaserOutcome::Timeout => {
tracing::warn!(
hw_id,
"worker: phaser B1 timeout — peer hung; shutting down"
);
shutdown_w.store(true, Ordering::Release);
phaser_w.shutdown();
break 'worker;
}
}
if shutdown_w.load(Ordering::Acquire) {
break 'worker;
}
// Read mask AFTER B1 advance (Acquire pair).
if !runnable_w.load(Ordering::Acquire) {
phaser_w.skip(hw_id);
continue;
}
// We have work. Acquire the kernel lock.
let prologue_outcome = {
let mut guard = kernel_w.lock().expect("kernel mutex poisoned");
let mut s = stats_ref.lock().expect("stats mutex poisoned");
let r = worker_prologue(
&mut wc,
&mut *guard,
mem_ref,
&mut local_debugger,
&mut local_db_writer,
thunk_map_ref,
&mut *s,
);
// Drop stats lock first (reverse acquisition order).
drop(s);
// Don't drop the kernel guard yet for the
// StepBlock arm — we need to mem::replace
// before unlock.
(r, guard)
};
match prologue_outcome.0 {
PrologueOutcome::Continue => {
// Halt-sentinel restore, thunk
// dispatch, or per-instruction step
// ran inline. Drop the lock and arrive
// at B2.
drop(prologue_outcome.1);
}
PrologueOutcome::BreakOuter => {
drop(prologue_outcome.1);
shutdown_w.store(true, Ordering::Release);
phaser_w.shutdown();
break 'worker;
}
PrologueOutcome::StepBlock {
tid,
thread_ref,
block_ptr,
pc_before,
} => {
let mut guard = prologue_outcome.1;
// Snapshot ctx into a local; replace
// the in-scheduler ctx with a fresh
// (zeroed) PpcContext so peers can't
// observe stale state.
let mut ctx_taken = std::mem::replace(
guard.scheduler.ctx_mut_ref(thread_ref),
xenia_cpu::PpcContext::new(),
);
let cycle_before = ctx_taken.cycle_count;
// Clear scheduler.current so peers
// don't see this slot as "running"
// while the lock is unheld.
guard.scheduler.end_slot_visit();
drop(guard);
// ── unlocked window ───────────────
let block = unsafe { &*block_ptr };
let result = step_block(&mut ctx_taken, mem_ref, block);
let executed = ctx_taken
.cycle_count
.saturating_sub(cycle_before);
// ──────────────────────────────────
let mut guard = kernel_w.lock().expect("kernel mutex poisoned");
// find_by_tid handles cross-worker
// migration (a peer's call_export may
// have moved this thread between slots).
let target_ref = tid
.and_then(|t| guard.scheduler.find_by_tid(t))
.unwrap_or(thread_ref);
*guard.scheduler.ctx_mut_ref(target_ref) = ctx_taken;
// worker_epilogue's exit_current path
// expects scheduler.current to be set
// to the running thread.
guard.scheduler.current = Some(target_ref);
let epilogue_outcome = {
let mut s =
stats_ref.lock().expect("stats mutex poisoned");
let r = worker_epilogue(
&mut wc,
&mut *guard,
&mut local_debugger,
&mut *s,
tid,
target_ref,
block_ptr,
pc_before,
result,
executed,
);
drop(s);
r
};
// Clear current after epilogue so the
// next worker that picks up the lock
// doesn't observe it.
guard.scheduler.current = None;
drop(guard);
if matches!(epilogue_outcome, SlotOutcome::BreakOuter) {
shutdown_w.store(true, Ordering::Release);
phaser_w.shutdown();
break 'worker;
}
}
}
// B2: round-end gate. Coordinator wakes when
// all 6 workers + coordinator have arrived
// (or skipped B2).
match phaser_w.arrive_and_wait(hw_id) {
PhaserOutcome::Advanced => {}
PhaserOutcome::Shutdown => break 'worker,
PhaserOutcome::Timeout => {
tracing::warn!(
hw_id,
"worker: phaser B2 timeout — peer hung; shutting down"
);
shutdown_w.store(true, Ordering::Release);
phaser_w.shutdown();
break 'worker;
}
}
}
});
}
// Coordinator runs on the main (calling) thread.
'outer: loop {
// Pre-round under kernel + stats locks.
let pre_outcome = {
let mut guard = kernel_arc.lock().expect("kernel mutex poisoned");
let r = {
let s = stats_mtx.lock().expect("stats mutex poisoned");
coord_pre_round(
&mut *guard,
&*s,
max_instructions,
ips_limit,
throttle_start,
&shutdown_outer,
)
};
(r, guard)
};
match pre_outcome.0 {
RoundCtl::BreakOuter => {
drop(pre_outcome.1);
internal_shutdown.store(true, Ordering::Release);
phaser.shutdown();
break 'outer;
}
RoundCtl::Continue => {}
}
let mut guard = pre_outcome.1;
guard.scheduler.begin_round();
let order = guard.scheduler.round_schedule();
// Publish runnable mask for workers. All 6 workers arrive
// at B1 unconditionally; the mask is published before
// coord arrives so the phaser's Release on advance pairs
// with workers' Acquire-after-B1 read of the mask.
for id in 0..xenia_cpu::scheduler::HW_THREAD_COUNT as u8 {
let runnable = order.contains(&id);
runnable_mask[id as usize].store(runnable, Ordering::Release);
}
let instrs_at_round_start =
stats_mtx.lock().expect("stats mutex poisoned").instruction_count;
if order.is_empty() {
let r = {
let s = stats_mtx.lock().expect("stats mutex poisoned");
coord_idle_advance(&mut *guard, halt_on_deadlock, &shutdown_outer, &*s)
};
drop(guard);
match r {
RoundCtl::BreakOuter => {
internal_shutdown.store(true, Ordering::Release);
phaser.shutdown();
break 'outer;
}
RoundCtl::Continue => {
// Workers see runnable_mask all-false at B1 and
// skip B2. Coordinator participates in both
// barriers so the phase advances normally.
match phaser.arrive_and_wait(COORD_ID) {
PhaserOutcome::Advanced => {}
PhaserOutcome::Shutdown => break 'outer,
PhaserOutcome::Timeout => {
tracing::warn!(
"coordinator: phaser B1 (idle) timeout; shutting down"
);
internal_shutdown.store(true, Ordering::Release);
phaser.shutdown();
break 'outer;
}
}
match phaser.arrive_and_wait(COORD_ID) {
PhaserOutcome::Advanced => {}
PhaserOutcome::Shutdown => break 'outer,
PhaserOutcome::Timeout => {
tracing::warn!(
"coordinator: phaser B2 (idle) timeout; shutting down"
);
internal_shutdown.store(true, Ordering::Release);
phaser.shutdown();
break 'outer;
}
}
continue;
}
}
}
drop(guard); // Release lock so workers can acquire for prologue/epilogue.
// B1: signal workers to start. All 6 workers arrive at
// B1 unconditionally; coord is the 7th party.
match phaser.arrive_and_wait(COORD_ID) {
PhaserOutcome::Advanced => {}
PhaserOutcome::Shutdown => break 'outer,
PhaserOutcome::Timeout => {
tracing::warn!("coordinator: phaser B1 timeout; shutting down");
internal_shutdown.store(true, Ordering::Release);
phaser.shutdown();
break 'outer;
}
}
// Workers process slots concurrently. Idle-slot workers
// skip B2 (the coordinator does NOT skip on their behalf —
// see the race analysis in the Step 05 memo).
// B2: wait for all workers to finish.
match phaser.arrive_and_wait(COORD_ID) {
PhaserOutcome::Advanced => {}
PhaserOutcome::Shutdown => break 'outer,
PhaserOutcome::Timeout => {
tracing::warn!("coordinator: phaser B2 timeout; shutting down");
internal_shutdown.store(true, Ordering::Release);
phaser.shutdown();
break 'outer;
}
}
// All workers quiescent. Reacquire lock for post-round.
let post_outcome = {
let mut guard = kernel_arc.lock().expect("kernel mutex poisoned");
let r = {
let s = stats_mtx.lock().expect("stats mutex poisoned");
coord_post_round(&mut *guard, mem, &*s, instrs_at_round_start)
};
drop(guard);
r
};
match post_outcome {
RoundCtl::BreakOuter => {
internal_shutdown.store(true, Ordering::Release);
phaser.shutdown();
break 'outer;
}
RoundCtl::Continue => {}
}
}
}); // <- thread::scope joins all workers here.
stats_mtx.into_inner().expect("stats mutex poisoned")
}
/// First-Pixels M2 — inject a queued graphics interrupt into HW thread 0
/// when it's safe to do so (callback registered, no interrupt already
/// running). Called at the top of each scheduler round.
///
/// Unlike the earlier P6 version which only delivered when HW 0 was
/// `Ready`, this one also delivers when HW 0 is `Blocked`: the injector
/// stashes the block reason into the new `HwState::ServicingIrq(reason)`
/// variant, flips the thread to that state so `round_schedule` runs it,
/// and — on callback return to `LR_HALT_SENTINEL` — the restore path
/// re-creates `Blocked(reason)`, unless a `wake()` during the callback
/// (e.g. `KeSetEvent` → `wake_eligible_waiters`) flipped it to `Ready`,
/// in which case the wait was resolved and we leave it.
///
/// This is the fix that unblocks games (like Sylpheed) which gate their
/// main loop on a v-sync callback signaling an event the main thread
/// waits on. The earlier "only-when-Ready" policy dropped 397 of 399
/// observed v-syncs on a 1 B-instruction Sylpheed probe; now they
/// actually get delivered.
fn try_inject_graphics_interrupt(kernel: &mut xenia_kernel::KernelState) {
use xenia_cpu::scheduler::HwState;
if kernel.interrupts.is_in_callback() {
return;
}
let Some(cb) = kernel.interrupts.callback else {
// No callback registered; drain any pending entries (they
// wouldn't have made it into the queue per `queue_interrupt`'s
// own `callback.is_none()` guard, but be defensive).
kernel.interrupts.pending.clear();
return;
};
let Some(source) = kernel.interrupts.peek_next() else {
return;
};
// Canary's `EmulateCPInterruptDPC` (kernel_state.cc:1373) dispatches on
// whatever the current thread happens to be — real hardware fires the
// interrupt on CPU 2 and the kernel impersonates a DPC on top of
// whichever thread is active. Hard-anchoring to HW 0 breaks the moment
// `main()` returns: Sylpheed's main thread exits right after init, the
// render worker spins on a `PKEVENT` inside the interrupt callback's
// user_data struct (`user_data + 0x5C`), and because HW 0 is now
// `Exited(_)` our injector drops every subsequent vsync — the PKEVENT
// is never signaled and the worker polls forever.
//
// Pick the first HW thread we can plausibly run the callback on:
// 1. Prefer `Ready` (no state-mangling needed)
// 2. Else take a `Blocked(reason)` thread and swap to
// `ServicingIrq(reason)` so the round scheduler runs it; the
// LR-sentinel restore path reinstates the block on callback return
// 3. Skip `Idle`, `Exited`, or already-`ServicingIrq` slots
//
// The callback itself just signals a game-side event and returns — it
// doesn't care which HW thread it ran on.
// Pass 1: find any Ready thread across all slots.
let mut victim: Option<xenia_cpu::ThreadRef> = None;
'outer_ready: for (hw_id, slot) in kernel.scheduler.slots.iter().enumerate() {
for (idx, t) in slot.runqueue.iter().enumerate() {
if matches!(t.state, HwState::Ready) {
victim = Some(xenia_cpu::ThreadRef::new(hw_id as u8, idx as u16));
break 'outer_ready;
}
}
}
// Pass 2: any Blocked thread (we'll flip it to ServicingIrq).
if victim.is_none() {
'outer_blocked: for (hw_id, slot) in kernel.scheduler.slots.iter().enumerate() {
for (idx, t) in slot.runqueue.iter().enumerate() {
if matches!(t.state, HwState::Blocked(_)) {
victim = Some(xenia_cpu::ThreadRef::new(hw_id as u8, idx as u16));
break 'outer_blocked;
}
}
}
}
let Some(target_ref) = victim else {
// All threads Idle/Exited/already servicing — nothing to inject on.
kernel.interrupts.take_next();
kernel.interrupts.dropped += 1;
return;
};
let t = kernel.scheduler.thread_mut(target_ref);
let prev_state = t.state.clone();
match prev_state {
HwState::Ready => {}
HwState::Blocked(reason) => {
t.state = HwState::ServicingIrq(reason);
}
_ => unreachable!("victim selection above filtered out other variants"),
}
let _ = kernel.interrupts.take_next();
let t = kernel.scheduler.thread_mut(target_ref);
let saved = xenia_kernel::SavedCallbackCtx::capture(&t.ctx, source);
kernel.interrupts.injected_ref = Some(target_ref);
t.ctx.pc = cb.callback_pc;
t.ctx.lr = xenia_cpu::context::LR_HALT_SENTINEL;
// Canary `Processor::Execute` decrements the guest SP by 176 before
// running the callback and restores on return (see Canary
// processor.cc:383). Without this pad the callback's
// `__savegprlr_N` prologue stomps the interrupted function's
// already-saved LR at [r1-8], so when the interrupted function
// later returns via `__restgprlr_N -> bclr` it jumps to
// `LR_HALT_SENTINEL` and the thread exits prematurely. Matching
// restore lives in `SavedCallbackCtx::restore` (which now also
// restores r1).
t.ctx.gpr[1] = t
.ctx
.gpr[1]
.wrapping_sub(xenia_kernel::interrupts::CALLBACK_STACK_PAD as u64);
t.ctx.gpr[3] = source as u64;
t.ctx.gpr[4] = cb.user_data as u64;
kernel.interrupts.saved = Some(saved);
metrics::counter!("gpu.interrupt.delivered", "source" => format!("{source}"))
.increment(1);
tracing::debug!(
source,
hw_id = target_ref.hw_id,
idx = target_ref.idx,
callback = format_args!("{:#010x}", cb.callback_pc),
"graphics interrupt: injecting"
);
}
/// APUBUG-PRODUCER-001 — inject a pending XAudio buffer-complete callback
/// into the next available HW thread. Mirrors
/// [`try_inject_graphics_interrupt`] (same victim-selection policy, same
/// SP-pad, same saved-context restore-on-sentinel) but pulls the callback
/// PC + r3 argument from a registered [`xenia_kernel::xaudio::XAudioClient`]
/// instead of the graphics-callback registration.
///
/// Mutual exclusion with the graphics path is via the shared
/// `interrupts.saved` slot — if a graphics callback is already in flight,
/// `is_in_callback()` returns true and we bail until it returns to the
/// `LR_HALT_SENTINEL`.
fn try_inject_audio_callback(kernel: &mut xenia_kernel::KernelState) {
use xenia_cpu::scheduler::HwState;
if kernel.interrupts.is_in_callback() {
return;
}
let Some(index) = kernel.xaudio.peek_next() else {
return;
};
let Some(client) = kernel.xaudio.get(index) else {
let _ = kernel.xaudio.take_next();
kernel.xaudio.dropped += 1;
return;
};
let mut victim: Option<xenia_cpu::ThreadRef> = None;
'outer_ready: for (hw_id, slot) in kernel.scheduler.slots.iter().enumerate() {
for (idx, t) in slot.runqueue.iter().enumerate() {
if matches!(t.state, HwState::Ready) {
victim = Some(xenia_cpu::ThreadRef::new(hw_id as u8, idx as u16));
break 'outer_ready;
}
}
}
if victim.is_none() {
'outer_blocked: for (hw_id, slot) in kernel.scheduler.slots.iter().enumerate() {
for (idx, t) in slot.runqueue.iter().enumerate() {
if matches!(t.state, HwState::Blocked(_)) {
victim = Some(xenia_cpu::ThreadRef::new(hw_id as u8, idx as u16));
break 'outer_blocked;
}
}
}
}
let Some(target_ref) = victim else {
let _ = kernel.xaudio.take_next();
kernel.xaudio.dropped += 1;
return;
};
let t = kernel.scheduler.thread_mut(target_ref);
let prev_state = t.state.clone();
match prev_state {
HwState::Ready => {}
HwState::Blocked(reason) => {
t.state = HwState::ServicingIrq(reason);
}
_ => unreachable!("victim selection above filtered out other variants"),
}
let _ = kernel.xaudio.take_next();
let t = kernel.scheduler.thread_mut(target_ref);
let saved = xenia_kernel::SavedCallbackCtx::capture(
&t.ctx,
xenia_kernel::INTERRUPT_SOURCE_AUDIO,
);
kernel.interrupts.injected_ref = Some(target_ref);
t.ctx.pc = client.callback_pc;
t.ctx.lr = xenia_cpu::context::LR_HALT_SENTINEL;
t.ctx.gpr[1] = t
.ctx
.gpr[1]
.wrapping_sub(xenia_kernel::interrupts::CALLBACK_STACK_PAD as u64);
t.ctx.gpr[3] = client.wrapped_callback_arg as u64;
kernel.interrupts.saved = Some(saved);
kernel.xaudio.delivered += 1;
metrics::counter!("xaudio.callback.delivered").increment(1);
tracing::debug!(
index,
hw_id = target_ref.hw_id,
idx = target_ref.idx,
callback = format_args!("{:#010x}", client.callback_pc),
wrapped = format_args!("{:#010x}", client.wrapped_callback_arg),
"xaudio callback: injecting"
);
}
fn print_summary(
ctx: &xenia_cpu::PpcContext,
debugger: &xenia_debugger::Debugger,
db_writer: &Option<xenia_analysis::DbWriter>,
quiet: bool,
) {
if !quiet {
println!("\n=== Final State ===");
println!("PC: {:#010x}", ctx.pc);
println!("LR: {:#010x}", ctx.lr as u32);
println!("CTR: {:#010x}", ctx.ctr as u32);
println!("CR: {:#010x}", ctx.cr());
println!("XER: CA={} OV={} SO={}", ctx.xer_ca, ctx.xer_ov, ctx.xer_so);
for i in 0..32 {
if ctx.gpr[i] != 0 {
println!("r{:<2}: {:#018x}", i, ctx.gpr[i]);
}
}
}
if db_writer.is_none() {
info!(entries = debugger.trace_log.len(), "in-memory trace log");
}
}
/// Dump a per-thread + per-handle diagnostic at end-of-run. Lets the next
/// session see *which* thread is stuck *where* without having to re-derive
/// it from ring-buffer / event-count math. Mirrors the halt-on-deadlock
/// dump in `run_execution` but runs on the normal `-n` exit path too.
fn dump_thread_diagnostic(kernel: &xenia_kernel::KernelState, quiet: bool) {
if quiet {
return;
}
use xenia_kernel::objects::KernelObject;
println!("\n=== Thread diagnostics ===");
for (hw_id, slot) in kernel.scheduler.slots.iter().enumerate() {
if slot.runqueue.is_empty() {
println!(" hw={} (empty)", hw_id);
continue;
}
for (idx, t) in slot.runqueue.iter().enumerate() {
println!(
" hw={} idx={} tid={} state={:?} pc={:#010x} lr={:#010x} sp={:#010x}",
hw_id,
idx,
t.tid,
t.state,
t.ctx.pc,
t.ctx.lr as u32,
t.ctx.gpr[1] as u32,
);
println!(
" r0={:#010x} r3={:#010x} r4={:#010x} r5={:#010x} r6={:#010x} r7={:#010x}",
t.ctx.gpr[0] as u32,
t.ctx.gpr[3] as u32,
t.ctx.gpr[4] as u32,
t.ctx.gpr[5] as u32,
t.ctx.gpr[6] as u32,
t.ctx.gpr[7] as u32,
);
println!(
" r8={:#010x} r9={:#010x} r10={:#010x} r11={:#010x} r12={:#010x} r13={:#010x}",
t.ctx.gpr[8] as u32,
t.ctx.gpr[9] as u32,
t.ctx.gpr[10] as u32,
t.ctx.gpr[11] as u32,
t.ctx.gpr[12] as u32,
t.ctx.gpr[13] as u32,
);
}
}
let tid_of = |r: &xenia_cpu::ThreadRef| -> u32 {
kernel
.scheduler
.slots
.get(r.hw_id as usize)
.and_then(|s| s.runqueue.get(r.idx as usize))
.map(|t| t.tid)
.unwrap_or(0)
};
let mut any_obj = false;
for (h, obj) in kernel.objects.iter() {
let (kind, waiters) = match obj {
KernelObject::Event { signaled, manual_reset, waiters } => {
(format!("Event(sig={}, mr={})", signaled, manual_reset), waiters)
}
KernelObject::Semaphore { count, max, waiters } => {
(format!("Semaphore({}/{})", count, max), waiters)
}
KernelObject::Thread { id, exit_code, waiters, .. } => {
(format!("Thread(id={}, exit={:?})", id, exit_code), waiters)
}
KernelObject::Timer { signaled, deadline, waiters, .. } => {
(format!("Timer(sig={}, deadline={:?})", signaled, deadline), waiters)
}
KernelObject::Mutex { owner, recursion, waiters } => {
(format!("Mutex(owner={:?}, rec={})", owner, recursion), waiters)
}
KernelObject::File { .. } => continue,
};
if waiters.is_empty() {
continue;
}
if !any_obj {
println!("\n -- Handle waiter lists --");
any_obj = true;
}
let tids: Vec<u32> = waiters.iter().map(&tid_of).collect();
println!(" handle={:#010x} {} waiters(tid)={:?}", h, kind, tids);
}
if !kernel.cs_waiters.is_empty() {
let mut any_cs = false;
for (cs_ptr, waiters) in kernel.cs_waiters.iter() {
if waiters.is_empty() {
continue;
}
if !any_cs {
println!("\n -- Critical section waiters --");
any_cs = true;
}
let tids: Vec<u32> = waiters.iter().map(&tid_of).collect();
println!(" cs={:#010x} waiters(tid)={:?}", cs_ptr, tids);
}
}
// Audit trails (only when --trace-handles flipped the flag). For each
// tracked handle, emit a compact block: kind, creator, and the bounded
// ring of signal/wait/wake events. Highlight handles with waiters and
// zero signals — the smoking gun for a missing kernel API.
if kernel.audit.enabled {
println!("\n=== Handle audit trails ===");
// Suspect set from project memory: handles stuck on Sylpheed boot.
const SUSPECT: [u32; 5] = [0x10FC, 0x1014, 0x1104, 0x10DC, 0x10F0];
// Sort by handle for stable output.
let mut handles: Vec<u32> = kernel.audit.trails.keys().copied().collect();
handles.sort();
for h in handles {
let trail = &kernel.audit.trails[&h];
let waiter_count = kernel
.objects
.get(&h)
.and_then(|o| match o {
KernelObject::Event { waiters, .. }
| KernelObject::Semaphore { waiters, .. }
| KernelObject::Thread { waiters, .. }
| KernelObject::Timer { waiters, .. }
| KernelObject::Mutex { waiters, .. } => Some(waiters.len()),
KernelObject::File { .. } => None,
})
.unwrap_or(0);
let suspect = SUSPECT.contains(&h);
let interesting = waiter_count > 0
|| trail.signals.is_empty() && !trail.waits.is_empty()
|| suspect;
if !interesting {
continue;
}
let marker = if suspect { " <SUSPECT>" } else { "" };
let smoking = if !trail.waits.is_empty() && trail.signals.is_empty() {
" <NO_SIGNALS_DESPITE_WAITS>"
} else {
""
};
println!(
" handle={:#010x} kind={} waiters={} signals={} waits={} wakes={}{}{}",
h,
trail.kind,
waiter_count,
trail.signals.len(),
trail.waits.len(),
trail.wakes.len(),
marker,
smoking,
);
println!(
" created cycle={} tid={} lr={:#010x} src={}",
trail.created.cycle, trail.created.tid, trail.created.lr, trail.created.source,
);
// Print the first few of each ring — full dump can be re-run with
// a larger AUDIT_RING_CAPACITY if needed.
let head = 4usize;
for (label, ring) in [
("waits", &trail.waits),
("signals", &trail.signals),
("wakes", &trail.wakes),
] {
if ring.is_empty() {
continue;
}
println!(" {} ({}):", label, ring.len());
for (i, e) in ring.iter().take(head).enumerate() {
println!(
" [{:>2}] cycle={} tid={} lr={:#010x} src={} aux={:#x}",
i, e.cycle, e.tid, e.lr, e.source, e.aux,
);
}
if ring.len() > head {
println!(" ... ({} more)", ring.len() - head);
}
}
}
// KRNBUG-AUDIT-001: per-handle FOCUS report. For each handle in
// `--trace-handles-focus`, emit a unified diagnosis: how many
// signal attempts (primary + ghost) we saw, classified by
// producer-class (GuestExport vs KernelInternal vs
// InterruptCallback), so we can deterministically distinguish:
// - "guest never tried" → producer is a missing kernel signal source
// - "signal landed but waiter wasn't woken" → wake-eligibility bug
if !kernel.audit.focus.is_empty() {
println!("\n=== Handle audit (focus) ===");
let mut focus: Vec<u32> = kernel.audit.focus.iter().copied().collect();
focus.sort();
for h in focus {
let waiter_count = kernel
.objects
.get(&h)
.and_then(|o| match o {
KernelObject::Event { waiters, .. }
| KernelObject::Semaphore { waiters, .. }
| KernelObject::Thread { waiters, .. }
| KernelObject::Timer { waiters, .. }
| KernelObject::Mutex { waiters, .. } => Some(waiters.len()),
KernelObject::File { .. } => None,
})
.unwrap_or(0);
let signaled = kernel
.objects
.get(&h)
.and_then(|o| match o {
KernelObject::Event { signaled, .. } => Some(*signaled),
_ => None,
});
let primary = kernel.audit.trails.get(&h);
let ghost = kernel.audit.ghost_trails.get(&h);
let kind_str = primary.map(|t| t.kind).unwrap_or("<UNCREATED>");
let primary_signals = primary.map(|t| t.signals.len()).unwrap_or(0);
let ghost_signals = ghost.map(|g| g.signals.len()).unwrap_or(0);
let total_signals = primary_signals + ghost_signals;
let primary_waits = primary.map(|t| t.waits.len()).unwrap_or(0);
let primary_wakes = primary.map(|t| t.wakes.len()).unwrap_or(0);
let audit_blind =
waiter_count > 0 && primary_waits == 0 && primary_wakes == 0;
println!(
" handle={:#010x} kind={} waiters={} signaled={} signal_attempts={} (primary={}, ghost={}) waits={} wakes={}{}",
h,
kind_str,
waiter_count,
signaled
.map(|s| if s { "true" } else { "false" })
.unwrap_or("?"),
total_signals,
primary_signals,
ghost_signals,
primary_waits,
primary_wakes,
if audit_blind { " <AUDIT_BLIND>" } else { "" },
);
if let Some(t) = primary {
println!(
" created cycle={} tid={} lr={:#010x} src={}",
t.created.cycle, t.created.tid, t.created.lr, t.created.source,
);
}
// Producer-class classification. Source strings are stable
// labels passed to `record_signal` at the export sites.
// Anything not in the GuestExport set is treated as
// KernelInternal (e.g. signal_io_completion_event).
let guest_export_sources = [
"KeSetEvent",
"NtSetEvent",
"KePulseEvent",
"NtPulseEvent",
"KeReleaseSemaphore",
"NtReleaseSemaphore",
"NtSignalAndWaitForSingleObjectEx",
];
let mut guest_export = 0usize;
let mut kernel_internal = 0usize;
let count_signals = |signals: &std::collections::VecDeque<
xenia_kernel::audit::HandleAuditEntry,
>,
guest_export: &mut usize,
kernel_internal: &mut usize| {
for e in signals {
if guest_export_sources.contains(&e.source) {
*guest_export += 1;
} else {
*kernel_internal += 1;
}
}
};
if let Some(p) = primary {
count_signals(&p.signals, &mut guest_export, &mut kernel_internal);
}
if let Some(g) = ghost {
count_signals(&g.signals, &mut guest_export, &mut kernel_internal);
}
// Compact merged timeline (cycle-sorted) of waits + signals.
let mut tl: Vec<(u64, &str, &xenia_kernel::audit::HandleAuditEntry)> = Vec::new();
if let Some(p) = primary {
for e in &p.waits {
tl.push((e.cycle, "wait", e));
}
for e in &p.signals {
tl.push((e.cycle, "signal", e));
}
}
if let Some(g) = ghost {
for e in &g.signals {
tl.push((e.cycle, "signal[ghost]", e));
}
}
tl.sort_by_key(|(c, _, _)| *c);
if !tl.is_empty() {
println!(" timeline (last 16 of {}):", tl.len());
let start = tl.len().saturating_sub(16);
for (cycle, label, e) in &tl[start..] {
println!(
" cycle={} tid={} lr={:#010x} src={}[{}] aux={:#x}",
cycle, e.tid, e.lr, e.source, label, e.aux,
);
}
}
// -- DIAGNOSIS --
println!(" -- DIAGNOSIS --");
println!(
" GuestExport={} KernelInternal={} waits={}",
guest_export, kernel_internal, primary_waits,
);
let conclusion = if total_signals == 0 && audit_blind {
"waiter parked via non-audited path (CS / spinlock / DPC / sync-primitive helper) and no signals — instrument that wait path or check ObReferenceObjectByHandle plumbing"
} else if total_signals == 0 && primary_waits > 0 {
"producer is a missing kernel signal source (or BST-paradox upstream)"
} else if total_signals == 0 && primary_waits == 0 {
"no waits and no signals — handle may not be reached during this run"
} else if guest_export > 0 && waiter_count > 0 {
"signals landed but waiter still parked — wake-eligibility / mr-bit bug"
} else if kernel_internal > 0 && waiter_count > 0 {
"kernel-internal signal landed but waiter still parked — wake or routing bug"
} else {
"not stuck — signals consumed correctly"
};
println!(" => {}", conclusion);
}
}
}
}
#[allow(clippy::too_many_arguments)]
#[instrument(skip_all, fields(title))]
fn run_with_ui(
title: &str,
mut mem: xenia_memory::GuestMemory,
mut kernel: xenia_kernel::KernelState,
mut debugger: xenia_debugger::Debugger,
thunk_map: HashMap<u32, (ModuleId, u16, String)>,
mut db_writer: Option<xenia_analysis::DbWriter>,
max_instructions: Option<u64>,
ips_limit: Option<u64>,
quiet: bool,
halt_on_deadlock: bool,
started: Instant,
) -> Result<()> {
use winit::event_loop::EventLoop;
let event_loop = EventLoop::<xenia_ui::bridge::SwapEvent>::with_user_event()
.build()
.map_err(|e| anyhow::anyhow!("winit event loop build failed: {e}"))?;
let (ui_handles, kernel_bridge) = xenia_ui::build(event_loop.create_proxy());
kernel.ui = Some(kernel_bridge);
let shutdown = std::sync::Arc::clone(&ui_handles.shutdown);
let title_owned = std::path::Path::new(title)
.file_name()
.and_then(|n| n.to_str())
.unwrap_or(title)
.to_string();
let worker_span = tracing::info_span!("cpu_worker");
let worker = std::thread::Builder::new()
.name("xenia-cpu".into())
.spawn(move || -> Result<(ExecStats, xenia_kernel::KernelState, xenia_debugger::Debugger, Option<xenia_analysis::DbWriter>)> {
let _guard = worker_span.enter();
let stats = run_execution(
&mut mem,
&mut kernel,
&mut debugger,
&thunk_map,
db_writer.as_mut(),
max_instructions,
ips_limit,
quiet,
halt_on_deadlock,
Some(shutdown),
);
if let Some(ref mut db) = db_writer {
db.finalize_traces()?;
}
Ok((stats, kernel, debugger, db_writer))
})
.map_err(|e| anyhow::anyhow!("spawn CPU worker: {e}"))?;
// Run the UI — blocks until the user closes the window or the worker
// flips the shutdown flag itself (e.g. after max_instructions).
xenia_ui::run(event_loop, ui_handles, &title_owned)?;
let (stats, kernel, debugger, db_writer) = match worker.join() {
Ok(res) => res?,
Err(_) => {
return Err(anyhow::anyhow!("CPU worker thread panicked"));
}
};
print_summary(kernel.scheduler.ctx(0), &debugger, &db_writer, quiet);
dump_thread_diagnostic(&kernel, quiet);
info!(
wall_ms = started.elapsed().as_millis() as u64,
instructions = stats.instruction_count,
import_calls = stats.import_count,
unimplemented = stats.unimpl_count,
"exec --ui complete"
);
Ok(())
}
#[instrument(skip_all, fields(path = %path))]
fn cmd_browse(path: &str) -> Result<()> {
use xenia_vfs::VfsDevice;
let disc = xenia_vfs::disc_image::DiscImageDevice::open("disc", std::path::Path::new(path))
.map_err(|e| anyhow::anyhow!("Failed to open disc image: {}", e))?;
println!("=== XISO Contents: {} ===", path);
match disc.list_root() {
Ok(entries) => {
for entry in entries {
let kind = if entry.is_directory { "DIR " } else { "FILE" };
println!(" {} {:>10} {}", kind, entry.size, entry.name);
}
}
Err(e) => tracing::error!(%e, "error listing contents"),
}
Ok(())
}
/// Helper: load XEX, parse header, decompress PE, resolve imports, parse sections.
#[instrument(skip_all, fields(path = %path))]
fn load_and_prepare(path: &str) -> Result<(xenia_xex::Xex2Header, Vec<u8>, Vec<xenia_xex::pe::PeSection>)> {
let data = load_xex_data(path)?;
let mut header = xenia_xex::loader::parse_xex2_header(&data)?;
let entry = xenia_xex::loader::get_entry_point(&header)
.ok_or_else(|| anyhow::anyhow!("No entry point found in XEX2 header"))?;
let base = xenia_xex::loader::get_image_base(&header)
.ok_or_else(|| anyhow::anyhow!("No image base found in XEX2 header"))?;
info!(
entry = format_args!("{:#010x}", entry),
base = format_args!("{:#010x}", base),
"XEX entry/base"
);
let pe_image = xenia_xex::loader::load_image(&data, &header)?;
info!(bytes = pe_image.len(), "image decompressed");
// Resolve import ordinals and record types from the PE image
xenia_xex::loader::resolve_imports(&mut header, &pe_image);
// Parse PE sections
let sections = xenia_xex::pe::parse_sections(&pe_image)?;
info!(sections = sections.len(), "parsed PE sections");
Ok((header, pe_image, sections))
}
#[instrument(skip_all, fields(path = %path))]
fn cmd_extract(path: &str, output_dir: Option<&str>, db_path: Option<&str>) -> Result<()> {
use serde::Serialize;
let (header, pe_image, sections) = load_and_prepare(path)?;
let entry = xenia_xex::loader::get_entry_point(&header).unwrap();
let base = xenia_xex::loader::get_image_base(&header).unwrap();
let image_size = header.security_info.as_ref().map(|s| s.image_size).unwrap_or(0);
// Build JSON-serializable info struct
#[derive(Serialize)]
struct Xex2Info<'a> {
module_flags: u32,
image_base: u32,
entry_point: u32,
image_size: u32,
original_pe_name: Option<&'a str>,
execution_info: &'a Option<xenia_xex::header::ExecutionInfo>,
import_libraries: &'a [xenia_xex::header::ImportLibrary],
sections: &'a [xenia_xex::pe::PeSection],
}
let info = Xex2Info {
module_flags: header.module_flags,
image_base: base,
entry_point: entry,
image_size,
original_pe_name: header.original_pe_name.as_deref(),
execution_info: &header.execution_info,
import_libraries: &header.import_libraries,
sections: &sections,
};
// Determine output directory
let input_path = std::path::Path::new(path);
let out_dir = match output_dir {
Some(d) => std::path::PathBuf::from(d),
None => input_path.parent().unwrap_or(std::path::Path::new(".")).to_path_buf(),
};
std::fs::create_dir_all(&out_dir)?;
let stem = input_path.file_stem()
.and_then(|s| s.to_str())
.unwrap_or("output");
// Write PE image
let pe_path = out_dir.join(format!("{stem}.pe"));
std::fs::write(&pe_path, &pe_image)?;
info!(
path = %pe_path.display(),
bytes = pe_image.len(),
"wrote PE image"
);
// Write JSON metadata
let json_path = out_dir.join(format!("{stem}.xex.json"));
let json = serde_json::to_string_pretty(&info)?;
std::fs::write(&json_path, &json)?;
info!(path = %json_path.display(), "wrote metadata JSON");
// Print summary
let total_imports: usize = header.import_libraries.iter().map(|l| l.imports.len()).sum();
println!("Extracted: {} sections, {} import libraries ({} imports)",
sections.len(), header.import_libraries.len(), total_imports);
if let Some(ref ei) = header.execution_info {
println!("Title ID: 0x{:08X} Media ID: 0x{:08X}", ei.title_id, ei.media_id);
}
// Write base tables to SQLite if requested
if let Some(db) = db_path {
let disasm_info = xenia_analysis::formatter::DisasmInfo {
image_base: base,
entry_point: entry,
original_pe_name: header.original_pe_name.as_deref(),
title_id: header.execution_info.as_ref().map(|e| e.title_id),
media_id: header.execution_info.as_ref().map(|e| e.media_id),
sections: &sections,
import_libraries: &header.import_libraries,
};
info!(db = %db, "writing base tables");
let mut w = xenia_analysis::DbWriter::open_fresh(std::path::Path::new(db))?;
w.write_base(&disasm_info)?;
info!(db = %db, "database written");
}
Ok(())
}
#[instrument(skip_all, fields(path = %path))]
fn cmd_dis(
path: &str,
output: Option<&str>,
db_path: Option<&str>,
json_path: Option<&str>,
analyze: AnalyzeMode,
quiet: bool,
) -> Result<()> {
use std::collections::HashMap;
let started = Instant::now();
let (header, pe_image, sections) = load_and_prepare(path)?;
let entry = xenia_xex::loader::get_entry_point(&header).unwrap();
let base = xenia_xex::loader::get_image_base(&header).unwrap();
// Build import address -> name map
let mut import_map: HashMap<u32, String> = HashMap::new();
for lib in &header.import_libraries {
for imp in &lib.imports {
let resolved = xenia_analysis::resolve_ordinal(&lib.name, imp.ordinal);
let name = match resolved {
Some(n) => format!("{}::{}", lib.name, n),
None => format!("{}::ordinal_{:#06X}", lib.name, imp.ordinal),
};
import_map.insert(imp.address, name);
}
}
info!(thunks = import_map.len(), "resolved import thunks");
// Function analysis
let code_sections: Vec<(u32, u32, u32)> = sections.iter()
.filter(|s| s.is_code())
.map(|s| (s.virtual_address, s.virtual_size, s.flags))
.collect();
let func_analysis = xenia_analysis::func::analyze(&pe_image, base, entry, &code_sections);
info!(functions = func_analysis.functions.len(), "function detection complete");
// Cross-reference analysis
let xref_result = xenia_analysis::xref::analyze_xrefs(
&pe_image, base, entry, &sections, &func_analysis, &import_map,
);
let total_xrefs: usize = xref_result.xrefs.values().map(|v| v.len()).sum();
info!(
labels = xref_result.labels.len(),
xrefs = total_xrefs,
"xref analysis complete"
);
// Build DisasmInfo
let disasm_info = xenia_analysis::formatter::DisasmInfo {
image_base: base,
entry_point: entry,
original_pe_name: header.original_pe_name.as_deref(),
title_id: header.execution_info.as_ref().map(|e| e.title_id),
media_id: header.execution_info.as_ref().map(|e| e.media_id),
sections: &sections,
import_libraries: &header.import_libraries,
};
// SQLite database output (base + ingest + analyze layers)
if let Some(db) = db_path {
info!(db = %db, analyze = ?analyze, "writing database");
let mut w = xenia_analysis::DbWriter::open_fresh(std::path::Path::new(db))?;
w.write_base(&disasm_info)?;
w.ingest_instructions(&pe_image, &disasm_info, &func_analysis, &xref_result.labels)?;
w.write_analysis_results(
&pe_image,
&disasm_info,
&func_analysis,
&xref_result.labels,
&xref_result.xrefs,
)?;
if matches!(analyze, AnalyzeMode::Sql | AnalyzeMode::Both) {
w.create_sql_views()?;
info!(db = %db, "SQL views created");
}
if matches!(analyze, AnalyzeMode::Both) {
let (sql_only, rust_only) = w.cross_check_branch_xrefs()?;
if sql_only == 0 && rust_only == 0 {
info!(db = %db, "Rust/SQL branch xrefs agree");
} else {
tracing::warn!(
db = %db,
sql_only,
rust_only,
"Rust/SQL branch xref disagreement — investigate formatter mnemonic vs xref.rs kind classification"
);
}
}
info!(db = %db, "database written");
}
// JSON Lines output: one row per instruction, structured columns.
if let Some(json) = json_path {
info!(json = %json, "writing JSON Lines");
let mut out = std::io::BufWriter::new(std::fs::File::create(json)?);
let mut total: u64 = 0;
for section in &sections {
if !section.is_code() { continue; }
let abs_start = base + section.virtual_address;
let abs_end = abs_start + section.virtual_size;
let items = xenia_analysis::enrich_section(
&pe_image, base, &section.name, abs_start, abs_end,
&func_analysis, &xref_result.labels,
);
total += xenia_analysis::sinks::json::write_jsonl(&mut out, items)?;
}
info!(json = %json, rows = total, "JSON Lines written");
}
// Assembly output (skipped when --quiet and no --output specified)
if !quiet || output.is_some() {
let mut out: Box<dyn std::io::Write> = match output {
Some(path) => Box::new(std::io::BufWriter::new(std::fs::File::create(path)?)),
None => Box::new(std::io::BufWriter::new(std::io::stdout().lock())),
};
xenia_analysis::formatter::write_asm(
&mut *out,
&pe_image,
&disasm_info,
&func_analysis,
&xref_result.labels,
&import_map,
&xref_result.xrefs,
&xref_result.data_annotations,
)?;
if let Some(path) = output {
info!(path, "wrote disassembly");
}
}
info!(wall_ms = started.elapsed().as_millis() as u64, "dis complete");
Ok(())
}
#[cfg(test)]
mod tests {
use super::parse_hex_u32;
#[test]
fn parse_hex_u32_accepts_0x_prefix() {
assert_eq!(parse_hex_u32("0x824be9a0").unwrap(), 0x824be9a0);
assert_eq!(parse_hex_u32("0X82000000").unwrap(), 0x82000000);
}
#[test]
fn parse_hex_u32_accepts_bare_hex() {
// No 0x prefix, contains hex letters — treated as hex.
assert_eq!(parse_hex_u32("824be9a0").unwrap(), 0x824be9a0);
}
#[test]
fn parse_hex_u32_accepts_decimal() {
// All digits, no 0x — treated as decimal.
assert_eq!(parse_hex_u32("1000").unwrap(), 1000);
assert_eq!(parse_hex_u32("0").unwrap(), 0);
}
#[test]
fn parse_hex_u32_rejects_garbage() {
assert!(parse_hex_u32("not a number").is_err());
assert!(parse_hex_u32("0xZZZ").is_err());
}
#[test]
fn parse_hex_u32_tolerates_whitespace() {
assert_eq!(parse_hex_u32(" 0x82000000 ").unwrap(), 0x82000000);
}
}