mod observability; use std::collections::HashMap; use std::path::PathBuf; use std::time::Instant; use anyhow::Result; use clap::{Parser, Subcommand, ValueEnum}; use tracing::{debug, info, instrument, warn}; use xenia_kernel::ModuleId; use xenia_memory::MemoryAccess; #[derive(Parser)] #[command(name = "xenia-rs")] #[command(about = "Xbox 360 emulator for reverse engineering and preservation")] struct Cli { /// Force JSON formatting on the console log layer (default is compact text) #[arg(long, global = true)] log_json: bool, /// Additionally write logs to this file. `.json` extension selects JSON; /// anything else uses text. Uses a non-blocking appender. #[arg(long, global = true, value_name = "PATH")] log_file: Option, /// Override tracing filter (same syntax as `RUST_LOG`). Also settable via env. #[arg(long, global = true, value_name = "DIRECTIVES")] log_filter: Option, /// Emit a Chrome `about:tracing` JSON trace of all spans to this path. #[arg(long, global = true, value_name = "PATH")] trace_chrome: Option, /// Start the pprof sampling profiler at 100 Hz. Extension `.svg` writes a /// flamegraph; `.pb` writes a pprof protobuf. Requires the `profiling` /// Cargo feature (enabled by default). #[arg(long, global = true, value_name = "PATH")] profile: Option, #[command(subcommand)] command: Commands, } /// `dis --analyze` modes. Rust passes are the default and always functional; /// SQL views are an additive surface for relational queries. #[derive(Copy, Clone, Debug, PartialEq, Eq, ValueEnum)] enum AnalyzeMode { /// Rust passes only (`func.rs`, `xref.rs`). No SQL views. Rust, /// Rust passes + additive SQL views (`v_branch_xrefs`, `v_call_graph`, /// `v_reachability_from_entry`, `v_function_first_instruction`, /// `v_imports_called`). Sql, /// Same as `sql`, plus a Rust-vs-SQL cross-check on branch xrefs at /// the end. Disagreement is logged as a warning (non-fatal). Both, } #[derive(Subcommand)] enum Commands { /// Disassemble a XEX file from its entry point (or an arbitrary address via `--at`) Disasm { /// Path to XEX file path: String, /// Number of instructions to disassemble #[arg(short = 'n', default_value = "64")] count: usize, /// Start address (hex with or without `0x` prefix). Defaults to /// the XEX entry point. Must fall inside the loaded image range. /// /// Example: `--at 0x824be9a0` to inspect a graphics-interrupt callback. #[arg(long, value_parser = parse_hex_u32)] at: Option, }, /// Load and execute a XEX file with tracing Exec { /// Path to XEX file path: String, /// Maximum instructions to execute before stopping (unlimited if omitted) #[arg(short = 'n')] max_instructions: Option, /// Throttle execution to at most N instructions per second (unlimited if omitted) #[arg(long)] ips_limit: Option, /// SQLite database to write to. Includes the full static analysis /// that `dis --db` would produce, plus any opt-in trace tables. #[arg(long)] db: Option, /// Log each executed instruction to the `exec_trace` table #[arg(long)] trace_instructions: bool, /// Log kernel/import calls to the `import_calls` table #[arg(long)] trace_imports: bool, /// Log taken branches (calls, returns, jumps) to the `branch_trace` table #[arg(long)] trace_branches: bool, /// Suppress banners, kernel-call logs, and final register dump /// (only errors, faults, halts, and the summary line are printed) #[arg(long)] quiet: bool, /// Open a window (winit + wgpu) for interactive dynamic analysis: /// presents the guest frontbuffer on each VdSwap, polls a host /// gamepad (gilrs) into XamInputGetState, and renders a live HUD /// with swap/input stats. Runs the CPU interpreter on a worker /// thread; the window hosts the event loop on the main thread. #[arg(long)] ui: bool, /// Halt immediately when every live HW thread is blocked on a handle /// wait and no timer is pending, dumping a per-thread diagnostic /// (state / PC / LR / waiter handles) instead of force-waking the /// waiters with STATUS_TIMEOUT. Default is to force-wake so probe /// runs survive early deadlocks; flip this when investigating the /// deadlock itself. Also settable via `XENIA_HALT_ON_DEADLOCK=1`. #[arg(long)] halt_on_deadlock: bool, /// Enable per-handle signal/wait/wake audit trail. Diagnostic only: /// at end-of-run, `dump_thread_diagnostic` prints, for every kernel /// handle, the bounded ring of recent create/wait/signal/wake events /// (who fired what, when, with what status). Used to identify /// missing-signal deadlocks (a handle with waiters but zero signals /// is the smoking gun for a missing kernel API). Also settable via /// `XENIA_TRACE_HANDLES=1`. #[arg(long)] trace_handles: bool, /// Comma-separated list of handles to capture **signal attempts** /// for, even when no `record_create` exists. Distinguishes "guest /// never called Nt/KeSetEvent on this handle" (signal_attempts=0) /// from "signal landed but waiter wasn't woken" (>0). Implies /// `--trace-handles`. Examples: /// `--trace-handles-focus=0x1004,0x100c,0x15e4,0x42450b5c` /// `--trace-handles-focus=4100` (decimal; 0x1004) #[arg(long)] trace_handles_focus: Option, /// Force the threaded GPU backend (default at M1.9). Kept for /// explicit scripted runs; equivalent to omitting both /// `--gpu-inline` and `--ui`. Settable via `XENIA_GPU_THREAD=1`. #[arg(long)] gpu_thread: bool, /// **M1.9 rollback flag.** Forces the legacy inline GPU backend /// (synchronous `kernel.gpu.execute_one` on the CPU thread). /// Use this if the threaded backend regresses; it preserves /// pre-M1 behavior bit-for-bit. Settable via /// `XENIA_GPU_INLINE=1`. `--ui` implies `--gpu-inline` until the /// UI worker is migrated to the Arc-shared mem model. #[arg(long)] gpu_inline: bool, /// **M2.8 — enable the inter-thread reservation table for /// `lwarx`/`stwcx.`.** Under M3's per-HW-thread parallelism this /// is the substrate that mediates concurrent `lwarx` claims; in /// M2's single-threaded mode it's a no-op (the legacy /// per-`PpcContext` reservation fields drive observable /// behavior). Settable via `XENIA_RESERVATIONS_TABLE=1`. Default /// off; M3 flips to default-on for the parallel host-thread /// path. Verifies the golden digest is unchanged under lockstep /// mode with the table activated. #[arg(long)] reservations_table: bool, /// **M3 — spawn HW-thread host workers.** When set, wraps /// `KernelState` in `Arc>` and runs the interpreter /// loop on a dedicated worker thread (the coarse-locked first /// step toward per-HW-thread parallelism). Lockstep behavior /// preserved: the worker holds the lock for the whole run, so /// observable execution is identical. Settable via /// `XENIA_PARALLEL=1`. #[arg(long)] parallel: bool, /// **APUBUG-PRODUCER-001 diagnostic.** When set, the round /// prologue ticks the XAudio buffer-complete callback at the /// audio frame rate and injects via the same SavedCallbackCtx /// machinery as graphics interrupts. Default off — firing /// shifts the boot trajectory enough to regress `swaps=2→1` /// under Sylpheed, which would break the lockstep /// `sylpheed_n*m.json` goldens. Settable via /// `XENIA_XAUDIO_TICK=1`. #[arg(long)] xaudio_tick: bool, /// Diagnostic. Comma-separated list of guest PCs at which the /// worker prologue prints a one-shot `CTOR-PROBE` line capturing /// live r3/lr/sp + an 8-frame back-chain. Hit-once-per-hit — for /// the bridge ctor `sub_8217C850` called 8× from the /// static-init driver, the probe fires 8×. Use to recover /// pool-element `this` addresses that MSVC ctors fail to /// preserve in r31. Read-only; lockstep digest unaffected. /// Examples: `--ctor-probe=0x8217C850`, /// `--ctor-probe=0x82181750,0x821701C8`. KRNBUG-AUDIT-005: /// each token may be extended to `PC@DISPATCHER:OFFSET` (e.g. /// `0x821802D8@0x828F3D08:80`); on a hit the helper additionally /// logs the value of `[DISPATCHER+OFFSET]` — what the producer's /// `lwz r3, OFFSET(r3)` is about to read. The flag is also /// reachable as `--pc-probe` (semantically clearer name). #[arg(long, alias = "pc-probe")] ctor_probe: Option, /// Diagnostic. Comma-separated list of guest addresses to dump /// (64 bytes each, hex + u32 lanes) at end-of-run, after the /// per-handle FOCUS report. Used to inspect the static /// dispatcher / job-queue / pool struct layouts identified by /// AUDIT-003 (e.g. `--dump-addr=0x828F3D08,0x828F4070`). The /// dump captures a snapshot at the moment `dump_thread_diagnostic` /// runs — typically when `--halt-on-deadlock` triggers. #[arg(long)] dump_addr: Option, /// Diagnostic. Comma-separated list of guest PCs that emit a /// one-line `BRANCH-PROBE` record on each fire (pc, tid, hw, /// cycle, r3, lr, cr0/cr6 flags). Sister to `--pc-probe` but /// without the 8-frame back-chain, suited for tracing every /// conditional branch inside a candidate-gate function so the /// last PC reached before the function epilogue identifies the /// exit branch. Example: /// `--branch-probe=0x824a9710,0x824a9778,0x824a97dc,...`. /// Read-only; lockstep digest unaffected. Settable via /// `XENIA_BRANCH_PROBE`. #[arg(long)] branch_probe: Option, /// Diagnostic. Comma-separated guest byte addresses; on every /// guest store that overlaps any listed byte, emit one /// `MEM-WATCH` line at tracing target `mem_watch` with the /// (tid, pc, lr) of the writer plus old/new u32 lanes. /// Read-only; lockstep digest unaffected. Settable via /// `XENIA_MEM_WATCH`. Example: `--mem-watch=0x828F40B4`. #[arg(long)] mem_watch: Option, /// `--dump-section=BASE:LEN:PATH` end-of-run guest memory snapshot. #[arg(long)] dump_section: Option, }, /// Browse XISO disc image contents Browse { /// Path to XISO file path: String, }, /// Display XEX header information Info { /// Path to XEX file path: String, }, /// Extract PE image and metadata from a XEX file Extract { /// Path to XEX or ISO file path: String, /// Output directory (default: same directory as input) #[arg(short, long)] output: Option, /// Write base tables (metadata, sections, imports) to a SQLite database #[arg(long)] db: Option, }, /// Full disassembly with function detection, cross-references, and optional database Dis { /// Path to XEX or ISO file path: String, /// Output .asm file (default: stdout) #[arg(short, long)] output: Option, /// Output SQLite database (also includes the base extract tables) #[arg(long)] db: Option, /// Output JSON Lines file: one structured row per instruction with /// section/function/label/branch_target columns. Suitable for /// `jq`, pandas, or DuckDB's `read_json_auto`. #[arg(long)] json: Option, /// Choose how analysis tables are produced when `--db` is set. /// /// - `rust` (default): only the Rust passes (`func.rs`, `xref.rs`) /// populate `functions`/`labels`/`xrefs`. No SQL views. /// - `sql`: Rust passes still run (function detection and data-ref /// resolution are Rust-only by design); additive SQL views /// (`v_branch_xrefs`, `v_call_graph`, `v_reachability_from_entry`, /// `v_function_first_instruction`, `v_imports_called`) are /// created on top of the same tables. /// - `both`: same as `sql`, plus a Rust-vs-SQL cross-check on /// branch xrefs. Disagreement is logged as a warning (non-fatal). #[arg(long, value_enum, default_value_t = AnalyzeMode::Rust)] analyze: AnalyzeMode, /// Suppress assembly text output (DB-only mode) #[arg(long)] quiet: bool, }, /// P8 — deterministic run digest: execute `n` instructions and print a /// JSON summary of stats (instructions, draws, swaps, resolves, /// unique shaders/RTs, interrupts). Drop-in regression detector. Check { /// Path to XEX or ISO file path: String, /// Instructions to execute before computing the digest #[arg(short = 'n', default_value = "2000000")] max_instructions: u64, /// Write the digest as JSON to this path (stdout if omitted) #[arg(long)] out: Option, /// Optional golden digest JSON; `check` exits non-zero on mismatch #[arg(long)] expect: Option, /// Emit/compare only stable fields (excludes timing-sensitive counters /// like `packets`, `interrupts_delivered`, `resolves`). Required for any /// golden captured under `--parallel`; recommended for lockstep goldens /// at -n ≥50M because `packets` has empirical ±2–8% jitter from a GPU /// thread race. #[arg(long)] stable_digest: bool, /// Force the threaded GPU backend (default at M1.9). #[arg(long)] gpu_thread: bool, /// **M1.9 rollback flag.** Forces the legacy inline GPU backend. /// Use this if the threaded backend regresses. #[arg(long)] gpu_inline: bool, /// M2.8 — enable the inter-thread reservation table. #[arg(long)] reservations_table: bool, /// M3 — spawn HW-thread host workers under coarse Arc>. #[arg(long)] parallel: bool, /// APUBUG-PRODUCER-001 diagnostic — enable the XAudio buffer-complete /// callback ticker. Default off; see the `Exec --xaudio-tick` doc for /// rationale. #[arg(long)] xaudio_tick: bool, }, } fn main() -> Result<()> { let cli = Cli::parse(); // Bump default log level to `warn` for quiet exec runs so kernel-call // tracing::info! spam is filtered out. RUST_LOG / --log-filter still win. let exec_quiet = matches!(&cli.command, Commands::Exec { quiet: true, .. }); // Default filter: keep xenia-rs logs, silence noisy third-party crates. // `wgpu*` / `naga` emit many harmless WARNs (unrecognized present modes, // GLES-backend config probes, etc.) — we push them to ERROR by default. // Override via `--log-filter` or `RUST_LOG`, e.g. // `--log-filter='info,wgpu_core=trace'` during bring-up. let default_level: &'static str = if exec_quiet { "warn,\ wgpu=off,wgpu_core=off,wgpu_hal=off,naga=off,\ winit=off,gilrs=off,gilrs_core=off,\ mio=off,polling=off,sctk=off,smithay_client_toolkit=off,\ calloop=off" } else { "info,\ wgpu=error,wgpu_core=error,wgpu_hal=error,naga=error,\ winit=warn,gilrs=warn,gilrs_core=warn,\ mio=warn,polling=warn,sctk=warn,smithay_client_toolkit=warn,\ calloop=warn" }; let config = observability::ObservabilityConfig { log_json: cli.log_json, log_file: cli.log_file.clone(), log_filter: cli.log_filter.clone(), default_level, trace_chrome: cli.trace_chrome.clone(), profile: cli.profile.clone(), }; // Hold the guards for the whole program; their Drop flushes appenders, // finalises the Chrome trace, writes the pprof report, and prints the // metrics summary. let _obs = observability::init(&config)?; let result = match cli.command { Commands::Disasm { path, count, at } => cmd_disasm(&path, count, at), Commands::Exec { path, max_instructions, ips_limit, db, trace_instructions, trace_imports, trace_branches, quiet, ui, halt_on_deadlock, trace_handles, trace_handles_focus, gpu_thread, gpu_inline, reservations_table, parallel, xaudio_tick, ctor_probe, dump_addr, branch_probe, mem_watch, dump_section, } => cmd_exec( &path, max_instructions, ips_limit, db.as_deref(), trace_instructions, trace_imports, trace_branches, quiet, ui, halt_on_deadlock, trace_handles, trace_handles_focus.as_deref(), gpu_thread, gpu_inline, reservations_table, parallel, xaudio_tick, ctor_probe.as_deref(), dump_addr.as_deref(), branch_probe.as_deref(), mem_watch.as_deref(), dump_section.as_deref(), ), Commands::Browse { path } => cmd_browse(&path), Commands::Info { path } => cmd_info(&path), Commands::Extract { path, output, db } => cmd_extract(&path, output.as_deref(), db.as_deref()), Commands::Dis { path, output, db, json, analyze, quiet } => cmd_dis(&path, output.as_deref(), db.as_deref(), json.as_deref(), analyze, quiet), Commands::Check { path, max_instructions, out, expect, stable_digest, gpu_thread, gpu_inline, reservations_table, parallel, xaudio_tick, } => cmd_check( &path, max_instructions, out.as_deref(), expect.as_deref(), stable_digest, gpu_thread, gpu_inline, reservations_table, parallel, xaudio_tick, ), }; match result { Ok(()) => Ok(()), Err(e) => Err(observability::attach_span_trace(e)), } } /// Load XEX data from a path. If the path is an ISO, extract default.xex from it. #[instrument(skip_all, fields(path = %path))] fn load_xex_data(path: &str) -> Result> { let lower = path.to_lowercase(); if lower.ends_with(".iso") || lower.ends_with(".xiso") { use xenia_vfs::VfsDevice; info!("detected disc image, extracting default.xex"); let disc = xenia_vfs::disc_image::DiscImageDevice::open("disc", std::path::Path::new(path)) .map_err(|e| anyhow::anyhow!("Failed to open disc image: {}", e))?; disc.read_file("default.xex") .map_err(|e| anyhow::anyhow!("Failed to extract default.xex from disc image: {}", e)) } else { Ok(std::fs::read(path)?) } } #[instrument(skip_all, fields(path = %path))] fn cmd_info(path: &str) -> Result<()> { let started = Instant::now(); let data = load_xex_data(path)?; let header = xenia_xex::loader::parse_xex2_header(&data)?; println!("=== XEX2 Header ==="); println!("Magic: {:#010x}", header.magic); println!("Module Flags: {:#010x}", header.module_flags); println!("Header Size: {:#x}", header.header_size); println!("Headers: {}", header.header_count); if let Some(entry) = xenia_xex::loader::get_entry_point(&header) { println!("Entry Point: {:#010x}", entry); } if let Some(base) = xenia_xex::loader::get_image_base(&header) { println!("Image Base: {:#010x}", base); } println!("\n=== Optional Headers ==="); for h in &header.optional_headers { println!(" Key: {:#010x} Value: {:#010x}", h.key, h.value); } if let Some(ref sec) = header.security_info { println!("\n=== Security Info ==="); println!("Image Size: {:#x}", sec.image_size); println!("Load Address: {:#010x}", sec.load_address); println!("Image Flags: {:#010x}", sec.image_flags); println!("Page Descs: {}", sec.page_descriptors.len()); } if let Some(ref ffi) = header.file_format_info { println!("\n=== File Format ==="); println!("Encryption: {}", match ffi.encryption_type { 0 => "None", 1 => "Normal (AES)", _ => "Unknown" }); println!("Compression: {}", match ffi.compression_type { 0 => "None", 1 => "Basic", 2 => "Normal (LZX)", _ => "Unknown" }); if !ffi.basic_blocks.is_empty() { println!("Basic blocks: {}", ffi.basic_blocks.len()); } if ffi.normal_window_size != 0 { println!("LZX Window: {:#x}", ffi.normal_window_size); } } if let Some(ref name) = header.original_pe_name { println!("\nOriginal PE: {}", name); } if let Some(ref ei) = header.execution_info { println!("\n=== Execution Info ==="); println!("Title ID: {:#010x}", ei.title_id); println!("Media ID: {:#010x}", ei.media_id); println!("Disc: {} of {}", ei.disc_number, ei.disc_count); } if !header.import_libraries.is_empty() { println!("\n=== Import Libraries ==="); for lib in &header.import_libraries { println!(" {} (v{:#010x}, {} imports)", lib.name, lib.version_cur, lib.imports.len()); } } info!(wall_ms = started.elapsed().as_millis() as u64, "info complete"); Ok(()) } /// Clap parser for `--at` — accepts decimal, 0x-prefixed hex, or bare hex. fn parse_hex_u32(s: &str) -> Result { let t = s.trim(); let (digits, radix) = if let Some(rest) = t.strip_prefix("0x").or_else(|| t.strip_prefix("0X")) { (rest, 16) } else if t.chars().all(|c| c.is_ascii_digit()) { (t, 10) } else { (t, 16) }; u32::from_str_radix(digits, radix) .map_err(|e| format!("invalid u32 {:?}: {e} (try `0x824be9a0`)", t)) } #[instrument(skip_all, fields(path = %path, count))] fn cmd_disasm(path: &str, count: usize, at: Option) -> Result<()> { let started = Instant::now(); let data = load_xex_data(path)?; let header = xenia_xex::loader::parse_xex2_header(&data)?; let entry = xenia_xex::loader::get_entry_point(&header) .ok_or_else(|| anyhow::anyhow!("No entry point found in XEX2 header"))?; let base = xenia_xex::loader::get_image_base(&header) .ok_or_else(|| anyhow::anyhow!("No image base found in XEX2 header"))?; info!(entry = format_args!("{:#010x}", entry), base = format_args!("{:#010x}", base), "XEX entry/base"); let image_data = xenia_xex::loader::load_image(&data, &header)?; info!(bytes = image_data.len(), "image decompressed"); let start = at.unwrap_or(entry); let label = if at.is_some() { "requested address" } else { "entry point" }; println!("Disassembly from {} {:#010x} ({} instructions):\n", label, start, count); if start < base { return Err(anyhow::anyhow!( "address {:#x} is below image base {:#x}", start, base )); } let offset = (start - base) as usize; if offset + count * 4 > image_data.len() { return Err(anyhow::anyhow!( "address {:#x} (offset {:#x}) + {} instructions extends past image end ({:#x} bytes)", start, offset, count, image_data.len() )); } let block = xenia_cpu::disasm::disassemble_block(&image_data[offset..], start, count); for (addr, text) in block { println!(" {:#010x}: {}", addr, text); } info!(wall_ms = started.elapsed().as_millis() as u64, "disasm complete"); Ok(()) } #[instrument(skip_all, fields(path = %path, ui))] fn cmd_exec( path: &str, max_instructions: Option, ips_limit: Option, db_path: Option<&str>, trace_instructions: bool, trace_imports: bool, trace_branches: bool, quiet: bool, ui: bool, halt_on_deadlock: bool, trace_handles: bool, trace_handles_focus: Option<&str>, gpu_thread: bool, gpu_inline: bool, reservations_table: bool, parallel: bool, xaudio_tick: bool, ctor_probe: Option<&str>, dump_addr: Option<&str>, branch_probe: Option<&str>, mem_watch: Option<&str>, dump_section: Option<&str>, ) -> Result<()> { cmd_exec_inner( path, max_instructions, ips_limit, db_path, trace_instructions, trace_imports, trace_branches, quiet, ui, halt_on_deadlock, trace_handles, trace_handles_focus, gpu_thread, gpu_inline, reservations_table, parallel, xaudio_tick, ctor_probe, dump_addr, branch_probe, mem_watch, dump_section, None, None, false, ) } /// P8 — drive a deterministic run and emit a JSON digest of key counters. /// Shares `cmd_exec`'s setup path so the digest reflects a real execution. fn cmd_check( path: &str, max_instructions: u64, out: Option<&str>, expect: Option<&str>, stable_digest: bool, gpu_thread: bool, gpu_inline: bool, reservations_table: bool, parallel: bool, xaudio_tick: bool, ) -> Result<()> { cmd_exec_inner( path, Some(max_instructions), None, None, false, false, false, true, // quiet — suppress banners false, // ui — headless false, // halt_on_deadlock — keep the force-wake default for probe/check runs false, // trace_handles — diagnostic only, never wanted on golden runs None, // trace_handles_focus — same gpu_thread, gpu_inline, reservations_table, parallel, xaudio_tick, None, // ctor_probe — diagnostic, never wanted on goldens None, // dump_addr — same None, // branch_probe — diagnostic, never wanted on goldens None, // mem_watch — same None, // dump_section — same out, expect, stable_digest, ) } fn cmd_exec_inner( path: &str, max_instructions: Option, ips_limit: Option, db_path: Option<&str>, trace_instructions: bool, trace_imports: bool, trace_branches: bool, quiet: bool, ui: bool, halt_on_deadlock: bool, trace_handles: bool, trace_handles_focus: Option<&str>, gpu_thread: bool, gpu_inline: bool, reservations_table: bool, parallel: bool, xaudio_tick: bool, ctor_probe: Option<&str>, dump_addr: Option<&str>, branch_probe: Option<&str>, mem_watch: Option<&str>, dump_section: Option<&str>, digest_out: Option<&str>, digest_expect: Option<&str>, stable_digest: bool, ) -> Result<()> { let started = Instant::now(); let data = load_xex_data(path)?; let mut header = xenia_xex::loader::parse_xex2_header(&data)?; let entry = xenia_xex::loader::get_entry_point(&header) .ok_or_else(|| anyhow::anyhow!("No entry point found"))?; let base = xenia_xex::loader::get_image_base(&header) .ok_or_else(|| anyhow::anyhow!("No image base found"))?; if let Some(ref ffi) = header.file_format_info { info!( compression = match ffi.compression_type { 0 => "none", 1 => "basic", 2 => "normal (LZX)", _ => "unknown" }, encryption = match ffi.encryption_type { 0 => "none", 1 => "normal (AES)", _ => "unknown" }, "XEX file format" ); } for lib in &header.import_libraries { debug!(name = %lib.name, imports = lib.imports.len(), "import library"); } info!( entry = format_args!("{:#010x}", entry), base = format_args!("{:#010x}", base), "loading XEX" ); // Allocate guest memory let mut mem = xenia_memory::GuestMemory::new() .map_err(|e| anyhow::anyhow!("Failed to allocate guest memory: {}", e))?; // Load and decompress the XEX image let image_data = xenia_xex::loader::load_image(&data, &header)?; // Resolve import ordinals from PE image xenia_xex::loader::resolve_imports(&mut header, &image_data); let alloc_size = ((image_data.len() + 4095) & !4095) as u32; let rw = xenia_memory::page_table::MemoryProtect::READ | xenia_memory::page_table::MemoryProtect::WRITE; mem.alloc(base, alloc_size, rw) .map_err(|e| anyhow::anyhow!("Failed to allocate guest memory region: {}", e))?; mem.write_bulk(base, &image_data); // ── Phase 1: Build import thunk map ────────────────────────────────── let mut thunk_map: HashMap = HashMap::new(); // Reverse map for `XexGetProcedureAddress` (drained into kernel below). let mut thunk_addr_map: Vec<(ModuleId, u16, u32)> = Vec::new(); for lib in &header.import_libraries { let module = match lib.name.as_str() { "xboxkrnl.exe" => ModuleId::Xboxkrnl, "xam.xex" => ModuleId::Xam, _ => continue, }; for imp in &lib.imports { if imp.record_type == 1 { let name = xenia_analysis::resolve_ordinal(&lib.name, imp.ordinal) .map(|s| s.to_string()) .unwrap_or_else(|| format!("ordinal_{:#06X}", imp.ordinal)); thunk_map.insert(imp.address, (module, imp.ordinal, name)); thunk_addr_map.push((module, imp.ordinal, imp.address)); } } } info!(thunks = thunk_map.len(), "import thunks mapped"); // ── Phase 2: CPU initialization per xenia-canary ───────────────────── // Allocate stack (1MB at 0x70000000) let stack_base = 0x7000_0000u32; let stack_size = 0x10_0000u32; mem.alloc(stack_base, stack_size, rw) .map_err(|e| anyhow::anyhow!("Failed to allocate stack: {}", e))?; // Allocate PCR (Processor Control Region) and TLS let pcr_addr = 0x7FFF_0000u32; let tls_addr = 0x7FFE_0000u32; mem.alloc(pcr_addr, 0x1000, rw)?; mem.alloc(tls_addr, 0x1000, rw)?; // Initialize PCR structure mem.write_u32(pcr_addr, tls_addr); // PCR->tls_ptr mem.write_u32(pcr_addr + 0x100, 0x1000); // PCR->current_thread (fake) mem.write_u32(pcr_addr + 0x150, 0); // PCR->dpc_active // Set up CPU context per xenia-canary/cpu/thread_state.cc. // // `PpcContext::new` already initializes LR = 0xBCBC_BCBC (halt sentinel), // VSCR NJ bit, and VRSAVE = 0xFFFFFFFF to match canary. let mut ctx = xenia_cpu::PpcContext::new(); ctx.pc = entry; // Stack pointer: 16-byte aligned, leaving 256 bytes of reservation above // the top so the entry's prolog has room for its linkage area without // wandering off the allocated region. let sp_top = (stack_base + stack_size) as u64; ctx.gpr[1] = (sp_top - 0x100) & !0xFu64; ctx.gpr[2] = 0x2000_0000; // RTOC (TOC base) // r3..r7 carry the XEX "start_context" in canary; for a bare entry (no // launch data from the kernel), canary passes 0. Explicitly zero them so // we don't inherit whatever PpcContext::new happens to leave. for r in 3..=7 { ctx.gpr[r] = 0; } ctx.gpr[13] = pcr_addr as u64; // PCR/TLS pointer (r13) ctx.msr = 0x9030; // Hardware-dumped MSR // ── Phase 4: Set up kernel ─────────────────────────────────────────── // // M1.9 — default GPU backend is now **threaded**. Rollback rules: // 1. `--gpu-inline` / `XENIA_GPU_INLINE=1` → force inline. // 2. `--ui` → force inline (the UI // worker thread doesn't yet share `Arc` with the // GPU worker; cohabitation panics in `cmd_exec_inner` below). // 3. `--gpu-thread` / `XENIA_GPU_THREAD=1` → force threaded // (explicit override). // 4. Otherwise: threaded. let env_inline = std::env::var("XENIA_GPU_INLINE") .ok() .is_some_and(|v| { let v = v.trim().to_ascii_lowercase(); v == "1" || v == "true" || v == "yes" }); let env_thread = std::env::var("XENIA_GPU_THREAD") .ok() .is_some_and(|v| { let v = v.trim().to_ascii_lowercase(); v == "1" || v == "true" || v == "yes" }); let force_inline = gpu_inline || env_inline || ui; let force_thread = gpu_thread || env_thread; let use_threaded = if force_inline { false } else if force_thread { true } else { true // M1.9 default }; let shutdown_arc = std::sync::Arc::new(std::sync::atomic::AtomicBool::new(false)); let (mut maybe_gpu_worker, gpu_backend): ( Option, xenia_gpu::GpuBackend, ) = if use_threaded { let (worker, handle) = xenia_gpu::GpuSystem::new() .into_handle_with_shutdown(shutdown_arc.clone()); (Some(worker), xenia_gpu::GpuBackend::Threaded(handle)) } else { ( None, xenia_gpu::GpuBackend::Inline(xenia_gpu::GpuSystem::new()), ) }; let mut kernel = xenia_kernel::KernelState::with_gpu(gpu_backend); kernel.image_base = base; kernel.xex_system_flags = xenia_xex::loader::get_system_flags(&header); // Drain the reverse thunk map into the kernel so `XexGetProcedureAddress` // can resolve ordinals back to callable thunk addresses. for (module, ordinal, addr) in thunk_addr_map.drain(..) { kernel.register_thunk(module, ordinal, addr); } // M2.8 — flip the reservation-table flag if requested. Either // `--reservations-table` or `XENIA_RESERVATIONS_TABLE=1`. The flag // is observable via `kernel.reservations_enabled.load(Acquire)`; // M3 will hook it into the interpreter's `lwarx`/`stwcx.` arms. let reservations_via_env = std::env::var("XENIA_RESERVATIONS_TABLE") .ok() .is_some_and(|v| { let v = v.trim().to_ascii_lowercase(); v == "1" || v == "true" || v == "yes" }); let parallel_via_env = std::env::var("XENIA_PARALLEL") .ok() .is_some_and(|v| { let v = v.trim().to_ascii_lowercase(); v == "1" || v == "true" || v == "yes" }); let parallel_active = parallel || parallel_via_env; kernel.parallel_active = parallel_active; let xaudio_tick_via_env = std::env::var("XENIA_XAUDIO_TICK") .ok() .is_some_and(|v| { let v = v.trim().to_ascii_lowercase(); v == "1" || v == "true" || v == "yes" }); kernel.xaudio_tick_enabled = xaudio_tick || xaudio_tick_via_env; if kernel.xaudio_tick_enabled && !quiet { tracing::info!( "XAudio callback ticker enabled (--xaudio-tick / XENIA_XAUDIO_TICK=1)" ); } if reservations_table || reservations_via_env || parallel_active { kernel.reservations.enable(); if !quiet { tracing::info!("reservation table enabled (lwarx/stwcx route through it)"); } } // Diagnostic: flip audit on when --trace-handles or XENIA_TRACE_HANDLES. // Behaviour-neutral when disabled (every record_* method is a no-op). let audit_via_env = std::env::var("XENIA_TRACE_HANDLES") .map(|v| matches!(v.as_str(), "1" | "true" | "TRUE" | "True")) .unwrap_or(false); let trace_handles_active = trace_handles || audit_via_env || trace_handles_focus.is_some(); if trace_handles_active { kernel.audit.enabled = true; if !quiet { tracing::info!("handle audit enabled (--trace-handles)"); } } // KRNBUG-AUDIT-001: parse `--trace-handles-focus=0x1004,0x100c,...` // into the audit focus set. When non-empty, signal attempts on these // handles are captured into ghost trails even if no `record_create` // exists for them — the data-driven hypothesis fail-fast for // parked-waiter producer-trace. if let Some(list) = trace_handles_focus { for token in list.split(',').map(str::trim).filter(|s| !s.is_empty()) { let parsed = if let Some(hex) = token.strip_prefix("0x").or_else(|| token.strip_prefix("0X")) { u32::from_str_radix(hex, 16) } else { token.parse::() }; match parsed { Ok(h) => { kernel.audit.focus.insert(h); } Err(_) => { return Err(anyhow::anyhow!( "invalid handle in --trace-handles-focus: {token:?}" )); } } } if !quiet && !kernel.audit.focus.is_empty() { let focused: Vec = kernel .audit .focus .iter() .map(|h| format!("{h:#010x}")) .collect(); tracing::info!( "handle audit focus: {} ({})", kernel.audit.focus.len(), focused.join(", ") ); } } // Diagnostic. Parse `--ctor-probe=0x8217C850,0x...` (or // `XENIA_CTOR_PROBE=...`) into `kernel.ctor_probe_pcs`. The // worker prologue checks this set on every step; on a hit it // prints a single back-chain capture line. Empty set = no // probes = no-op fast path. let ctor_probe_combined: Option = match ( ctor_probe, std::env::var("XENIA_CTOR_PROBE").ok().or_else(|| std::env::var("XENIA_PC_PROBE").ok()), ) { (Some(s), _) => Some(s.to_string()), (None, Some(s)) if !s.is_empty() => Some(s), _ => None, }; if let Some(list) = ctor_probe_combined { for token in list.split(',').map(str::trim).filter(|s| !s.is_empty()) { let (pc_str, consumer): (&str, Option<(u32, u32)>) = match token.split_once('@') { None => (token, None), Some((pc_part, rest)) => { let (disp_str, off_str) = rest.split_once(':').ok_or_else(|| { anyhow::anyhow!( "invalid --pc-probe token {token:?}: expected PC@DISPATCHER:OFFSET" ) })?; let disp = parse_hex_u32(disp_str.trim()).map_err(|e| { anyhow::anyhow!("invalid dispatcher in {token:?}: {e}") })?; let off = parse_hex_u32(off_str.trim()).map_err(|e| { anyhow::anyhow!("invalid offset in {token:?}: {e}") })?; (pc_part.trim(), Some((disp, off))) } }; let pc = parse_hex_u32(pc_str).map_err(|e| { anyhow::anyhow!("invalid PC in --pc-probe: {token:?}: {e}") })?; kernel.ctor_probe_pcs.insert(pc); if let Some(c) = consumer { kernel.pc_probe_consumers.insert(pc, c); } } if !quiet && !kernel.ctor_probe_pcs.is_empty() { let pcs: Vec = kernel .ctor_probe_pcs .iter() .map(|p| format!("{p:#010x}")) .collect(); tracing::info!( "pc probes armed: {} ({}); consumer-snapshots: {}", kernel.ctor_probe_pcs.len(), pcs.join(", "), kernel.pc_probe_consumers.len(), ); } } let branch_probe_combined: Option = match ( branch_probe, std::env::var("XENIA_BRANCH_PROBE").ok(), ) { (Some(s), _) => Some(s.to_string()), (None, Some(s)) if !s.is_empty() => Some(s), _ => None, }; if let Some(list) = branch_probe_combined { for token in list.split(',').map(str::trim).filter(|s| !s.is_empty()) { let pc = parse_hex_u32(token).map_err(|e| { anyhow::anyhow!("invalid PC in --branch-probe: {token:?}: {e}") })?; kernel.branch_probe_pcs.insert(pc); } if !quiet && !kernel.branch_probe_pcs.is_empty() { let mut pcs: Vec = kernel.branch_probe_pcs.iter().copied().collect(); pcs.sort_unstable(); let strs: Vec = pcs.iter().map(|p| format!("{p:#010x}")).collect(); tracing::info!( "branch probes armed: {} ({})", kernel.branch_probe_pcs.len(), strs.join(", "), ); } } // Diagnostic. Parse `--dump-addr=0x828F3D08,...` (or // `XENIA_DUMP_ADDR=...`) into `kernel.dump_addrs`. The contents // are dumped at end-of-run by `dump_thread_diagnostic`. Pure // read; never mutates guest state. let dump_addr_combined: Option = match (dump_addr, std::env::var("XENIA_DUMP_ADDR").ok()) { (Some(s), _) => Some(s.to_string()), (None, Some(s)) if !s.is_empty() => Some(s), _ => None, }; if let Some(list) = dump_addr_combined { for token in list.split(',').map(str::trim).filter(|s| !s.is_empty()) { let parsed = if let Some(hex) = token.strip_prefix("0x").or_else(|| token.strip_prefix("0X")) { u32::from_str_radix(hex, 16) } else { token.parse::() }; match parsed { Ok(addr) => { kernel.dump_addrs.push(addr); } Err(_) => { return Err(anyhow::anyhow!( "invalid address in --dump-addr: {token:?}" )); } } } if !quiet && !kernel.dump_addrs.is_empty() { let addrs: Vec = kernel .dump_addrs .iter() .map(|a| format!("{a:#010x}")) .collect(); tracing::info!( "dump addresses armed: {} ({})", kernel.dump_addrs.len(), addrs.join(", ") ); } } let mem_watch_combined: Option = match (mem_watch, std::env::var("XENIA_MEM_WATCH").ok()) { (Some(s), _) => Some(s.to_string()), (None, Some(s)) if !s.is_empty() => Some(s), _ => None, }; let mut mem_watch_addrs: Vec = Vec::new(); if let Some(list) = mem_watch_combined { for token in list.split(',').map(str::trim).filter(|s| !s.is_empty()) { let parsed = if let Some(hex) = token.strip_prefix("0x").or_else(|| token.strip_prefix("0X")) { u32::from_str_radix(hex, 16) } else { token.parse::() }; match parsed { Ok(addr) => mem_watch_addrs.push(addr), Err(_) => { return Err(anyhow::anyhow!( "invalid address in --mem-watch: {token:?}" )); } } } if !quiet && !mem_watch_addrs.is_empty() { let strs: Vec = mem_watch_addrs .iter() .map(|a| format!("{a:#010x}")) .collect(); tracing::info!( "mem-watch armed: {} ({})", mem_watch_addrs.len(), strs.join(", ") ); } } let dump_section_combined: Option = match (dump_section, std::env::var("XENIA_DUMP_SECTION").ok()) { (Some(s), _) => Some(s.to_string()), (None, Some(s)) if !s.is_empty() => Some(s), _ => None, }; if let Some(spec) = dump_section_combined { let parts: Vec<&str> = spec.splitn(3, ':').collect(); if parts.len() != 3 { return Err(anyhow::anyhow!( "--dump-section expects BASE:LEN:PATH, got {spec:?}" )); } let parse_u32 = |t: &str| -> Result { if let Some(h) = t.strip_prefix("0x").or_else(|| t.strip_prefix("0X")) { u32::from_str_radix(h, 16).map_err(|e| anyhow::anyhow!("bad hex {t:?}: {e}")) } else { t.parse::().map_err(|e| anyhow::anyhow!("bad dec {t:?}: {e}")) } }; let base = parse_u32(parts[0].trim())?; let len = parse_u32(parts[1].trim())?; let path = std::path::PathBuf::from(parts[2].trim()); if !quiet { tracing::info!( "dump-section armed: base={:#010x} len={:#x} path={}", base, len, path.display() ); } kernel.dump_section = Some((base, len, path)); } // Install the GPU register aperture MMIO region on the guest memory so // any `0x7FC8xxxx` access routes to our atomic mailbox. Matches canary's // `graphics_system.cc:141-144`. The callbacks capture Arc clones of the // atoms that live inside `kernel.gpu.mmio`. mem.add_mmio_region(xenia_gpu::build_mmio_region(kernel.gpu.mmio())); // Install the initial guest thread on HW slot 0. The thread handle we // hand the scheduler isn't visible to any guest API yet, but joiners // (XThreadWait-style) will see it via `find_by_tid`. let main_handle = kernel.alloc_handle_for(xenia_kernel::objects::KernelObject::Thread { id: xenia_cpu::scheduler::INITIAL_GUEST_TID, hw_id: Some(0), exit_code: None, waiters: Vec::new(), }); kernel.next_thread_id.store( xenia_cpu::scheduler::INITIAL_GUEST_TID + 1, std::sync::atomic::Ordering::Relaxed, ); // PCR[+0x2C] = processor number 0 for the main HW thread. mem.write_u32(pcr_addr + 0x2C, 0); ctx.thread_id = xenia_cpu::scheduler::INITIAL_GUEST_TID; kernel.install_initial_thread( ctx, stack_base, stack_size, pcr_addr, tls_addr, main_handle, &mut mem, ); // If the input was a disc image, mount it so the kernel's file I/O // handlers can serve the game's own assets via VFS. if path.to_lowercase().ends_with(".iso") || path.to_lowercase().ends_with(".xiso") { match xenia_vfs::disc_image::DiscImageDevice::open("d", std::path::Path::new(path)) { Ok(disc) => kernel.vfs = Some(Box::new(disc)), Err(e) => tracing::warn!("Could not mount disc image for VFS: {}", e), } } // ── Phase 3: Data export patching (variable imports) ───────────────── // // Helper: lazily allocate a zero-filled block on the heap. Shared for all // diagnostic data exports the game reads but never writes — a single // block keeps allocator noise low. let alloc_zero = |size: u32, mem: &xenia_memory::GuestMemory, kernel: &mut xenia_kernel::KernelState| -> u32 { kernel.heap_alloc(size, mem).unwrap_or(0) }; for lib in &header.import_libraries { for imp in &lib.imports { if imp.record_type != 0 { continue; } // Only variable entries let addr = imp.address; match (lib.name.as_str(), imp.ordinal) { ("xboxkrnl.exe", 0x001B) => { // ExThreadObjectType — ptr to OBJECT_TYPE descriptor (0x40 bytes) let block = alloc_zero(0x40, &mut mem, &mut kernel); mem.write_u32(addr, block); } ("xboxkrnl.exe", 0x0059) => { // KeDebugMonitorData — ptr to diagnostic block let block = alloc_zero(0x40, &mut mem, &mut kernel); mem.write_u32(addr, block); } ("xboxkrnl.exe", 0x00AD) => { // KeTimeStampBundle — 0x18 block with FILETIME at +0 and // interrupt-time u64 at +0x10. Mirrors the clock used by // KeQuerySystemTime so fast-path readers see consistent values. let block = alloc_zero(0x18, &mut mem, &mut kernel); if block != 0 { let fake_time: u64 = 132_500_000_000_000_000; // ~2021 FILETIME mem.write_u32(block, (fake_time >> 32) as u32); mem.write_u32(block + 4, fake_time as u32); mem.write_u32(block + 0x10, (fake_time >> 32) as u32); mem.write_u32(block + 0x14, fake_time as u32); } mem.write_u32(addr, block); } ("xboxkrnl.exe", 0x0158) => { // XboxKrnlVersion: {major=2, minor=0, build=20000, qfe=0} mem.write_u16(addr, 2); mem.write_u16(addr + 2, 0); mem.write_u16(addr + 4, 20000); mem.write_u16(addr + 6, 0); } ("xboxkrnl.exe", 0x0193) => { // XexExecutableModuleHandle -> image base mem.write_u32(addr, base); } ("xboxkrnl.exe", 0x01AE) => { // ExLoadedCommandLine — ANSI empty string let block = alloc_zero(0x10, &mut mem, &mut kernel); // Block is already zero-initialized by heap_alloc -> empty string. mem.write_u32(addr, block); } ("xboxkrnl.exe", 0x01BE) => { // VdGlobalDevice — passed through to Vd* shims. Write 0. mem.write_u32(addr, 0); } ("xboxkrnl.exe", 0x01C0) => { // VdGpuClockInMHz mem.write_u32(addr, 500); } ("xboxkrnl.exe", 0x01C1) => { // VdHSIOCalibrationLock — lock slot mem.write_u32(addr, 0); } ("xboxkrnl.exe", 0x0266) => { // KeCertMonitorData — ptr to diagnostic block let block = alloc_zero(0x100, &mut mem, &mut kernel); mem.write_u32(addr, block); } _ => { // All other variable exports: write 0 mem.write_u32(addr, 0); } } } } // ── Phase 5: Set up SQLite DB with full static analysis + opt-in traces ── let mut db_writer: Option = None; if let Some(db) = db_path { use std::collections::HashMap; let sections = xenia_xex::pe::parse_sections(&image_data)?; // Build import address -> name map (for xref analysis) let mut import_map: HashMap = HashMap::new(); for lib in &header.import_libraries { for imp in &lib.imports { let resolved = xenia_analysis::resolve_ordinal(&lib.name, imp.ordinal); let name = match resolved { Some(n) => format!("{}::{}", lib.name, n), None => format!("{}::ordinal_{:#06X}", lib.name, imp.ordinal), }; import_map.insert(imp.address, name); } } // Function + xref analysis let code_sections: Vec<(u32, u32, u32)> = sections.iter() .filter(|s| s.is_code()) .map(|s| (s.virtual_address, s.virtual_size, s.flags)) .collect(); let func_analysis = xenia_analysis::func::analyze(&image_data, base, entry, &code_sections); info!(functions = func_analysis.functions.len(), "function detection complete"); let xref_result = xenia_analysis::xref::analyze_xrefs( &image_data, base, entry, §ions, &func_analysis, &import_map, ); let total_xrefs: usize = xref_result.xrefs.values().map(|v| v.len()).sum(); info!( labels = xref_result.labels.len(), xrefs = total_xrefs, "xref analysis complete" ); let disasm_info = xenia_analysis::formatter::DisasmInfo { image_base: base, entry_point: entry, original_pe_name: header.original_pe_name.as_deref(), title_id: header.execution_info.as_ref().map(|e| e.title_id), media_id: header.execution_info.as_ref().map(|e| e.media_id), sections: §ions, import_libraries: &header.import_libraries, }; info!(db = %db, "writing database"); let mut w = xenia_analysis::DbWriter::open_fresh(std::path::Path::new(db))?; w.write_base(&disasm_info)?; w.write_disasm(&image_data, &disasm_info, &func_analysis, &xref_result.labels, &xref_result.xrefs)?; w.prepare_trace_tables(trace_instructions, trace_imports, trace_branches)?; db_writer = Some(w); } // Set up debugger. // // Tier-3 perf: `trace_enabled = false` by default. The in-memory // `trace_log: Vec` was previously populated on every // instruction; once it filled to 100k entries each subsequent step // performed a `Vec::remove(0)` — O(n) shifts per instruction. At // 2M+ instructions this pathological behavior dominated run time // (~300s of the ~600s we observed for 5M instructions, consistent // with ~10 GB/s memory bandwidth × n²/2 shift cost). // // Nothing outside the debugger REPL (not shipped) reads the log, so // the default is safe to flip. `--trace-instructions` routes full // instruction traces to the DB writer instead, which is the correct // path for analysis; we keep that behaviour unchanged. let mut debugger = xenia_debugger::Debugger::new(); debugger.paused = false; debugger.step_mode = xenia_debugger::StepMode::Run; debugger.trace_enabled = false; // DB-based trace still needs the in-memory trace off — we only want // one authoritative stream. `trace_instructions` implies the DB // writer pipeline is the recorder; leave `trace_enabled` false. let _ = trace_instructions; match max_instructions { Some(n) => info!(limit = n, "starting execution"), None => info!("starting execution (unlimited)"), } if let Some(ips) = ips_limit { info!(ips_limit = ips, "throttling execution to instructions/second"); } // ── Phase 6: Execution ─────────────────────────────────────────────── // Either run on the main thread (headless), or spawn a worker and drive // the winit event loop on the main thread (--ui). // M1.4 — wrap `mem` in an `Arc` after all init mutations // are complete. The worker thread (if spawned below) holds its own if !mem_watch_addrs.is_empty() { mem.arm_mem_watch(mem_watch_addrs); } // Arc clone for the duration of the run; the CPU side passes // `&*mem_arc` (= `&GuestMemory`) into `run_execution`. The // trait-level invariant carrying this is correctness: writes are // `&self` post the M1.4(b) trait flip, and `bump_page_version` is // atomic, so concurrent CPU/GPU accesses to disjoint byte ranges are // sound. (Same-byte concurrent access remains the caller's // responsibility per the trait contract.) let mem_arc = std::sync::Arc::new(mem); // Spawn the real GPU worker if the threaded backend was chosen at // kernel-construction time. The handle the kernel already holds // (`GpuBackend::Threaded`) is the CPU-side proxy; the worker owns // the actual `GpuSystem` and runs its `run()` loop — see // `xenia_gpu::handle::GpuWorker::run` for the concurrency model. // M1.3's `spawn_noop_worker` is now superseded for the threaded // path; the no-op helper is retained for unit tests. let gpu_thread_resources = if let Some(worker) = maybe_gpu_worker.take() { info!("gpu: threaded backend — spawning worker thread"); let join = xenia_gpu::spawn_gpu_worker(worker, mem_arc.clone()); Some((shutdown_arc.clone(), join)) } else { None }; // M1.7: capture the worker-thread handle before the dispatch so the // shutdown path can unpark it without holding `kernel` (the UI path // moves `kernel` into `run_with_ui`). In headless+threaded mode the // kernel survives the dispatch, but extracting here also covers the // hypothetical UI+threaded mix once that ergonomic landmine is // resolved. let worker_thread_handle = if gpu_thread_resources.is_some() { Some(kernel.gpu.mmio().worker_thread.clone()) } else { None }; let result = if ui { run_with_ui( path, // `run_with_ui` consumes `GuestMemory` by value today; M1.4 // keeps that path on the inline backend until the UI worker // is migrated to the Arc-shared model. Recover ownership via // `Arc::try_unwrap` — succeeds because the GPU worker is not // spawned in inline mode (`maybe_gpu_worker` is `None`). std::sync::Arc::try_unwrap(mem_arc).unwrap_or_else(|_| { panic!( "M1.4: --ui + --gpu-thread cohabitation not yet wired; \ choose one" ) }), kernel, debugger, thunk_map, db_writer, max_instructions, ips_limit, quiet, halt_on_deadlock, started, ) } else { // Wrap the headless body in a closure so any early returns // (digest mismatch, db finalize error) flow through the GPU // thread shutdown below rather than leaking the worker. (|| -> Result<()> { // M3 — opt-in `--parallel` flag spawns the interpreter on a // dedicated worker thread under `Arc>`. // The first M3 milestone delivers the spawn substrate (N=1 // worker, kernel wrapped in Arc>); the actual // per-HW-thread parallel spawn is the follow-up. With N=1 // and the worker holding the lock for the run's duration, // observable execution is bit-identical to lockstep mode — // golden must still match. let env_parallel = std::env::var("XENIA_PARALLEL") .ok() .is_some_and(|v| { let v = v.trim().to_ascii_lowercase(); v == "1" || v == "true" || v == "yes" }); let do_parallel = parallel || env_parallel; // Step 04 gate: --parallel runs N=6 workers that release // the kernel mutex around step_block, so the per-instruction // observation path (debugger hooks, DB writer, force-per-instr) // would either contend severely or deadlock. Reject the // combination at the boundary with a clean error. if do_parallel { if debugger.wants_hooks() { anyhow::bail!( "--parallel is incompatible with debugger hooks; use lockstep mode" ); } if db_writer.is_some() { anyhow::bail!( "--parallel is incompatible with --db / DB writer; use lockstep mode" ); } if std::env::var("XENIA_FORCE_PER_INSTR") .map(|v| v == "1" || v.eq_ignore_ascii_case("true")) .unwrap_or(false) { anyhow::bail!( "--parallel is incompatible with XENIA_FORCE_PER_INSTR=1; use lockstep mode" ); } } let stats = if do_parallel { if !quiet { info!( "M3 — spawning N=6 worker threads + main coordinator (7-party phaser)", ); } let kernel_arc = std::sync::Arc::new(std::sync::Mutex::new(kernel)); let mem_for_worker = mem_arc.clone(); let kernel_for_worker = kernel_arc.clone(); // Move debugger + db_writer + thunk_map into the worker; // the worker returns them so the caller can resume // post-run analysis (digest, summary, diagnostic dump). let max_instructions_w = max_instructions; let ips_limit_w = ips_limit; let quiet_w = quiet; let halt_w = halt_on_deadlock; let mut debugger_w = debugger; let mut db_writer_w = db_writer; let thunk_map_w = thunk_map; let join = std::thread::Builder::new() .name("xenia-cpu-host".into()) .spawn(move || -> ( ExecStats, xenia_debugger::Debugger, Option, std::collections::HashMap, ) { // M3 real-par Step 03: per-round drop-and-reacquire // around step_block. Worker no longer holds the // mutex for the run's duration; `run_execution_parallel` // owns the locking dance internally. let stats = run_execution_parallel( &*mem_for_worker, &kernel_for_worker, &mut debugger_w, &thunk_map_w, db_writer_w.as_mut(), max_instructions_w, ips_limit_w, quiet_w, halt_w, None, ); (stats, debugger_w, db_writer_w, thunk_map_w) }) .expect("spawn xenia-cpu-host"); let (stats, debugger_back, db_writer_back, thunk_map_back) = join.join().expect("xenia-cpu-host worker panicked"); debugger = debugger_back; db_writer = db_writer_back; let _ = thunk_map_back; // keep alive via shadowing already handled // Recover the kernel from the Arc>. The worker // dropped its clone on return; only one strong ref // remains, so `try_unwrap` succeeds. kernel = std::sync::Arc::try_unwrap(kernel_arc) .map_err(|_| anyhow::anyhow!("M3: kernel Arc still has clones"))? .into_inner() .map_err(|_| anyhow::anyhow!("M3: kernel mutex poisoned"))?; stats } else { run_execution( &*mem_arc, &mut kernel, &mut debugger, &thunk_map, db_writer.as_mut(), max_instructions, ips_limit, quiet, halt_on_deadlock, None, ) }; // After the conditional bound `stats`, `kernel`, // `debugger`, `db_writer` above, post-run analysis continues // unchanged: db finalize, print_summary, dump_thread_diagnostic, // run digest emit + golden compare. if let Some(ref mut db) = db_writer { db.finalize_traces()?; } print_summary(kernel.scheduler.ctx(0), &debugger, &db_writer, quiet); dump_thread_diagnostic(&kernel, &*mem_arc, quiet); info!( wall_ms = started.elapsed().as_millis() as u64, instructions = stats.instruction_count, import_calls = stats.import_count, unimplemented = stats.unimpl_count, "exec complete" ); // P8 — optional run-digest emission. Built from the live // `kernel.gpu.stats` + `ExecStats` after the run completes; // stable ordering + stable field names so byte-level JSON diffs // catch any drift between runs. if digest_out.is_some() || digest_expect.is_some() { let digest = RunDigest::capture(path, &kernel, &stats); let json = if stable_digest { digest.stable_fields_json() } else { digest.to_json() }; if let Some(out_path) = digest_out { std::fs::write(out_path, &json)?; info!(out = out_path, stable = stable_digest, "run digest written"); } else { println!("{json}"); } if let Some(expect_path) = digest_expect { let expected = std::fs::read_to_string(expect_path) .map_err(|e| anyhow::anyhow!("failed to read expected digest: {e}"))?; if json.trim() != expected.trim() { eprintln!("DIGEST MISMATCH vs {expect_path}"); eprintln!("--- expected ---"); eprintln!("{}", expected.trim()); eprintln!("--- actual ---"); eprintln!("{}", json.trim()); return Err(anyhow::anyhow!( "run digest differs from golden; see diff above" )); } info!("run digest matches golden"); } } Ok(()) })() }; // M1.7 — signal the GPU worker thread to exit and join it with a // 1 s defensive timeout. We also `unpark()` it via the MMIO // worker-thread handle so it observes shutdown immediately rather // than waiting up to `WORKER_PARK_TIMEOUT` (16 ms) on its own. The // helper logs at `error!` if the join blows the budget; we emit a // `warn!` here as a second-level breadcrumb tied to this specific // run. We do NOT propagate the timeout as an error: process // teardown is more important than blocking on a stuck thread. if let Some((shutdown, join)) = gpu_thread_resources { // Wake the parker so the worker observes `shutdown` on its next // loop iteration without waiting for `park_timeout`. if let Some(handle_arc) = worker_thread_handle { if let Ok(g) = handle_arc.lock() { if let Some(t) = g.as_ref() { t.unpark(); } } } if xenia_gpu::shutdown_and_join_with_timeout( &shutdown, join, std::time::Duration::from_secs(1), ) .is_err() { warn!("GPU worker thread did not exit in 1s; continuing teardown"); } } result } /// Result of one execution pass — used by both the headless path and the /// worker thread behind `--ui`. #[derive(Default)] struct ExecStats { instruction_count: u64, import_count: u64, unimpl_count: u64, } /// P8 — deterministic run digest. Emitted as stable-ordered JSON by /// `xenia-rs check`. Every field is a u64 counter; the file is trivially /// diffable via `diff` / `cmp`. New counters are appended at the bottom /// so golden files authored today still match tomorrow's schema minus /// any new lines. Semver-wise this is a "pre-1.0" API — regressions bail /// on first mismatch, no field-wise tolerance. struct RunDigest { path: String, instructions: u64, imports: u64, unimpl: u64, packets: u64, draws: u64, swaps: u64, resolves: u64, unique_render_targets: u64, shader_blobs_live: u64, interrupts_delivered: u64, interrupts_dropped: u64, texture_cache_entries: u64, texture_decodes: u64, } impl RunDigest { fn capture(path: &str, kernel: &xenia_kernel::KernelState, stats: &ExecStats) -> Self { // M1.4: route stats through the backend snapshot accessor. Inline // mode reads directly from the live `GpuSystem`; threaded mode // reads from the worker's published `Arc>` // (refreshed each outer loop iteration on the worker thread). let snap = kernel.gpu.digest_snapshot(); Self { path: path.to_string(), instructions: stats.instruction_count, imports: stats.import_count, unimpl: stats.unimpl_count, packets: snap.stats.packets_executed, draws: snap.stats.draws_seen, swaps: snap.stats.swaps_seen, resolves: snap.stats.resolves_total, unique_render_targets: snap.stats.unique_render_targets, shader_blobs_live: snap.shader_blobs_live, interrupts_delivered: kernel.interrupts.delivered, interrupts_dropped: kernel.interrupts.dropped, texture_cache_entries: snap.texture_cache_entries, texture_decodes: snap.texture_decodes, } } fn to_json(&self) -> String { // Hand-rolled deterministic ordering — alphabetical-ish + keys // quoted. No external dep, no field ambiguity. format!( "{{\n \"path\": \"{}\",\n \ \"instructions\": {},\n \ \"imports\": {},\n \ \"unimpl\": {},\n \ \"packets\": {},\n \ \"draws\": {},\n \ \"swaps\": {},\n \ \"resolves\": {},\n \ \"unique_render_targets\": {},\n \ \"shader_blobs_live\": {},\n \ \"interrupts_delivered\": {},\n \ \"interrupts_dropped\": {},\n \ \"texture_cache_entries\": {},\n \ \"texture_decodes\": {}\n}}\n", escape_json_string(&self.path), self.instructions, self.imports, self.unimpl, self.packets, self.draws, self.swaps, self.resolves, self.unique_render_targets, self.shader_blobs_live, self.interrupts_delivered, self.interrupts_dropped, self.texture_cache_entries, self.texture_decodes, ) } /// Stable-fields-only digest for goldens that need to survive non-determinism. /// Excludes timing-sensitive counters: `packets` has documented ±2.5–8% lockstep /// noise from a GPU thread race; `resolves`, `interrupts_delivered`, /// `interrupts_dropped`, and `texture_decodes` are scheduling-sensitive under /// `--parallel`. Also omits `path` (cwd-dependent). The remaining fields are /// deterministic in lockstep at a fixed instruction budget. Use via /// `--stable-digest`. fn stable_fields_json(&self) -> String { format!( "{{\n \"instructions\": {},\n \ \"imports\": {},\n \ \"unimpl\": {},\n \ \"draws\": {},\n \ \"swaps\": {},\n \ \"unique_render_targets\": {},\n \ \"shader_blobs_live\": {},\n \ \"texture_cache_entries\": {}\n}}\n", self.instructions, self.imports, self.unimpl, self.draws, self.swaps, self.unique_render_targets, self.shader_blobs_live, self.texture_cache_entries, ) } } fn escape_json_string(s: &str) -> String { s.replace('\\', r"\\").replace('"', r#"\""#) } /// Round-control signal returned by the per-round coordinator helpers. /// `BreakOuter` halts the scheduler loop; `Continue` resumes the outer /// loop (either to step the round or to retry after an idle-advance). #[derive(Debug)] enum RoundCtl { BreakOuter, Continue, } /// Per-round prologue: max-instruction budget check, IPS throttle, UI /// shutdown poll, heartbeat log, vsync ticker (with d1mode bit set when /// the vsync fires), pending-timer fire, graphics-interrupt injection. /// Returns `BreakOuter` only when the run hits its budget or the UI /// asks for shutdown. fn coord_pre_round( kernel: &mut xenia_kernel::KernelState, stats: &ExecStats, max_instructions: Option, ips_limit: Option, throttle_start: Instant, shutdown: &Option>, ) -> RoundCtl { const SHUTDOWN_CHECK_MASK: u64 = 0x1FFF; const HEARTBEAT_MASK: u64 = 0xF_FFFF; // ~1M instructions if let Some(limit) = max_instructions && stats.instruction_count >= limit { info!(limit, "reached max instruction count"); return RoundCtl::BreakOuter; } if let Some(ips) = ips_limit && ips > 0 && stats.instruction_count > 0 { let expected_ns = (stats.instruction_count as u128) .saturating_mul(1_000_000_000u128) / ips as u128; let actual_ns = throttle_start.elapsed().as_nanos(); if actual_ns < expected_ns { let sleep_ns = (expected_ns - actual_ns).min(u64::MAX as u128) as u64; std::thread::sleep(std::time::Duration::from_nanos(sleep_ns)); } } if (stats.instruction_count & SHUTDOWN_CHECK_MASK) == 0 { if let Some(flag) = shutdown && flag.load(std::sync::atomic::Ordering::Relaxed) { info!(instructions = stats.instruction_count, "UI requested shutdown"); return RoundCtl::BreakOuter; } if let Some(ref ui) = kernel.ui { ui.instructions_counter .store(stats.instruction_count, std::sync::atomic::Ordering::Relaxed); } } if (stats.instruction_count & HEARTBEAT_MASK) == 0 && stats.instruction_count > 0 && tracing::enabled!(tracing::Level::DEBUG) { let t = kernel.scheduler.ctx(0); debug!( cycle = stats.instruction_count, pc = format_args!("{:#010x}", t.pc), lr = format_args!("{:#010x}", t.lr as u32), sp = format_args!("{:#010x}", t.gpr[1] as u32), "cpu heartbeat" ); } // KRNBUG-D08: backend-aware v-sync ticker. // // **Lockstep**: instruction-count ticker (deterministic; one tick per // PPC block boundary, predictable cadence). The cadence drifts a bit // from real 60 Hz but is bit-stable across runs, which matters for // the `sylpheed_n*m.json` golden oracles. // // **--parallel**: wall-clock ticker. The instruction-count proxy // dropped from 629 v-syncs/100M lockstep to ~2 under `--parallel` // (audit M11) because the dispatcher executes more PPC instructions // per tick callback when 6 worker threads share the kernel mutex, // so the accumulator never crosses the 150k threshold. Wall-clock // restores the ~60 Hz rate at the cost of bit-exact run reproducibility, // which is acceptable under `--parallel` (M11 already documented // `--parallel` as non-deterministic by design). let fired = if kernel.parallel_active { kernel.interrupts.tick_vsync_wallclock() } else { kernel.interrupts.tick_vsync_instr(stats.instruction_count) }; if fired { use std::sync::atomic::Ordering; let mmio = kernel.gpu.mmio(); let prev = mmio.d1mode_vblank_vline_status.load(Ordering::Relaxed); mmio.d1mode_vblank_vline_status .store(prev | 0x1, Ordering::Relaxed); } if kernel.xaudio_tick_enabled { if kernel.parallel_active { kernel.xaudio.tick_wallclock(); } else { kernel.xaudio.tick_instr(stats.instruction_count); } } kernel.fire_due_timers(); try_inject_graphics_interrupt(kernel); if kernel.xaudio_tick_enabled { try_inject_audio_callback(kernel); } RoundCtl::Continue } /// Per-round idle-advance: invoked when `round_schedule()` returns no /// runnable threads. Advances time to the earliest pending timer or /// timed-wait deadline, fires due timers, and handles deadline wakes. /// On hard deadlock either halts (with diagnostics) or force-wakes /// blocked waiters with `STATUS_TIMEOUT`. `Continue` restarts the /// outer loop; `BreakOuter` halts the scheduler. fn coord_idle_advance( kernel: &mut xenia_kernel::KernelState, halt_on_deadlock: bool, shutdown: &Option>, stats: &ExecStats, ) -> RoundCtl { let next_timer = kernel.earliest_timer_deadline(); let next_wait = kernel.scheduler.earliest_wait_deadline(); let target = match (next_timer, next_wait) { (Some(a), Some(b)) => Some(a.min(b)), (Some(a), None) => Some(a), (None, Some(b)) => Some(b), (None, None) => None, }; if let Some(target) = target { kernel.scheduler.advance_all_timebases_to(target); kernel.fire_due_timers(); if let Some((r, reason)) = kernel.scheduler.advance_to_next_wake_if_due(target) { kernel.handle_timeout_wake(r, reason); } return RoundCtl::Continue; } if kernel.scheduler.has_live_thread() { if halt_on_deadlock { tracing::warn!( instructions = stats.instruction_count, "scheduler deadlock: halting (--halt-on-deadlock); dumping per-thread diagnostic" ); for (hw_id, slot) in kernel.scheduler.slots.iter().enumerate() { if slot.runqueue.is_empty() { tracing::warn!(hw = hw_id, state = "empty", "deadlock slot"); continue; } for (idx, t) in slot.runqueue.iter().enumerate() { tracing::warn!( hw = hw_id, idx = idx, tid = t.tid, state = ?t.state, pc = format_args!("{:#010x}", t.ctx.pc), lr = format_args!("{:#010x}", t.ctx.lr as u32), sp = format_args!("{:#010x}", t.ctx.gpr[1] as u32), "deadlock thread" ); } } metrics::counter!("scheduler.deadlock_halts").increment(1); if let Some(flag) = shutdown { flag.store(true, std::sync::atomic::Ordering::Relaxed); } return RoundCtl::BreakOuter; } tracing::warn!( snapshot = ?kernel.scheduler.diagnostic_snapshot(), "scheduler deadlock: force-waking blocked waiters with STATUS_TIMEOUT" ); let woken = kernel.scheduler.unblock_on_deadlock(); // STATUS_TIMEOUT = 0x00000102 per NT conventions. const STATUS_TIMEOUT: u64 = 0x0000_0102; for r in &woken { kernel.scheduler.ctx_mut_ref(*r).gpr[3] = STATUS_TIMEOUT; } metrics::counter!("scheduler.deadlock_recoveries").increment(1); if woken.is_empty() { info!("scheduler: deadlock recovery woke zero threads — halting"); return RoundCtl::BreakOuter; } return RoundCtl::Continue; } info!("scheduler: no live HW threads remain — halting"); RoundCtl::BreakOuter } /// Per-round epilogue: clear the slot visit, drive the inline GPU /// proportional to instructions executed in this round, drain any /// GPU-side pending interrupts, and check liveness. Returns /// `BreakOuter` when no live HW threads remain. fn coord_post_round( kernel: &mut xenia_kernel::KernelState, mem: &xenia_memory::GuestMemory, stats: &ExecStats, instrs_at_round_start: u64, ) -> RoundCtl { kernel.scheduler.end_slot_visit(); let executed_this_round = stats .instruction_count .saturating_sub(instrs_at_round_start); let mut gpu_runs = (executed_this_round / xenia_cpu::scheduler::HW_THREAD_COUNT as u64) .max(1); if gpu_runs > 64 { gpu_runs = 64; } if let Some(gpu) = kernel.gpu.as_inline_mut() { gpu.sync_with_mmio(); for _ in 0..gpu_runs { if gpu.is_ready(mem) { let _ = gpu.execute_one(mem); gpu.sync_with_mmio(); } else { break; } } } else { let _ = gpu_runs; } if kernel.gpu.has_pending_interrupts() { for _pi in kernel.gpu.take_pending_interrupts() { kernel .interrupts .queue_interrupt(xenia_kernel::INTERRUPT_SOURCE_CP); } } if !kernel.scheduler.has_live_thread() { info!("scheduler: all threads exited — halting"); return RoundCtl::BreakOuter; } RoundCtl::Continue } /// Per-worker state for one HW slot. Owns its block cache + decode /// cache so that under per-HW-thread parallelism (Step 04) the worker /// thread can step its slot without sharing mutable cache state with /// peers. In lockstep mode this is just six independent caches on the /// same host thread — observable behavior identical, slightly higher /// per-slot fill churn. struct WorkerCtx { hw_id: u8, block_cache: xenia_cpu::block_cache::BlockCache, decode_cache: xenia_cpu::decoder::DecodeCache, force_per_instr: bool, } impl WorkerCtx { fn new(hw_id: u8, force_per_instr: bool) -> Self { Self { hw_id, block_cache: xenia_cpu::block_cache::BlockCache::new(), decode_cache: xenia_cpu::decoder::DecodeCache::new(), force_per_instr, } } } /// Outcome of `worker_prologue`. Either the slot was fully handled /// inline (halt-sentinel, thunk dispatch, per-instruction step), or /// the caller must run a block-cache `step_block` and then call /// `worker_epilogue`, or the outer scheduler loop must break (fault, /// debugger break, main-thread halt). enum PrologueOutcome { Continue, BreakOuter, StepBlock { tid: Option, thread_ref: xenia_cpu::ThreadRef, block_ptr: *const xenia_cpu::block_cache::DecodedBlock, pc_before: u32, }, } enum SlotOutcome { Continue, BreakOuter, } /// Per-slot prologue. Performs `begin_slot_visit`, halt-sentinel /// detection + restore, import-thunk dispatch (under the kernel lock), /// unmapped-PC fault check, and either kicks off a block-cache step /// (returning `StepBlock`) or runs the per-instruction observation /// path inline (debugger / DB writer). When this returns /// `PrologueOutcome::Continue` the caller skips to the next slot; /// `BreakOuter` exits the scheduler loop; `StepBlock` hands off to a /// `step_block(ctx, mem, &*block_ptr)` call followed by /// `worker_epilogue`. fn worker_prologue( wc: &mut WorkerCtx, kernel: &mut xenia_kernel::KernelState, mem: &xenia_memory::GuestMemory, debugger: &mut xenia_debugger::Debugger, db_writer: &mut Option<&mut xenia_analysis::DbWriter>, thunk_map: &HashMap, stats: &mut ExecStats, ) -> PrologueOutcome { use xenia_cpu::interpreter::{step_cached, StepResult}; use xenia_cpu::scheduler::{HwState, INITIAL_GUEST_TID}; use xenia_cpu::PpcOpcode; const LR_HALT: u32 = xenia_cpu::context::LR_HALT_SENTINEL as u32; let hw_id = wc.hw_id; kernel.scheduler.begin_slot_visit(hw_id); // Under per-HW-thread parallelism (run_execution_parallel) the // coordinator's runnable-mask snapshot may be stale by the time // this worker picks up the lock — a peer worker's call_export // could have blocked the slot's last Ready thread. begin_slot_visit // then sets running_idx = None and ctx(hw_id) returns the idle // sentinel (pc = 0). Without this short-circuit the prologue would // fall through to the unmapped-PC fault and break the outer loop. if kernel.scheduler.current.is_none() { return PrologueOutcome::Continue; } let pc = kernel.scheduler.ctx(hw_id).pc; // 0) Diagnostic ctor-probe: if `pc` is in // `kernel.ctor_probe_pcs`, capture live r3/lr/sp + back-chain // and println one record. Read-only; lockstep digest unaffected. // Empty set is the common case → single `is_empty()` test inside // the helper, no overhead on the hot path. kernel.fire_ctor_probe_if_match(hw_id, mem); kernel.fire_branch_probe_if_match(hw_id); if mem.has_mem_watch() { let ctx = kernel.scheduler.ctx(hw_id); let tid = kernel.scheduler.tid(hw_id).unwrap_or(0); xenia_memory::set_writer_ctx(tid, ctx.pc, ctx.lr as u32); } // 1) Halt-sentinel check (per HW thread). if pc == LR_HALT { let injected_here = kernel.interrupts.saved.is_some() && kernel .interrupts .injected_ref .map(|r| r.hw_id == hw_id) == Some(true); if injected_here && let Some(saved) = kernel.interrupts.saved.take() { let target_ref = kernel .interrupts .injected_ref .expect("injected_here guard"); saved.restore(kernel.scheduler.ctx_mut_ref(target_ref)); kernel.interrupts.delivered += 1; let source = saved.source; let mut restore_outcome = "ready"; let current = kernel.scheduler.thread(target_ref).state.clone(); if let HwState::ServicingIrq(reason) = current { kernel.scheduler.thread_mut(target_ref).state = HwState::Blocked(reason); restore_outcome = "reblocked"; } tracing::debug!( source, hw_id, outcome = restore_outcome, "graphics interrupt: callback returned" ); return PrologueOutcome::Continue; } let tid = kernel.scheduler.tid(hw_id); tracing::info!( hw_id, ?tid, is_main = tid == Some(INITIAL_GUEST_TID), cycle = stats.instruction_count, "HW thread returned to LR sentinel — marking exited" ); let (_, _exited_tid, handle_opt) = kernel.scheduler.exit_current(0); if let Some(h) = handle_opt && let Some(xenia_kernel::objects::KernelObject::Thread { exit_code, waiters, .. }) = kernel.objects.get_mut(&h) { *exit_code = Some(0); let to_wake: Vec = std::mem::take(waiters); for w in to_wake { kernel.scheduler.wake_ref(w); } } return PrologueOutcome::Continue; } // 2) Import thunk intercept. if let Some((module, ordinal, name)) = thunk_map.get(&pc) { let module = *module; let ordinal_u32 = *ordinal as u32; let thunk_pc = pc; let args = { let c = kernel.scheduler.ctx(hw_id); [c.gpr[3], c.gpr[4], c.gpr[5], c.gpr[6]] }; kernel.call_export(module, ordinal_u32, mem); let post_ref = kernel.scheduler.current; let c = match post_ref { Some(r) => kernel.scheduler.ctx_mut_ref(r), None => kernel.scheduler.ctx_mut(hw_id), }; let return_value = c.gpr[3]; c.pc = c.lr as u32; c.cycle_count += 1; c.timebase += 1; if let Some(db) = db_writer.as_deref_mut() { db.log_import_call(xenia_analysis::ImportCallEntry { address: thunk_pc, cycle: c.cycle_count, module: match module { ModuleId::Xboxkrnl => "xboxkrnl.exe".to_string(), ModuleId::Xam => "xam.xex".to_string(), ModuleId::Xbdm => "xbdm.xex".to_string(), }, ordinal: *ordinal, name: name.clone(), arg_r3: args[0], arg_r4: args[1], arg_r5: args[2], arg_r6: args[3], return_value, }); } stats.instruction_count += 1; stats.import_count += 1; kernel.scheduler.decrement_quantum(); return PrologueOutcome::Continue; } // 3) Unmapped PC. if !mem.is_mapped(pc) { tracing::error!( cycle = stats.instruction_count, pc = format_args!("{:#010x}", pc), hw_id, "FAULT: PC in unmapped memory" ); return PrologueOutcome::BreakOuter; } // 4) Decide block-cache fast path vs per-instruction observation. let hooks = debugger.wants_hooks(); let observe_per_instruction = hooks || db_writer.is_some() || wc.force_per_instr; if !observe_per_instruction { // Block-cache fast path: look up the block under the kernel // lock and hand it to the caller via raw pointer. The pointer // is sound for the duration of this slot's step_block call // because `wc.block_cache` is owned by this worker (no peer // mutation possible) and the cache slot is not touched again // until the next prologue iteration. let tid = kernel.scheduler.tid(hw_id); let thread_ref = kernel .scheduler .current .expect("begin_slot_visit set scheduler.current to Some when slot has runnable thread"); let block_ptr: *const xenia_cpu::block_cache::DecodedBlock = { let pc_for_lookup = kernel.scheduler.ctx(hw_id).pc; let b: &xenia_cpu::block_cache::DecodedBlock = wc.block_cache.lookup_or_build(pc_for_lookup, mem); b }; return PrologueOutcome::StepBlock { tid, thread_ref, block_ptr, pc_before: pc, }; } // 5) Per-instruction observation path (debugger / DB). let result = { let ctx = kernel.scheduler.ctx_mut(hw_id); if hooks { debugger.pre_step(ctx, mem); } let page_ver = mem.page_version(ctx.pc); let r = step_cached(ctx, mem, &mut wc.decode_cache, page_ver); stats.instruction_count += 1; if let Some(db) = db_writer.as_deref_mut() { let raw_before = mem.read_u32(pc); let opcode_before = xenia_cpu::decode(raw_before, pc).opcode; db.log_instruction(xenia_analysis::ExecTraceEntry { address: pc, cycle: ctx.cycle_count, r3: ctx.gpr[3], r4: ctx.gpr[4], lr: ctx.lr, sp: ctx.gpr[1], }); if opcode_before.is_branch() && ctx.pc != pc.wrapping_add(4) { let lk = (raw_before & 1) == 1; let kind: &'static str = if lk { "call" } else if opcode_before == PpcOpcode::bclrx { "return" } else if opcode_before == PpcOpcode::bcctrx { "jump" } else { "branch" }; db.log_branch(xenia_analysis::BranchTraceEntry { cycle: ctx.cycle_count, source: pc, target: ctx.pc, kind, lr: ctx.lr, }); } } if hooks { debugger.post_step(ctx, mem); } r }; match result { StepResult::Continue => {} StepResult::SystemCall => { tracing::warn!("SYSCALL at {:#010x} (hw={})", pc, hw_id); } StepResult::Unimplemented(op) => { stats.unimpl_count += 1; metrics::counter!("cpu.unimplemented").increment(1); if stats.unimpl_count <= 50 { warn!( cycle = stats.instruction_count, ?op, pc = format_args!("{:#010x}", pc), hw_id, "UNIMPL" ); } else if stats.unimpl_count == 51 { info!("suppressing further UNIMPL messages"); } } StepResult::Trap => { tracing::warn!(pc = format_args!("{:#010x}", pc), hw_id, "TRAP"); } StepResult::Halted => { info!(cycle = stats.instruction_count, hw_id, "HALTED"); let tid = kernel.scheduler.tid(hw_id); if tid == Some(INITIAL_GUEST_TID) { return PrologueOutcome::BreakOuter; } kernel.scheduler.exit_current(0); } } kernel.scheduler.decrement_quantum(); if debugger.should_break() { let pc_now = kernel.scheduler.ctx(hw_id).pc; info!( cycle = stats.instruction_count, pc = format_args!("{:#010x}", pc_now), hw_id, "BREAK" ); return PrologueOutcome::BreakOuter; } PrologueOutcome::Continue } /// Per-slot epilogue for the block-cache fast path. Charges /// quantum decrements, applies the StepResult, and returns whether /// the outer scheduler loop should continue or break. SAFETY: caller /// must guarantee `block_ptr` still points at a live `DecodedBlock` /// (it does, because the worker owns `wc.block_cache` and didn't /// re-enter `lookup_or_build` between prologue and epilogue). fn worker_epilogue( wc: &mut WorkerCtx, kernel: &mut xenia_kernel::KernelState, debugger: &mut xenia_debugger::Debugger, stats: &mut ExecStats, _tid: Option, _thread_ref: xenia_cpu::ThreadRef, block_ptr: *const xenia_cpu::block_cache::DecodedBlock, pc_before: u32, result: xenia_cpu::interpreter::StepResult, executed: u64, ) -> SlotOutcome { use xenia_cpu::interpreter::StepResult; use xenia_cpu::scheduler::INITIAL_GUEST_TID; let hw_id = wc.hw_id; let block = unsafe { &*block_ptr }; stats.instruction_count = stats.instruction_count.wrapping_add(executed); for _ in 0..executed { kernel.scheduler.decrement_quantum(); } match result { StepResult::Continue => {} StepResult::SystemCall => { let last_pc = block.instrs.last().map(|i| i.addr).unwrap_or(pc_before); tracing::warn!("SYSCALL at {:#010x} (hw={})", last_pc, hw_id); } StepResult::Unimplemented(op) => { stats.unimpl_count += 1; metrics::counter!("cpu.unimplemented").increment(1); let unimpl_pc = kernel.scheduler.ctx(hw_id).pc.wrapping_sub(4); if stats.unimpl_count <= 50 { warn!( cycle = stats.instruction_count, ?op, pc = format_args!("{:#010x}", unimpl_pc), hw_id, "UNIMPL" ); } else if stats.unimpl_count == 51 { info!("suppressing further UNIMPL messages"); } } StepResult::Trap => { let last_pc = block.instrs.last().map(|i| i.addr).unwrap_or(pc_before); tracing::warn!(pc = format_args!("{:#010x}", last_pc), hw_id, "TRAP"); } StepResult::Halted => { info!(cycle = stats.instruction_count, hw_id, "HALTED"); let tid = kernel.scheduler.tid(hw_id); if tid == Some(INITIAL_GUEST_TID) { return SlotOutcome::BreakOuter; } kernel.scheduler.exit_current(0); } } if debugger.should_break() { let pc_now = kernel.scheduler.ctx(hw_id).pc; info!( cycle = stats.instruction_count, pc = format_args!("{:#010x}", pc_now), hw_id, "BREAK" ); return SlotOutcome::BreakOuter; } SlotOutcome::Continue } #[instrument(skip_all, fields(max = ?max_instructions, ips = ?ips_limit))] fn run_execution( mem: &xenia_memory::GuestMemory, kernel: &mut xenia_kernel::KernelState, debugger: &mut xenia_debugger::Debugger, thunk_map: &HashMap, mut db_writer: Option<&mut xenia_analysis::DbWriter>, max_instructions: Option, ips_limit: Option, quiet: bool, halt_on_deadlock: bool, shutdown: Option>, ) -> ExecStats { use xenia_cpu::interpreter::step_block; let mut stats = ExecStats::default(); let _ = quiet; // retained for future per-kind suppression // `--halt-on-deadlock` CLI flag OR `XENIA_HALT_ON_DEADLOCK=1|true` env var: // when the scheduler next hits a hard deadlock (every live HW thread // blocked on a handle wait with no pending timer) we bail out with a // per-thread diagnostic instead of force-waking waiters with // STATUS_TIMEOUT. Env-var fallback is cheap and lets `just` recipes / // one-off shells flip the behaviour without rewiring the subcommand. let halt_on_deadlock = halt_on_deadlock || std::env::var("XENIA_HALT_ON_DEADLOCK") .map(|v| v == "1" || v.eq_ignore_ascii_case("true")) .unwrap_or(false); // Throttle anchor: wall-clock at which instruction 0 was retired. Used to // pace execution when `ips_limit` is set — each round we compute the // expected elapsed time for the current retired-count and sleep the // difference if we're running ahead of schedule. let throttle_start = Instant::now(); // Escape hatch for A/B testing: when set, every dispatch goes // through `step_cached` even outside the debugger/DB trace modes. // Useful for pinpointing whether a behaviour regression is the // block-cache fast path's fault or something else. let force_per_instr = std::env::var("XENIA_FORCE_PER_INSTR") .map(|v| v == "1" || v.eq_ignore_ascii_case("true")) .unwrap_or(false); // M3.2 / M3-realpar Step 02 — six per-HW-slot worker contexts, each // owning its own block cache + decode cache. Under per-HW-thread // parallelism (Step 04) each WorkerCtx will be moved into its own // worker thread; in lockstep it's six independent caches on the // same host thread. The block cache (1.5 MiB-ish per slot) lives // here so the prologue/epilogue split can borrow it directly. let mut workers: [WorkerCtx; xenia_cpu::scheduler::HW_THREAD_COUNT] = std::array::from_fn(|i| WorkerCtx::new(i as u8, force_per_instr)); 'outer: loop { // Per-round prologue: budget / shutdown / heartbeat / vsync / // timers / graphics-interrupt injection. Carved into // `coord_pre_round` so the parallel scheduler (Step 03+) can // call the same coordination logic between phaser barriers // without duplicating it from the lockstep path. match coord_pre_round( kernel, &stats, max_instructions, ips_limit, throttle_start, &shutdown, ) { RoundCtl::BreakOuter => break, RoundCtl::Continue => {} } // Snapshot round schedule. `round_schedule` also advances rng state // when seeded; mutation is intentional. kernel.scheduler.begin_round(); let order = kernel.scheduler.round_schedule(); if order.is_empty() { // No Ready threads — advance time to the earliest pending // deadline, fire timers, handle deadline wakes, and on hard // deadlock either halt (with diagnostics) or force-wake. // Logic carved into `coord_idle_advance`. match coord_idle_advance(kernel, halt_on_deadlock, &shutdown, &stats) { RoundCtl::BreakOuter => break, RoundCtl::Continue => continue, } } // Tier-4 perf: track guest-instruction progress within this // outer round so the post-round GPU pacer can run the GPU at a // rate proportional to executed instructions. With the // block-cache fast path executing up to MAX_BLOCK_INSTRS guest // instructions per inner iteration vs the per-instruction // path's 1, a fixed "once per round" pace would starve the // GPU when block dispatch engages. let instrs_at_round_start = stats.instruction_count; for hw_id in order { let wc = &mut workers[hw_id as usize]; match worker_prologue( wc, kernel, mem, debugger, &mut db_writer, thunk_map, &mut stats, ) { PrologueOutcome::Continue => continue, PrologueOutcome::BreakOuter => break 'outer, PrologueOutcome::StepBlock { tid, thread_ref, block_ptr, pc_before, } => { // Block-cache step. The lockstep path keeps the // kernel state borrowed straight through (single // host thread, no contention). Step 03 of the // M3 real-parallelism plan introduces a // drop-and-reacquire window around `step_block` // for the parallel branch. let cycle_before = kernel.scheduler.ctx_mut_ref(thread_ref).cycle_count; let block = unsafe { &*block_ptr }; let result = { let ctx = kernel.scheduler.ctx_mut_ref(thread_ref); step_block(ctx, mem, block) }; let executed = kernel .scheduler .ctx_mut_ref(thread_ref) .cycle_count .saturating_sub(cycle_before); match worker_epilogue( wc, kernel, debugger, &mut stats, tid, thread_ref, block_ptr, pc_before, result, executed, ) { SlotOutcome::Continue => continue, SlotOutcome::BreakOuter => break 'outer, } } } } // Per-round epilogue: end the slot visit, drive the inline // GPU proportional to instructions executed, drain GPU-side // pending interrupts, and break the outer loop if no live HW // threads remain. Carved into `coord_post_round`. match coord_post_round(kernel, mem, &stats, instrs_at_round_start) { RoundCtl::BreakOuter => break, RoundCtl::Continue => {} } } stats } /// M3 real-parallelism (Step 04) — N=6 worker threads + main-thread /// coordinator + 7-party phaser. Workers each own a `WorkerCtx` /// (per-slot block + decode caches), call `worker_prologue` / /// `step_block` / `worker_epilogue` on their slot, releasing the /// kernel mutex around the lock-free `step_block`. Coordinator runs /// `coord_pre_round` / `coord_idle_advance` / `coord_post_round` /// between two phaser barriers per round. /// /// Phaser (party_count = 7): /// - B1 (round-start gate): coordinator publishes runnable mask + drops /// kernel lock; workers wait. After B1 releases, workers see the /// mask and either (a) acquire the lock and process their slot, /// or (b) `phaser.skip(hw_id)` to skip B2. /// - B2 (round-end gate): workers signal completion. Coordinator /// reacquires the kernel lock and runs `coord_post_round`. /// /// Per-worker locking dance (when runnable): /// 1. Acquire kernel lock (stats lock briefly nested). /// 2. `worker_prologue` → `Continue` | `BreakOuter` | `StepBlock`. /// 3. On `StepBlock`: `mem::replace` ctx out; clear `scheduler.current` /// via `end_slot_visit`; drop kernel lock. /// 4. `step_block` on the local ctx (no lock held — actual parallelism). /// 5. Reacquire lock; `find_by_tid(tid)` to resolve post-migration /// ThreadRef; write ctx back; set `scheduler.current = Some(target_ref)` /// for the epilogue's `exit_current` path. /// 6. `worker_epilogue` (under lock + stats lock briefly). /// 7. Clear `scheduler.current = None`; drop kernel lock. /// 8. `phaser.arrive_and_wait(hw_id)` for B2. /// /// Lock order: kernel lock first, stats mutex second. No inversion. /// Workers never take stats without kernel; coordinator follows the /// same order. /// /// The parallel path is incompatible with debugger hooks, the DB /// writer, and `XENIA_FORCE_PER_INSTR=1` (these all force the /// per-instruction path which serializes the per-instruction /// `step_cached` under the kernel lock — no parallelism gain). We /// panic at entry rather than degrade silently. fn run_execution_parallel( mem: &xenia_memory::GuestMemory, kernel_arc: &std::sync::Arc>, debugger: &mut xenia_debugger::Debugger, thunk_map: &HashMap, mut db_writer: Option<&mut xenia_analysis::DbWriter>, max_instructions: Option, ips_limit: Option, quiet: bool, halt_on_deadlock: bool, shutdown_outer: Option>, ) -> ExecStats { use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::{Arc, Mutex}; use xenia_cpu::interpreter::step_block; use xenia_cpu::{Phaser, PhaserOutcome}; let _ = quiet; let _ = debugger; // Verified incompatible just below; kept in signature for symmetry with run_execution. // Defense-in-depth: cmd_exec_inner should already gate these out // with a friendly error, but if we got here with hooks attached, // the parallel path can't honor them. assert!( !debugger.wants_hooks(), "--parallel mode is incompatible with debugger hooks (per-instruction path required)" ); assert!( db_writer.is_none(), "--parallel mode is incompatible with --db / DB writer" ); assert!( !std::env::var("XENIA_FORCE_PER_INSTR") .map(|v| v == "1" || v.eq_ignore_ascii_case("true")) .unwrap_or(false), "--parallel mode is incompatible with XENIA_FORCE_PER_INSTR=1" ); let _ = (debugger, db_writer.take()); // suppress 'unused mut' / 'unused' on the bound param let halt_on_deadlock = halt_on_deadlock || std::env::var("XENIA_HALT_ON_DEADLOCK") .map(|v| v == "1" || v.eq_ignore_ascii_case("true")) .unwrap_or(false); let throttle_start = Instant::now(); const COORD_ID: u8 = xenia_cpu::scheduler::HW_THREAD_COUNT as u8; // = 6 const PARTY_COUNT: u32 = xenia_cpu::scheduler::HW_THREAD_COUNT as u32 + 1; let phaser: Arc = Arc::new(Phaser::new(PARTY_COUNT)); let internal_shutdown: Arc = Arc::new(AtomicBool::new(false)); let runnable_mask: [Arc; xenia_cpu::scheduler::HW_THREAD_COUNT] = std::array::from_fn(|_| Arc::new(AtomicBool::new(false))); // (Reserved for the race-free parking work deferred from Step 05; // see the Step 05 memo for the analysis. For now the per-worker // handle table is unused — workers always arrive at B1 and the // coordinator never calls unpark.) // Stats lives behind a Mutex shared between coordinator and workers. // Lock order: kernel mutex first, stats mutex second. Stats is // accessed only briefly (the duration of a helper invocation). let stats_mtx: Mutex = Mutex::new(ExecStats::default()); std::thread::scope(|scope| { for hw_id in 0..xenia_cpu::scheduler::HW_THREAD_COUNT as u8 { let phaser_w: Arc = phaser.clone(); let kernel_w = kernel_arc.clone(); let shutdown_w: Arc = internal_shutdown.clone(); let runnable_w: Arc = runnable_mask[hw_id as usize].clone(); let stats_ref: &Mutex = &stats_mtx; let mem_ref: &xenia_memory::GuestMemory = mem; let thunk_map_ref: &HashMap = thunk_map; scope .spawn(move || { // Workers run with no debugger hooks and no DB // writer (asserted in run_execution_parallel // entry). The local debugger is constructed in // the same "cold-run" shape as the main-thread // one (paused=false, step_mode=Run, // trace_enabled=false) so wants_hooks() is false // and worker_prologue takes the block-cache fast // path. Without this `Debugger::new()` defaults // to paused/trace-enabled and worker_prologue's // per-instruction path returns BreakOuter on the // first should_break check. let mut wc = WorkerCtx::new(hw_id, /*force_per_instr=*/false); let mut local_debugger = xenia_debugger::Debugger::new(); local_debugger.paused = false; local_debugger.step_mode = xenia_debugger::StepMode::Run; local_debugger.trace_enabled = false; let mut local_db_writer: Option<&mut xenia_analysis::DbWriter> = None; 'worker: loop { // B1: round-start gate. Workers ALWAYS arrive // at B1 — the phaser advance is the only race- // free synchronization with the coordinator's // mask publish. Reading runnable_w only after // B1's Release-Acquire pair guarantees we see // the current round's mask, not a stale one. match phaser_w.arrive_and_wait(hw_id) { PhaserOutcome::Advanced => {} PhaserOutcome::Shutdown => break 'worker, PhaserOutcome::Timeout => { tracing::warn!( hw_id, "worker: phaser B1 timeout — peer hung; shutting down" ); shutdown_w.store(true, Ordering::Release); phaser_w.shutdown(); break 'worker; } } if shutdown_w.load(Ordering::Acquire) { break 'worker; } // Read mask AFTER B1 advance (Acquire pair). if !runnable_w.load(Ordering::Acquire) { phaser_w.skip(hw_id); continue; } // We have work. Acquire the kernel lock. let prologue_outcome = { let mut guard = kernel_w.lock().expect("kernel mutex poisoned"); let mut s = stats_ref.lock().expect("stats mutex poisoned"); let r = worker_prologue( &mut wc, &mut *guard, mem_ref, &mut local_debugger, &mut local_db_writer, thunk_map_ref, &mut *s, ); // Drop stats lock first (reverse acquisition order). drop(s); // Don't drop the kernel guard yet for the // StepBlock arm — we need to mem::replace // before unlock. (r, guard) }; match prologue_outcome.0 { PrologueOutcome::Continue => { // Halt-sentinel restore, thunk // dispatch, or per-instruction step // ran inline. Drop the lock and arrive // at B2. drop(prologue_outcome.1); } PrologueOutcome::BreakOuter => { drop(prologue_outcome.1); shutdown_w.store(true, Ordering::Release); phaser_w.shutdown(); break 'worker; } PrologueOutcome::StepBlock { tid, thread_ref, block_ptr, pc_before, } => { let mut guard = prologue_outcome.1; // Snapshot ctx into a local; replace // the in-scheduler ctx with a fresh // (zeroed) PpcContext so peers can't // observe stale state. let mut ctx_taken = std::mem::replace( guard.scheduler.ctx_mut_ref(thread_ref), xenia_cpu::PpcContext::new(), ); let cycle_before = ctx_taken.cycle_count; // Clear scheduler.current so peers // don't see this slot as "running" // while the lock is unheld. guard.scheduler.end_slot_visit(); drop(guard); // ── unlocked window ─────────────── let block = unsafe { &*block_ptr }; let result = step_block(&mut ctx_taken, mem_ref, block); let executed = ctx_taken .cycle_count .saturating_sub(cycle_before); // ────────────────────────────────── let mut guard = kernel_w.lock().expect("kernel mutex poisoned"); // find_by_tid handles cross-worker // migration (a peer's call_export may // have moved this thread between slots). let target_ref = tid .and_then(|t| guard.scheduler.find_by_tid(t)) .unwrap_or(thread_ref); *guard.scheduler.ctx_mut_ref(target_ref) = ctx_taken; // worker_epilogue's exit_current path // expects scheduler.current to be set // to the running thread. guard.scheduler.current = Some(target_ref); let epilogue_outcome = { let mut s = stats_ref.lock().expect("stats mutex poisoned"); let r = worker_epilogue( &mut wc, &mut *guard, &mut local_debugger, &mut *s, tid, target_ref, block_ptr, pc_before, result, executed, ); drop(s); r }; // Clear current after epilogue so the // next worker that picks up the lock // doesn't observe it. guard.scheduler.current = None; drop(guard); if matches!(epilogue_outcome, SlotOutcome::BreakOuter) { shutdown_w.store(true, Ordering::Release); phaser_w.shutdown(); break 'worker; } } } // B2: round-end gate. Coordinator wakes when // all 6 workers + coordinator have arrived // (or skipped B2). match phaser_w.arrive_and_wait(hw_id) { PhaserOutcome::Advanced => {} PhaserOutcome::Shutdown => break 'worker, PhaserOutcome::Timeout => { tracing::warn!( hw_id, "worker: phaser B2 timeout — peer hung; shutting down" ); shutdown_w.store(true, Ordering::Release); phaser_w.shutdown(); break 'worker; } } } }); } // Coordinator runs on the main (calling) thread. 'outer: loop { // Pre-round under kernel + stats locks. let pre_outcome = { let mut guard = kernel_arc.lock().expect("kernel mutex poisoned"); let r = { let s = stats_mtx.lock().expect("stats mutex poisoned"); coord_pre_round( &mut *guard, &*s, max_instructions, ips_limit, throttle_start, &shutdown_outer, ) }; (r, guard) }; match pre_outcome.0 { RoundCtl::BreakOuter => { drop(pre_outcome.1); internal_shutdown.store(true, Ordering::Release); phaser.shutdown(); break 'outer; } RoundCtl::Continue => {} } let mut guard = pre_outcome.1; guard.scheduler.begin_round(); let order = guard.scheduler.round_schedule(); // Publish runnable mask for workers. All 6 workers arrive // at B1 unconditionally; the mask is published before // coord arrives so the phaser's Release on advance pairs // with workers' Acquire-after-B1 read of the mask. for id in 0..xenia_cpu::scheduler::HW_THREAD_COUNT as u8 { let runnable = order.contains(&id); runnable_mask[id as usize].store(runnable, Ordering::Release); } let instrs_at_round_start = stats_mtx.lock().expect("stats mutex poisoned").instruction_count; if order.is_empty() { let r = { let s = stats_mtx.lock().expect("stats mutex poisoned"); coord_idle_advance(&mut *guard, halt_on_deadlock, &shutdown_outer, &*s) }; drop(guard); match r { RoundCtl::BreakOuter => { internal_shutdown.store(true, Ordering::Release); phaser.shutdown(); break 'outer; } RoundCtl::Continue => { // Workers see runnable_mask all-false at B1 and // skip B2. Coordinator participates in both // barriers so the phase advances normally. match phaser.arrive_and_wait(COORD_ID) { PhaserOutcome::Advanced => {} PhaserOutcome::Shutdown => break 'outer, PhaserOutcome::Timeout => { tracing::warn!( "coordinator: phaser B1 (idle) timeout; shutting down" ); internal_shutdown.store(true, Ordering::Release); phaser.shutdown(); break 'outer; } } match phaser.arrive_and_wait(COORD_ID) { PhaserOutcome::Advanced => {} PhaserOutcome::Shutdown => break 'outer, PhaserOutcome::Timeout => { tracing::warn!( "coordinator: phaser B2 (idle) timeout; shutting down" ); internal_shutdown.store(true, Ordering::Release); phaser.shutdown(); break 'outer; } } continue; } } } drop(guard); // Release lock so workers can acquire for prologue/epilogue. // B1: signal workers to start. All 6 workers arrive at // B1 unconditionally; coord is the 7th party. match phaser.arrive_and_wait(COORD_ID) { PhaserOutcome::Advanced => {} PhaserOutcome::Shutdown => break 'outer, PhaserOutcome::Timeout => { tracing::warn!("coordinator: phaser B1 timeout; shutting down"); internal_shutdown.store(true, Ordering::Release); phaser.shutdown(); break 'outer; } } // Workers process slots concurrently. Idle-slot workers // skip B2 (the coordinator does NOT skip on their behalf — // see the race analysis in the Step 05 memo). // B2: wait for all workers to finish. match phaser.arrive_and_wait(COORD_ID) { PhaserOutcome::Advanced => {} PhaserOutcome::Shutdown => break 'outer, PhaserOutcome::Timeout => { tracing::warn!("coordinator: phaser B2 timeout; shutting down"); internal_shutdown.store(true, Ordering::Release); phaser.shutdown(); break 'outer; } } // All workers quiescent. Reacquire lock for post-round. let post_outcome = { let mut guard = kernel_arc.lock().expect("kernel mutex poisoned"); let r = { let s = stats_mtx.lock().expect("stats mutex poisoned"); coord_post_round(&mut *guard, mem, &*s, instrs_at_round_start) }; drop(guard); r }; match post_outcome { RoundCtl::BreakOuter => { internal_shutdown.store(true, Ordering::Release); phaser.shutdown(); break 'outer; } RoundCtl::Continue => {} } } }); // <- thread::scope joins all workers here. stats_mtx.into_inner().expect("stats mutex poisoned") } /// First-Pixels M2 — inject a queued graphics interrupt into HW thread 0 /// when it's safe to do so (callback registered, no interrupt already /// running). Called at the top of each scheduler round. /// /// Unlike the earlier P6 version which only delivered when HW 0 was /// `Ready`, this one also delivers when HW 0 is `Blocked`: the injector /// stashes the block reason into the new `HwState::ServicingIrq(reason)` /// variant, flips the thread to that state so `round_schedule` runs it, /// and — on callback return to `LR_HALT_SENTINEL` — the restore path /// re-creates `Blocked(reason)`, unless a `wake()` during the callback /// (e.g. `KeSetEvent` → `wake_eligible_waiters`) flipped it to `Ready`, /// in which case the wait was resolved and we leave it. /// /// This is the fix that unblocks games (like Sylpheed) which gate their /// main loop on a v-sync callback signaling an event the main thread /// waits on. The earlier "only-when-Ready" policy dropped 397 of 399 /// observed v-syncs on a 1 B-instruction Sylpheed probe; now they /// actually get delivered. fn try_inject_graphics_interrupt(kernel: &mut xenia_kernel::KernelState) { use xenia_cpu::scheduler::HwState; if kernel.interrupts.is_in_callback() { return; } let Some(cb) = kernel.interrupts.callback else { // No callback registered; drain any pending entries (they // wouldn't have made it into the queue per `queue_interrupt`'s // own `callback.is_none()` guard, but be defensive). kernel.interrupts.pending.clear(); return; }; let Some(source) = kernel.interrupts.peek_next() else { return; }; // Canary's `EmulateCPInterruptDPC` (kernel_state.cc:1373) dispatches on // whatever the current thread happens to be — real hardware fires the // interrupt on CPU 2 and the kernel impersonates a DPC on top of // whichever thread is active. Hard-anchoring to HW 0 breaks the moment // `main()` returns: Sylpheed's main thread exits right after init, the // render worker spins on a `PKEVENT` inside the interrupt callback's // user_data struct (`user_data + 0x5C`), and because HW 0 is now // `Exited(_)` our injector drops every subsequent vsync — the PKEVENT // is never signaled and the worker polls forever. // // Pick the first HW thread we can plausibly run the callback on: // 1. Prefer `Ready` (no state-mangling needed) // 2. Else take a `Blocked(reason)` thread and swap to // `ServicingIrq(reason)` so the round scheduler runs it; the // LR-sentinel restore path reinstates the block on callback return // 3. Skip `Idle`, `Exited`, or already-`ServicingIrq` slots // // The callback itself just signals a game-side event and returns — it // doesn't care which HW thread it ran on. // Pass 1: find any Ready thread across all slots. let mut victim: Option = None; 'outer_ready: for (hw_id, slot) in kernel.scheduler.slots.iter().enumerate() { for (idx, t) in slot.runqueue.iter().enumerate() { if matches!(t.state, HwState::Ready) { victim = Some(xenia_cpu::ThreadRef::new(hw_id as u8, idx as u16)); break 'outer_ready; } } } // Pass 2: any Blocked thread (we'll flip it to ServicingIrq). if victim.is_none() { 'outer_blocked: for (hw_id, slot) in kernel.scheduler.slots.iter().enumerate() { for (idx, t) in slot.runqueue.iter().enumerate() { if matches!(t.state, HwState::Blocked(_)) { victim = Some(xenia_cpu::ThreadRef::new(hw_id as u8, idx as u16)); break 'outer_blocked; } } } } let Some(target_ref) = victim else { // All threads Idle/Exited/already servicing — nothing to inject on. kernel.interrupts.take_next(); kernel.interrupts.dropped += 1; return; }; let t = kernel.scheduler.thread_mut(target_ref); let prev_state = t.state.clone(); match prev_state { HwState::Ready => {} HwState::Blocked(reason) => { t.state = HwState::ServicingIrq(reason); } _ => unreachable!("victim selection above filtered out other variants"), } let _ = kernel.interrupts.take_next(); let t = kernel.scheduler.thread_mut(target_ref); let saved = xenia_kernel::SavedCallbackCtx::capture(&t.ctx, source); kernel.interrupts.injected_ref = Some(target_ref); t.ctx.pc = cb.callback_pc; t.ctx.lr = xenia_cpu::context::LR_HALT_SENTINEL; // Canary `Processor::Execute` decrements the guest SP by 176 before // running the callback and restores on return (see Canary // processor.cc:383). Without this pad the callback's // `__savegprlr_N` prologue stomps the interrupted function's // already-saved LR at [r1-8], so when the interrupted function // later returns via `__restgprlr_N -> bclr` it jumps to // `LR_HALT_SENTINEL` and the thread exits prematurely. Matching // restore lives in `SavedCallbackCtx::restore` (which now also // restores r1). t.ctx.gpr[1] = t .ctx .gpr[1] .wrapping_sub(xenia_kernel::interrupts::CALLBACK_STACK_PAD as u64); t.ctx.gpr[3] = source as u64; t.ctx.gpr[4] = cb.user_data as u64; kernel.interrupts.saved = Some(saved); metrics::counter!("gpu.interrupt.delivered", "source" => format!("{source}")) .increment(1); tracing::debug!( source, hw_id = target_ref.hw_id, idx = target_ref.idx, callback = format_args!("{:#010x}", cb.callback_pc), "graphics interrupt: injecting" ); } /// APUBUG-PRODUCER-001 — inject a pending XAudio buffer-complete callback /// into the next available HW thread. Mirrors /// [`try_inject_graphics_interrupt`] (same victim-selection policy, same /// SP-pad, same saved-context restore-on-sentinel) but pulls the callback /// PC + r3 argument from a registered [`xenia_kernel::xaudio::XAudioClient`] /// instead of the graphics-callback registration. /// /// Mutual exclusion with the graphics path is via the shared /// `interrupts.saved` slot — if a graphics callback is already in flight, /// `is_in_callback()` returns true and we bail until it returns to the /// `LR_HALT_SENTINEL`. fn try_inject_audio_callback(kernel: &mut xenia_kernel::KernelState) { use xenia_cpu::scheduler::HwState; if kernel.interrupts.is_in_callback() { return; } let Some(index) = kernel.xaudio.peek_next() else { return; }; let Some(client) = kernel.xaudio.get(index) else { let _ = kernel.xaudio.take_next(); kernel.xaudio.dropped += 1; return; }; let mut victim: Option = None; 'outer_ready: for (hw_id, slot) in kernel.scheduler.slots.iter().enumerate() { for (idx, t) in slot.runqueue.iter().enumerate() { if matches!(t.state, HwState::Ready) { victim = Some(xenia_cpu::ThreadRef::new(hw_id as u8, idx as u16)); break 'outer_ready; } } } if victim.is_none() { 'outer_blocked: for (hw_id, slot) in kernel.scheduler.slots.iter().enumerate() { for (idx, t) in slot.runqueue.iter().enumerate() { if matches!(t.state, HwState::Blocked(_)) { victim = Some(xenia_cpu::ThreadRef::new(hw_id as u8, idx as u16)); break 'outer_blocked; } } } } let Some(target_ref) = victim else { let _ = kernel.xaudio.take_next(); kernel.xaudio.dropped += 1; return; }; let t = kernel.scheduler.thread_mut(target_ref); let prev_state = t.state.clone(); match prev_state { HwState::Ready => {} HwState::Blocked(reason) => { t.state = HwState::ServicingIrq(reason); } _ => unreachable!("victim selection above filtered out other variants"), } let _ = kernel.xaudio.take_next(); let t = kernel.scheduler.thread_mut(target_ref); let saved = xenia_kernel::SavedCallbackCtx::capture( &t.ctx, xenia_kernel::INTERRUPT_SOURCE_AUDIO, ); kernel.interrupts.injected_ref = Some(target_ref); t.ctx.pc = client.callback_pc; t.ctx.lr = xenia_cpu::context::LR_HALT_SENTINEL; t.ctx.gpr[1] = t .ctx .gpr[1] .wrapping_sub(xenia_kernel::interrupts::CALLBACK_STACK_PAD as u64); t.ctx.gpr[3] = client.wrapped_callback_arg as u64; kernel.interrupts.saved = Some(saved); kernel.xaudio.delivered += 1; metrics::counter!("xaudio.callback.delivered").increment(1); tracing::debug!( index, hw_id = target_ref.hw_id, idx = target_ref.idx, callback = format_args!("{:#010x}", client.callback_pc), wrapped = format_args!("{:#010x}", client.wrapped_callback_arg), "xaudio callback: injecting" ); } fn print_summary( ctx: &xenia_cpu::PpcContext, debugger: &xenia_debugger::Debugger, db_writer: &Option, quiet: bool, ) { if !quiet { println!("\n=== Final State ==="); println!("PC: {:#010x}", ctx.pc); println!("LR: {:#010x}", ctx.lr as u32); println!("CTR: {:#010x}", ctx.ctr as u32); println!("CR: {:#010x}", ctx.cr()); println!("XER: CA={} OV={} SO={}", ctx.xer_ca, ctx.xer_ov, ctx.xer_so); for i in 0..32 { if ctx.gpr[i] != 0 { println!("r{:<2}: {:#018x}", i, ctx.gpr[i]); } } } if db_writer.is_none() { info!(entries = debugger.trace_log.len(), "in-memory trace log"); } } /// Dump a per-thread + per-handle diagnostic at end-of-run. Lets the next /// session see *which* thread is stuck *where* without having to re-derive /// it from ring-buffer / event-count math. Mirrors the halt-on-deadlock /// dump in `run_execution` but runs on the normal `-n` exit path too. fn dump_thread_diagnostic( kernel: &xenia_kernel::KernelState, mem: &xenia_memory::GuestMemory, quiet: bool, ) { if let Some((base, len, ref path)) = kernel.dump_section { let mut buf = vec![0u8; len as usize]; const CHUNK: u32 = 4096; let mut off: u32 = 0; let mut committed_pages = 0u32; while off < len { let take = CHUNK.min(len - off); let addr = base.wrapping_add(off); if mem.is_mapped(addr) { let s = off as usize; let e = s + take as usize; mem.read_bulk(addr, &mut buf[s..e]); committed_pages += 1; } off += take; } match std::fs::write(path, &buf) { Ok(()) => eprintln!( "dump-section: wrote {} bytes from {:#010x} ({} committed pages) to {}", buf.len(), base, committed_pages, path.display(), ), Err(e) => eprintln!( "dump-section: failed to write {}: {e}", path.display(), ), } } if quiet { return; } use xenia_kernel::objects::KernelObject; println!("\n=== Thread diagnostics ==="); for (hw_id, slot) in kernel.scheduler.slots.iter().enumerate() { if slot.runqueue.is_empty() { println!(" hw={} (empty)", hw_id); continue; } for (idx, t) in slot.runqueue.iter().enumerate() { println!( " hw={} idx={} tid={} state={:?} pc={:#010x} lr={:#010x} sp={:#010x}", hw_id, idx, t.tid, t.state, t.ctx.pc, t.ctx.lr as u32, t.ctx.gpr[1] as u32, ); println!( " r0={:#010x} r3={:#010x} r4={:#010x} r5={:#010x} r6={:#010x} r7={:#010x}", t.ctx.gpr[0] as u32, t.ctx.gpr[3] as u32, t.ctx.gpr[4] as u32, t.ctx.gpr[5] as u32, t.ctx.gpr[6] as u32, t.ctx.gpr[7] as u32, ); println!( " r8={:#010x} r9={:#010x} r10={:#010x} r11={:#010x} r12={:#010x} r13={:#010x}", t.ctx.gpr[8] as u32, t.ctx.gpr[9] as u32, t.ctx.gpr[10] as u32, t.ctx.gpr[11] as u32, t.ctx.gpr[12] as u32, t.ctx.gpr[13] as u32, ); } } let tid_of = |r: &xenia_cpu::ThreadRef| -> u32 { kernel .scheduler .slots .get(r.hw_id as usize) .and_then(|s| s.runqueue.get(r.idx as usize)) .map(|t| t.tid) .unwrap_or(0) }; let mut any_obj = false; for (h, obj) in kernel.objects.iter() { let (kind, waiters) = match obj { KernelObject::Event { signaled, manual_reset, waiters } => { (format!("Event(sig={}, mr={})", signaled, manual_reset), waiters) } KernelObject::Semaphore { count, max, waiters } => { (format!("Semaphore({}/{})", count, max), waiters) } KernelObject::Thread { id, exit_code, waiters, .. } => { (format!("Thread(id={}, exit={:?})", id, exit_code), waiters) } KernelObject::Timer { signaled, deadline, waiters, .. } => { (format!("Timer(sig={}, deadline={:?})", signaled, deadline), waiters) } KernelObject::Mutex { owner, recursion, waiters } => { (format!("Mutex(owner={:?}, rec={})", owner, recursion), waiters) } KernelObject::NotifyListener { mask, queue, waiters, .. } => { (format!("NotifyListener(mask={:#x}, pending={})", mask, queue.len()), waiters) } KernelObject::File { .. } => continue, }; if waiters.is_empty() { continue; } if !any_obj { println!("\n -- Handle waiter lists --"); any_obj = true; } let tids: Vec = waiters.iter().map(&tid_of).collect(); println!(" handle={:#010x} {} waiters(tid)={:?}", h, kind, tids); } if !kernel.cs_waiters.is_empty() { let mut any_cs = false; for (cs_ptr, waiters) in kernel.cs_waiters.iter() { if waiters.is_empty() { continue; } if !any_cs { println!("\n -- Critical section waiters --"); any_cs = true; } let tids: Vec = waiters.iter().map(&tid_of).collect(); println!(" cs={:#010x} waiters(tid)={:?}", cs_ptr, tids); } } // Audit trails (only when --trace-handles flipped the flag). For each // tracked handle, emit a compact block: kind, creator, and the bounded // ring of signal/wait/wake events. Highlight handles with waiters and // zero signals — the smoking gun for a missing kernel API. if kernel.audit.enabled { println!("\n=== Handle audit trails ==="); // Suspect set from project memory: handles stuck on Sylpheed boot. const SUSPECT: [u32; 5] = [0x10FC, 0x1014, 0x1104, 0x10DC, 0x10F0]; // Sort by handle for stable output. let mut handles: Vec = kernel.audit.trails.keys().copied().collect(); handles.sort(); for h in handles { let trail = &kernel.audit.trails[&h]; let waiter_count = kernel .objects .get(&h) .and_then(|o| match o { KernelObject::Event { waiters, .. } | KernelObject::Semaphore { waiters, .. } | KernelObject::Thread { waiters, .. } | KernelObject::Timer { waiters, .. } | KernelObject::Mutex { waiters, .. } | KernelObject::NotifyListener { waiters, .. } => Some(waiters.len()), KernelObject::File { .. } => None, }) .unwrap_or(0); let suspect = SUSPECT.contains(&h); let interesting = waiter_count > 0 || trail.signals.is_empty() && !trail.waits.is_empty() || suspect; if !interesting { continue; } let marker = if suspect { " " } else { "" }; let smoking = if !trail.waits.is_empty() && trail.signals.is_empty() { " " } else { "" }; println!( " handle={:#010x} kind={} waiters={} signals={} waits={} wakes={}{}{}", h, trail.kind, waiter_count, trail.signals.len(), trail.waits.len(), trail.wakes.len(), marker, smoking, ); println!( " created cycle={} tid={} lr={:#010x} src={}", trail.created.cycle, trail.created.tid, trail.created.lr, trail.created.source, ); // Print the first few of each ring — full dump can be re-run with // a larger AUDIT_RING_CAPACITY if needed. let head = 4usize; for (label, ring) in [ ("waits", &trail.waits), ("signals", &trail.signals), ("wakes", &trail.wakes), ] { if ring.is_empty() { continue; } println!(" {} ({}):", label, ring.len()); for (i, e) in ring.iter().take(head).enumerate() { println!( " [{:>2}] cycle={} tid={} lr={:#010x} src={} aux={:#x}", i, e.cycle, e.tid, e.lr, e.source, e.aux, ); } if ring.len() > head { println!(" ... ({} more)", ring.len() - head); } } } // KRNBUG-AUDIT-001: per-handle FOCUS report. For each handle in // `--trace-handles-focus`, emit a unified diagnosis: how many // signal attempts (primary + ghost) we saw, classified by // producer-class (GuestExport vs KernelInternal vs // InterruptCallback), so we can deterministically distinguish: // - "guest never tried" → producer is a missing kernel signal source // - "signal landed but waiter wasn't woken" → wake-eligibility bug if !kernel.audit.focus.is_empty() { println!("\n=== Handle audit (focus) ==="); let mut focus: Vec = kernel.audit.focus.iter().copied().collect(); focus.sort(); for h in focus { let waiter_count = kernel .objects .get(&h) .and_then(|o| match o { KernelObject::Event { waiters, .. } | KernelObject::Semaphore { waiters, .. } | KernelObject::Thread { waiters, .. } | KernelObject::Timer { waiters, .. } | KernelObject::Mutex { waiters, .. } | KernelObject::NotifyListener { waiters, .. } => Some(waiters.len()), KernelObject::File { .. } => None, }) .unwrap_or(0); let signaled = kernel .objects .get(&h) .and_then(|o| match o { KernelObject::Event { signaled, .. } => Some(*signaled), _ => None, }); let primary = kernel.audit.trails.get(&h); let ghost = kernel.audit.ghost_trails.get(&h); let kind_str = primary.map(|t| t.kind).unwrap_or(""); let primary_signals = primary.map(|t| t.signals.len()).unwrap_or(0); let ghost_signals = ghost.map(|g| g.signals.len()).unwrap_or(0); let total_signals = primary_signals + ghost_signals; let primary_waits = primary.map(|t| t.waits.len()).unwrap_or(0); let primary_wakes = primary.map(|t| t.wakes.len()).unwrap_or(0); let audit_blind = waiter_count > 0 && primary_waits == 0 && primary_wakes == 0; println!( " handle={:#010x} kind={} waiters={} signaled={} signal_attempts={} (primary={}, ghost={}) waits={} wakes={}{}", h, kind_str, waiter_count, signaled .map(|s| if s { "true" } else { "false" }) .unwrap_or("?"), total_signals, primary_signals, ghost_signals, primary_waits, primary_wakes, if audit_blind { " " } else { "" }, ); if let Some(t) = primary { println!( " created cycle={} tid={} lr={:#010x} src={}", t.created.cycle, t.created.tid, t.created.lr, t.created.source, ); if !t.created_stack.is_empty() { println!(" created stack ({} frames):", t.created_stack.len()); for (i, (fp, lr)) in t.created_stack.iter().enumerate() { println!( " [{:>2}] fp={:#010x} lr={:#010x}", i, fp, lr, ); } } if !t.created_class_probes.is_empty() { println!( " created-class probes ({}):", t.created_class_probes.len(), ); for line in &t.created_class_probes { println!(" {}", line); } } } // Producer-class classification. Source strings are stable // labels passed to `record_signal` at the export sites. // Anything not in the GuestExport set is treated as // KernelInternal (e.g. signal_io_completion_event). let guest_export_sources = [ "KeSetEvent", "NtSetEvent", "KePulseEvent", "NtPulseEvent", "KeReleaseSemaphore", "NtReleaseSemaphore", "NtSignalAndWaitForSingleObjectEx", ]; let mut guest_export = 0usize; let mut kernel_internal = 0usize; let count_signals = |signals: &std::collections::VecDeque< xenia_kernel::audit::HandleAuditEntry, >, guest_export: &mut usize, kernel_internal: &mut usize| { for e in signals { if guest_export_sources.contains(&e.source) { *guest_export += 1; } else { *kernel_internal += 1; } } }; if let Some(p) = primary { count_signals(&p.signals, &mut guest_export, &mut kernel_internal); } if let Some(g) = ghost { count_signals(&g.signals, &mut guest_export, &mut kernel_internal); } // Compact merged timeline (cycle-sorted) of waits + signals. let mut tl: Vec<(u64, &str, &xenia_kernel::audit::HandleAuditEntry)> = Vec::new(); if let Some(p) = primary { for e in &p.waits { tl.push((e.cycle, "wait", e)); } for e in &p.signals { tl.push((e.cycle, "signal", e)); } } if let Some(g) = ghost { for e in &g.signals { tl.push((e.cycle, "signal[ghost]", e)); } } tl.sort_by_key(|(c, _, _)| *c); if !tl.is_empty() { println!(" timeline (last 16 of {}):", tl.len()); let start = tl.len().saturating_sub(16); for (cycle, label, e) in &tl[start..] { println!( " cycle={} tid={} lr={:#010x} src={}[{}] aux={:#x}", cycle, e.tid, e.lr, e.source, label, e.aux, ); } } // -- DIAGNOSIS -- println!(" -- DIAGNOSIS --"); println!( " GuestExport={} KernelInternal={} waits={}", guest_export, kernel_internal, primary_waits, ); let conclusion = if total_signals == 0 && audit_blind { "waiter parked via non-audited path (CS / spinlock / DPC / sync-primitive helper) and no signals — instrument that wait path or check ObReferenceObjectByHandle plumbing" } else if total_signals == 0 && primary_waits > 0 { "producer is a missing kernel signal source (or BST-paradox upstream)" } else if total_signals == 0 && primary_waits == 0 { "no waits and no signals — handle may not be reached during this run" } else if guest_export > 0 && waiter_count > 0 { "signals landed but waiter still parked — wake-eligibility / mr-bit bug" } else if kernel_internal > 0 && waiter_count > 0 { "kernel-internal signal landed but waiter still parked — wake or routing bug" } else { "not stuck — signals consumed correctly" }; println!(" => {}", conclusion); // KRNBUG-AUDIT-003. For each thread parked on this focus // handle, walk its back-chain and probe `this` candidates // for an MSVC C++ class name via vtable[-4] → COL → TD. // Frame 0 uses the live r31/r30/r3 of the parked thread; // deeper frames recover saved r31/r30 from the standard // PPC prologue spill area at [fp - 12] / [fp - 16]. This // names the dispatcher / queue / pool that owns each // parked thread, converting "who should signal handle X" // into "who should call Class::Submit but doesn't." if let Some(obj) = kernel.objects.get(&h) { let waiters: Vec = match obj { KernelObject::Event { waiters, .. } | KernelObject::Semaphore { waiters, .. } | KernelObject::Timer { waiters, .. } | KernelObject::Thread { waiters, .. } | KernelObject::Mutex { waiters, .. } | KernelObject::NotifyListener { waiters, .. } => waiters.iter().copied().collect(), KernelObject::File { .. } => Vec::new(), }; for w in waiters { let Some(slot) = kernel.scheduler.slots.get(w.hw_id as usize) else { continue; }; let Some(thread) = slot.runqueue.get(w.idx as usize) else { continue; }; let ctx = &thread.ctx; let frames = xenia_kernel::state::walk_guest_back_chain( ctx.gpr[1] as u32, ctx.lr as u32, mem, 6, ); println!( " WAIT-THREAD tid={} pc={:#010x} lr={:#010x} sp={:#010x} r3={:#010x} r30={:#010x} r31={:#010x}", thread.tid, ctx.pc, ctx.lr as u32, ctx.gpr[1] as u32, ctx.gpr[3] as u32, ctx.gpr[30] as u32, ctx.gpr[31] as u32, ); for (frame_idx, (fp, frame_lr)) in frames.iter().enumerate() { let saved_r31 = mem.read_u32(fp.wrapping_sub(12)); let saved_r30 = mem.read_u32(fp.wrapping_sub(16)); let saved_r29 = mem.read_u32(fp.wrapping_sub(20)); let saved_r28 = mem.read_u32(fp.wrapping_sub(24)); println!( " FRAME {} fp={:#010x} lr={:#010x} saved r31={:#010x} r30={:#010x} r29={:#010x} r28={:#010x}", frame_idx, fp, frame_lr, saved_r31, saved_r30, saved_r29, saved_r28, ); let candidates: Vec<(u32, &'static str)> = if frame_idx == 0 { vec![ (ctx.gpr[31] as u32, "r31"), (ctx.gpr[30] as u32, "r30"), (ctx.gpr[29] as u32, "r29"), (ctx.gpr[28] as u32, "r28"), (ctx.gpr[3] as u32, "r3"), ] } else { vec![ (saved_r31, "saved-r31"), (saved_r30, "saved-r30"), (saved_r29, "saved-r29"), (saved_r28, "saved-r28"), ] }; for (this_ptr, label) in candidates { use xenia_kernel::state::ClassReadout; match xenia_kernel::state::read_class_at_this(this_ptr, mem) { ClassReadout::Named { vtable, mangled } => { println!( " WAIT-CLASS frame={} {}={:#010x} vtable={:#010x} class={}", frame_idx, label, this_ptr, vtable, mangled, ); } ClassReadout::VtableOnly { vtable, virtuals } => { println!( " WAIT-CLASS frame={} {}={:#010x} vtable={:#010x} virtuals=[{:#010x},{:#010x},{:#010x},{:#010x}] (RTTI stripped)", frame_idx, label, this_ptr, vtable, virtuals[0], virtuals[1], virtuals[2], virtuals[3], ); } ClassReadout::NotAnObject => {} } } } } } } } } // Diagnostic. `--dump-addr` content dump at end-of-run. Each // address gets a 64-byte dump in three forms (raw bytes, u32 // big-endian lanes, ASCII guess). The lanes form is gold for // pool-element / dispatcher struct reverse-engineering: at // [base+0]=-1 means "empty queue" sentinel from AUDIT-003, while // any image-range pointer `0x82xxxxxx` at a fixed offset hints // at an embedded silph::Event* / vtable / next-link. if !kernel.dump_addrs.is_empty() { println!("\n=== Dump-addr ==="); const DUMP_BYTES: usize = 128; for &addr in &kernel.dump_addrs { println!(" addr={:#010x}", addr); let mut bytes = [0u8; DUMP_BYTES]; for i in 0..DUMP_BYTES { bytes[i] = mem.read_u8(addr.wrapping_add(i as u32)); } for row in 0..(DUMP_BYTES / 16) { let off = row * 16; let hex: String = (0..16) .map(|i| format!("{:02x}", bytes[off + i])) .collect::>() .join(" "); let words: String = (0..4) .map(|i| { let b = &bytes[off + i * 4..off + i * 4 + 4]; format!( "{:08x}", u32::from_be_bytes([b[0], b[1], b[2], b[3]]) ) }) .collect::>() .join(" "); let ascii: String = (0..16) .map(|i| { let b = bytes[off + i]; if (0x20..=0x7E).contains(&b) { b as char } else { '.' } }) .collect(); println!( " +{:#04x}: {} | be32={} | {}", off, hex, words, ascii, ); } } } } #[allow(clippy::too_many_arguments)] #[instrument(skip_all, fields(title))] fn run_with_ui( title: &str, mut mem: xenia_memory::GuestMemory, mut kernel: xenia_kernel::KernelState, mut debugger: xenia_debugger::Debugger, thunk_map: HashMap, mut db_writer: Option, max_instructions: Option, ips_limit: Option, quiet: bool, halt_on_deadlock: bool, started: Instant, ) -> Result<()> { use winit::event_loop::EventLoop; let event_loop = EventLoop::::with_user_event() .build() .map_err(|e| anyhow::anyhow!("winit event loop build failed: {e}"))?; let (ui_handles, kernel_bridge) = xenia_ui::build(event_loop.create_proxy()); kernel.ui = Some(kernel_bridge); let shutdown = std::sync::Arc::clone(&ui_handles.shutdown); let title_owned = std::path::Path::new(title) .file_name() .and_then(|n| n.to_str()) .unwrap_or(title) .to_string(); let worker_span = tracing::info_span!("cpu_worker"); let worker = std::thread::Builder::new() .name("xenia-cpu".into()) .spawn(move || -> Result<(ExecStats, xenia_memory::GuestMemory, xenia_kernel::KernelState, xenia_debugger::Debugger, Option)> { let _guard = worker_span.enter(); let stats = run_execution( &mut mem, &mut kernel, &mut debugger, &thunk_map, db_writer.as_mut(), max_instructions, ips_limit, quiet, halt_on_deadlock, Some(shutdown), ); if let Some(ref mut db) = db_writer { db.finalize_traces()?; } Ok((stats, mem, kernel, debugger, db_writer)) }) .map_err(|e| anyhow::anyhow!("spawn CPU worker: {e}"))?; // Run the UI — blocks until the user closes the window or the worker // flips the shutdown flag itself (e.g. after max_instructions). xenia_ui::run(event_loop, ui_handles, &title_owned)?; let (stats, mem, kernel, debugger, db_writer) = match worker.join() { Ok(res) => res?, Err(_) => { return Err(anyhow::anyhow!("CPU worker thread panicked")); } }; print_summary(kernel.scheduler.ctx(0), &debugger, &db_writer, quiet); dump_thread_diagnostic(&kernel, &mem, quiet); info!( wall_ms = started.elapsed().as_millis() as u64, instructions = stats.instruction_count, import_calls = stats.import_count, unimplemented = stats.unimpl_count, "exec --ui complete" ); Ok(()) } #[instrument(skip_all, fields(path = %path))] fn cmd_browse(path: &str) -> Result<()> { use xenia_vfs::VfsDevice; let disc = xenia_vfs::disc_image::DiscImageDevice::open("disc", std::path::Path::new(path)) .map_err(|e| anyhow::anyhow!("Failed to open disc image: {}", e))?; println!("=== XISO Contents: {} ===", path); match disc.list_root() { Ok(entries) => { for entry in entries { let kind = if entry.is_directory { "DIR " } else { "FILE" }; println!(" {} {:>10} {}", kind, entry.size, entry.name); } } Err(e) => tracing::error!(%e, "error listing contents"), } Ok(()) } /// Helper: load XEX, parse header, decompress PE, resolve imports, parse sections. #[instrument(skip_all, fields(path = %path))] fn load_and_prepare(path: &str) -> Result<(xenia_xex::Xex2Header, Vec, Vec)> { let data = load_xex_data(path)?; let mut header = xenia_xex::loader::parse_xex2_header(&data)?; let entry = xenia_xex::loader::get_entry_point(&header) .ok_or_else(|| anyhow::anyhow!("No entry point found in XEX2 header"))?; let base = xenia_xex::loader::get_image_base(&header) .ok_or_else(|| anyhow::anyhow!("No image base found in XEX2 header"))?; info!( entry = format_args!("{:#010x}", entry), base = format_args!("{:#010x}", base), "XEX entry/base" ); let pe_image = xenia_xex::loader::load_image(&data, &header)?; info!(bytes = pe_image.len(), "image decompressed"); // Resolve import ordinals and record types from the PE image xenia_xex::loader::resolve_imports(&mut header, &pe_image); // Parse PE sections let sections = xenia_xex::pe::parse_sections(&pe_image)?; info!(sections = sections.len(), "parsed PE sections"); Ok((header, pe_image, sections)) } #[instrument(skip_all, fields(path = %path))] fn cmd_extract(path: &str, output_dir: Option<&str>, db_path: Option<&str>) -> Result<()> { use serde::Serialize; let (header, pe_image, sections) = load_and_prepare(path)?; let entry = xenia_xex::loader::get_entry_point(&header).unwrap(); let base = xenia_xex::loader::get_image_base(&header).unwrap(); let image_size = header.security_info.as_ref().map(|s| s.image_size).unwrap_or(0); // Build JSON-serializable info struct #[derive(Serialize)] struct Xex2Info<'a> { module_flags: u32, image_base: u32, entry_point: u32, image_size: u32, original_pe_name: Option<&'a str>, execution_info: &'a Option, import_libraries: &'a [xenia_xex::header::ImportLibrary], sections: &'a [xenia_xex::pe::PeSection], } let info = Xex2Info { module_flags: header.module_flags, image_base: base, entry_point: entry, image_size, original_pe_name: header.original_pe_name.as_deref(), execution_info: &header.execution_info, import_libraries: &header.import_libraries, sections: §ions, }; // Determine output directory let input_path = std::path::Path::new(path); let out_dir = match output_dir { Some(d) => std::path::PathBuf::from(d), None => input_path.parent().unwrap_or(std::path::Path::new(".")).to_path_buf(), }; std::fs::create_dir_all(&out_dir)?; let stem = input_path.file_stem() .and_then(|s| s.to_str()) .unwrap_or("output"); // Write PE image let pe_path = out_dir.join(format!("{stem}.pe")); std::fs::write(&pe_path, &pe_image)?; info!( path = %pe_path.display(), bytes = pe_image.len(), "wrote PE image" ); // Write JSON metadata let json_path = out_dir.join(format!("{stem}.xex.json")); let json = serde_json::to_string_pretty(&info)?; std::fs::write(&json_path, &json)?; info!(path = %json_path.display(), "wrote metadata JSON"); // Print summary let total_imports: usize = header.import_libraries.iter().map(|l| l.imports.len()).sum(); println!("Extracted: {} sections, {} import libraries ({} imports)", sections.len(), header.import_libraries.len(), total_imports); if let Some(ref ei) = header.execution_info { println!("Title ID: 0x{:08X} Media ID: 0x{:08X}", ei.title_id, ei.media_id); } // Write base tables to SQLite if requested if let Some(db) = db_path { let disasm_info = xenia_analysis::formatter::DisasmInfo { image_base: base, entry_point: entry, original_pe_name: header.original_pe_name.as_deref(), title_id: header.execution_info.as_ref().map(|e| e.title_id), media_id: header.execution_info.as_ref().map(|e| e.media_id), sections: §ions, import_libraries: &header.import_libraries, }; info!(db = %db, "writing base tables"); let mut w = xenia_analysis::DbWriter::open_fresh(std::path::Path::new(db))?; w.write_base(&disasm_info)?; info!(db = %db, "database written"); } Ok(()) } #[instrument(skip_all, fields(path = %path))] fn cmd_dis( path: &str, output: Option<&str>, db_path: Option<&str>, json_path: Option<&str>, analyze: AnalyzeMode, quiet: bool, ) -> Result<()> { use std::collections::HashMap; let started = Instant::now(); let (header, pe_image, sections) = load_and_prepare(path)?; let entry = xenia_xex::loader::get_entry_point(&header).unwrap(); let base = xenia_xex::loader::get_image_base(&header).unwrap(); // Build import address -> name map let mut import_map: HashMap = HashMap::new(); for lib in &header.import_libraries { for imp in &lib.imports { let resolved = xenia_analysis::resolve_ordinal(&lib.name, imp.ordinal); let name = match resolved { Some(n) => format!("{}::{}", lib.name, n), None => format!("{}::ordinal_{:#06X}", lib.name, imp.ordinal), }; import_map.insert(imp.address, name); } } info!(thunks = import_map.len(), "resolved import thunks"); // Function analysis let code_sections: Vec<(u32, u32, u32)> = sections.iter() .filter(|s| s.is_code()) .map(|s| (s.virtual_address, s.virtual_size, s.flags)) .collect(); let func_analysis = xenia_analysis::func::analyze(&pe_image, base, entry, &code_sections); info!(functions = func_analysis.functions.len(), "function detection complete"); // Cross-reference analysis let xref_result = xenia_analysis::xref::analyze_xrefs( &pe_image, base, entry, §ions, &func_analysis, &import_map, ); let total_xrefs: usize = xref_result.xrefs.values().map(|v| v.len()).sum(); info!( labels = xref_result.labels.len(), xrefs = total_xrefs, "xref analysis complete" ); // Build DisasmInfo let disasm_info = xenia_analysis::formatter::DisasmInfo { image_base: base, entry_point: entry, original_pe_name: header.original_pe_name.as_deref(), title_id: header.execution_info.as_ref().map(|e| e.title_id), media_id: header.execution_info.as_ref().map(|e| e.media_id), sections: §ions, import_libraries: &header.import_libraries, }; // SQLite database output (base + ingest + analyze layers) if let Some(db) = db_path { info!(db = %db, analyze = ?analyze, "writing database"); let mut w = xenia_analysis::DbWriter::open_fresh(std::path::Path::new(db))?; w.write_base(&disasm_info)?; w.ingest_instructions(&pe_image, &disasm_info, &func_analysis, &xref_result.labels)?; w.write_analysis_results( &pe_image, &disasm_info, &func_analysis, &xref_result.labels, &xref_result.xrefs, )?; if matches!(analyze, AnalyzeMode::Sql | AnalyzeMode::Both) { w.create_sql_views()?; info!(db = %db, "SQL views created"); } if matches!(analyze, AnalyzeMode::Both) { let (sql_only, rust_only) = w.cross_check_branch_xrefs()?; if sql_only == 0 && rust_only == 0 { info!(db = %db, "Rust/SQL branch xrefs agree"); } else { tracing::warn!( db = %db, sql_only, rust_only, "Rust/SQL branch xref disagreement — investigate formatter mnemonic vs xref.rs kind classification" ); } } info!(db = %db, "database written"); } // JSON Lines output: one row per instruction, structured columns. if let Some(json) = json_path { info!(json = %json, "writing JSON Lines"); let mut out = std::io::BufWriter::new(std::fs::File::create(json)?); let mut total: u64 = 0; for section in §ions { if !section.is_code() { continue; } let abs_start = base + section.virtual_address; let abs_end = abs_start + section.virtual_size; let items = xenia_analysis::enrich_section( &pe_image, base, §ion.name, abs_start, abs_end, &func_analysis, &xref_result.labels, ); total += xenia_analysis::sinks::json::write_jsonl(&mut out, items)?; } info!(json = %json, rows = total, "JSON Lines written"); } // Assembly output (skipped when --quiet and no --output specified) if !quiet || output.is_some() { let mut out: Box = match output { Some(path) => Box::new(std::io::BufWriter::new(std::fs::File::create(path)?)), None => Box::new(std::io::BufWriter::new(std::io::stdout().lock())), }; xenia_analysis::formatter::write_asm( &mut *out, &pe_image, &disasm_info, &func_analysis, &xref_result.labels, &import_map, &xref_result.xrefs, &xref_result.data_annotations, )?; if let Some(path) = output { info!(path, "wrote disassembly"); } } info!(wall_ms = started.elapsed().as_millis() as u64, "dis complete"); Ok(()) } #[cfg(test)] mod tests { use super::parse_hex_u32; #[test] fn parse_hex_u32_accepts_0x_prefix() { assert_eq!(parse_hex_u32("0x824be9a0").unwrap(), 0x824be9a0); assert_eq!(parse_hex_u32("0X82000000").unwrap(), 0x82000000); } #[test] fn parse_hex_u32_accepts_bare_hex() { // No 0x prefix, contains hex letters — treated as hex. assert_eq!(parse_hex_u32("824be9a0").unwrap(), 0x824be9a0); } #[test] fn parse_hex_u32_accepts_decimal() { // All digits, no 0x — treated as decimal. assert_eq!(parse_hex_u32("1000").unwrap(), 1000); assert_eq!(parse_hex_u32("0").unwrap(), 0); } #[test] fn parse_hex_u32_rejects_garbage() { assert!(parse_hex_u32("not a number").is_err()); assert!(parse_hex_u32("0xZZZ").is_err()); } #[test] fn parse_hex_u32_tolerates_whitespace() { assert_eq!(parse_hex_u32(" 0x82000000 ").unwrap(), 0x82000000); } }