From 23189b95aff2b1b3d18f64a14714ee61c1eb528d Mon Sep 17 00:00:00 2001 From: MechaCat02 Date: Sun, 21 Jun 2026 21:38:19 +0200 Subject: [PATCH] [iterate-4A] Milestone-2: XMA audio decoder + RE tooling (dispatch recorder, analyzer vtable-fix, non-perturbing probes) Milestone-2 (intro video dat/movie/ADV.wmv) audio path + major RE tooling. XMA AUDIO (built, working, deterministic, tested): - APU MMIO 0x7FEA0000 + 320x64B register-mapped context array; real XMACreateContext/Release (xma.rs); real FFmpeg xma2 decoder XMA_CONTEXT_DATA->S16BE PCM (xma_decode.rs, xma2_codec.rs, ffmpeg-sys-next). Decode runs synchronously on the CPU thread (deterministic, no host thread). - Audio-worker scheduler fix (main.rs LR_HALT restore + scheduler.rs): the XAudio render-callback worker was wrongly exited after ~2 deliveries; now survives -> guest drives XMA decode (70 kicks). - XAudioSubmitRenderDriverFrame made faithful. Golden sylpheed_n50m re-baselined; tests pass. RE TOOLING: - Runtime indirect-dispatch recorder (dispatch_rec.rs): records (call-site->target, r3, lr); env-gated XENIA_DISPATCH_REC, filters XENIA_DISPATCH_REC_TARGETS/_SITES; deterministic, observe-only. - Repaired static analyzer (vtables.rs): vtable extraction silently fragmented vtables with non-function head slots (missed the XMV engine vtable). Fixed via vptr-write-anchoring -> engine fully typed (vtables 722->1150 on rebuild). - Fixed probe HEISENBUG (main.rs run_superblock): --audit-pc-probe-hex/--mem-watch no longer disable superblock chaining; probes fire inside the chain loop -> scheduling identical armed-vs-unarmed, movie subsystem now observable. Fixed a --quiet bug swallowing armed trace reports. VIDEO still doesn't play (B, guest-side): the XMV engine never issues begin-playback (sub_825076F0, vtable 0x8200a1e8 slot21) -> never primes -> 2000ms timeout. Narrowed to the ARM2 engine-setup wrappers; no honest our-side gate-fix (masking forbidden). 
See HANDOFF-iterate-4A-milestone2.md for new-machine setup (incl. the FFmpeg apt deps + sylpheed.db regeneration) and continuation pointers. Co-Authored-By: Claude Opus 4.8 --- .gitignore | 4 + Cargo.lock | 101 ++ HANDOFF-iterate-4A-milestone2.md | 133 +++ crates/xenia-analysis/src/vtables.rs | 349 +++++++ crates/xenia-app/src/main.rs | 259 ++++- .../xenia-app/tests/golden/sylpheed_n50m.json | 8 +- crates/xenia-apu/Cargo.toml | 7 + crates/xenia-apu/src/lib.rs | 6 + crates/xenia-apu/src/xma.rs | 932 ++++++++++++++++++ crates/xenia-apu/src/xma2_codec.rs | 217 ++++ crates/xenia-apu/src/xma_decode.rs | 690 +++++++++++++ crates/xenia-cpu/src/dispatch_rec.rs | 217 ++++ crates/xenia-cpu/src/interpreter.rs | 8 +- crates/xenia-cpu/src/lib.rs | 1 + crates/xenia-cpu/src/scheduler.rs | 71 ++ crates/xenia-kernel/Cargo.toml | 1 + crates/xenia-kernel/src/exports.rs | 92 +- crates/xenia-kernel/src/state.rs | 8 + crates/xenia-kernel/src/xaudio.rs | 48 + 19 files changed, 3106 insertions(+), 46 deletions(-) create mode 100644 HANDOFF-iterate-4A-milestone2.md create mode 100644 crates/xenia-apu/src/xma.rs create mode 100644 crates/xenia-apu/src/xma2_codec.rs create mode 100644 crates/xenia-apu/src/xma_decode.rs create mode 100644 crates/xenia-cpu/src/dispatch_rec.rs diff --git a/.gitignore b/.gitignore index 740cb72..530be76 100644 --- a/.gitignore +++ b/.gitignore @@ -16,3 +16,7 @@ audit-*.md # working dir by the Wine canary build) vkd3d-proton.cache* *.dxvk-cache + +# local analysis-DB backups (regenerable; too large to track) +*.db.bak* +sylpheed.db.bak-* diff --git a/Cargo.lock b/Cargo.lock index cbd6c5e..a82b968 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -418,6 +418,26 @@ version = "0.22.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" +[[package]] +name = "bindgen" +version = "0.64.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"c4243e6031260db77ede97ad86c27e501d646a27ab57b59a574f725d98ab1fb4" +dependencies = [ + "bitflags 1.3.2", + "cexpr", + "clang-sys", + "lazy_static", + "lazycell", + "peeking_take_while", + "proc-macro2", + "quote", + "regex", + "rustc-hash 1.1.0", + "shlex", + "syn 1.0.109", +] + [[package]] name = "bit-set" version = "0.6.0" @@ -600,6 +620,15 @@ dependencies = [ "shlex", ] +[[package]] +name = "cexpr" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766" +dependencies = [ + "nom", +] + [[package]] name = "cfg-if" version = "1.0.4" @@ -639,6 +668,17 @@ dependencies = [ "inout", ] +[[package]] +name = "clang-sys" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4" +dependencies = [ + "glob", + "libc", + "libloading", +] + [[package]] name = "clap" version = "4.6.0" @@ -1076,6 +1116,20 @@ version = "2.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6" +[[package]] +name = "ffmpeg-sys-next" +version = "6.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2529ad916d08c3562c754c21bc9b17a26c7882c0f5706cc2cd69472175f1620" +dependencies = [ + "bindgen", + "cc", + "libc", + "num_cpus", + "pkg-config", + "vcpkg", +] + [[package]] name = "filetime" version = "0.2.27" @@ -1317,6 +1371,12 @@ dependencies = [ "xml-rs", ] +[[package]] +name = "glob" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280" + [[package]] name = "glow" version = "0.13.1" @@ -1898,6 +1958,12 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" +[[package]] +name = "lazycell" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" + [[package]] name = "lexical-core" version = "1.0.6" @@ -2139,6 +2205,12 @@ dependencies = [ "sketches-ddsketch", ] +[[package]] +name = "minimal-lexical" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" + [[package]] name = "miniz_oxide" version = "0.8.9" @@ -2262,6 +2334,16 @@ dependencies = [ "libc", ] +[[package]] +name = "nom" +version = "7.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" +dependencies = [ + "memchr", + "minimal-lexical", +] + [[package]] name = "nu-ansi-term" version = "0.50.3" @@ -2325,6 +2407,16 @@ dependencies = [ "libm", ] +[[package]] +name = "num_cpus" +version = "1.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91df4bbde75afed763b708b7eee1e8e7651e02d97f6d5dd763e89367e957b23b" +dependencies = [ + "hermit-abi", + "libc", +] + [[package]] name = "num_enum" version = "0.7.6" @@ -2657,6 +2749,12 @@ version = "1.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" +[[package]] +name = "peeking_take_while" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" + [[package]] name = "percent-encoding" version = "2.3.2" @@ -4961,8 +5059,10 @@ dependencies = [ name = "xenia-apu" version = "0.1.0" dependencies = [ + "ffmpeg-sys-next", "thiserror 2.0.18", "tracing", + "xenia-memory", "xenia-types", ] @@ -5025,6 +5125,7 @@ dependencies = [ 
"metrics", "thiserror 2.0.18", "tracing", + "xenia-apu", "xenia-cpu", "xenia-gpu", "xenia-hid", diff --git a/HANDOFF-iterate-4A-milestone2.md b/HANDOFF-iterate-4A-milestone2.md new file mode 100644 index 0000000..ad1ff84 --- /dev/null +++ b/HANDOFF-iterate-4A-milestone2.md @@ -0,0 +1,133 @@ +# Handoff — branch `iterate-4A/apu-xma-stage1` (Milestone 2: intro-video / XMA audio + RE tooling) + +Reverse-engineering Project Sylpheed under this Rust Xbox-360 emulator (`xenia-rs`), using Wine +xenia-canary as the ground-truth oracle. This branch carries **Milestone 2** work plus major +RE-tooling improvements, on top of the (uncommitted-until-now) Milestone-1 renderer history. + +> Method: first-divergence vs canary · fix causes not symptoms · NO faking/masking · measure the +> oracle, never infer · refute before believing · ground every claim in evidence. + +--- + +## 0. SET UP ON A NEW MACHINE (do this first) + +### a) FFmpeg system libraries — **REQUIRED to build** (crate `xenia-apu` links them via pkg-config) +The XMA audio decoder uses `ffmpeg-sys-next` (`crates/xenia-apu/Cargo.toml`: +`ffmpeg-sys-next = { version = "6.1", default-features = false, features = ["avcodec"] }`), +which links the **system** FFmpeg dev libraries. Install them: + +```bash +sudo apt update +sudo apt install -y libavcodec-dev libavformat-dev libavutil-dev libswresample-dev pkg-config ffmpeg +``` + +Verify the toolchain (the XMA path needs the `xma1`/`xma2` decoders — present in distro FFmpeg ≥ ~2015): +```bash +pkg-config --modversion libavcodec # expect 60.x (this branch built against 60.31) +ffmpeg -hide_banner -decoders | grep -iE 'xma1|xma2' # expect: A....D xma1 / A....D xma2 +``` +(Decoder note: distro FFmpeg has **no** `AV_CODEC_ID_XMAFRAMES`; we use `AV_CODEC_ID_XMA2` — see +`crates/xenia-apu/src/xma2_codec.rs`.) On non-Debian distros install the equivalent `-dev` packages. + +### b) The game ISO (gitignored — `*.iso`) +Not in the repo. 
Place the Project Sylpheed ISO somewhere and create a `sylpheed.iso` symlink to it +in the repo root (the run/test commands use `sylpheed.iso`): +```bash +ln -s "/path/to/Project Sylpheed - Arc of Deception (USA, Europe) (En,Ja).iso" sylpheed.iso +``` +⚠️ For **canary** runs, point at the REAL ISO path, not the symlink (Wine can't resolve the symlink). + +### c) Build — **always cap parallelism** (a default `-j` build OOM-crashed a 15 GB box) +```bash +export CARGO_BUILD_JOBS=4 # NEVER default -j12; check `free -h` first, drop to -j2 if <4GB free +cargo build --release +``` + +### d) Regenerate the static-analysis DB `sylpheed.db` (gitignored — `*.db`, ~586 MB, ~1h35m) +Used by the RE/analysis queries (NOT needed to run the emulator). Rebuild from the ISO: +```bash +cargo run --release -- dis "/path/to/" --db sylpheed.db +# analysis passes run in <1s; the ~1h35m is DuckDB persisting ~1.8M dispatch rows. Be patient. +``` +This branch's analyzer fix (see §3) makes the regenerated DB include the previously-missing XMV +engine vtables (`0x8200a1e8`/`0x8200a908`). A local pre-fix backup may exist as +`sylpheed.db.bak-pre-vtablefix` (gitignored, not pushed). + +--- + +## 1. WHAT'S ON THIS BRANCH (all in this one commit, on top of `acb29db` = iterate-3AL) +**Milestone-1 renderer history** (publisher/dev splash renders) is in the ancestry (iterate-2x → 3M → +3O → 3AL); pushing this branch carries it. **Milestone 2** + tooling added here: + +### ✅ XMA AUDIO path — BUILT, WORKING, deterministic, tested +- `crates/xenia-apu/src/xma.rs` — register-mapped XMA context system (MMIO `0x7FEA0000`, 320×64B + context array, Kick/Lock/Clear decode). `xma_decode.rs` + `xma2_codec.rs` — the real FFmpeg + `xma2` decoder (XMA_CONTEXT_DATA bitfields, BitStream packet parse, planar-f32→S16BE PCM). + Decode runs synchronously on the CPU thread (deterministic, no host thread). 
Wired via + `KernelState.xma` (`state.rs`), exports (`exports.rs`), `xaudio.rs` (`XAudioSubmitRenderDriverFrame` + made faithful), `main.rs` (MMIO install + per-round pump). +- **Audio-worker scheduler fix** (`main.rs` LR_HALT restore + `scheduler.rs`): the XAudio render + callback worker was wrongly exited after ~2 deliveries → fixed → the guest now drives XMA decode. +- Verified: real PCM out; golden `sylpheed_n50m` **re-baselined** (`crates/xenia-app/tests/golden/`) + and PASSES; milestone-1 splash intact; apu/cpu/kernel tests pass. + +### 🛠️ RE TOOLING (this branch's lasting wins) +- **Runtime dispatch-recorder** `crates/xenia-cpu/src/dispatch_rec.rs` — records `(call-site → target, + r3, lr)` for every indirect (`bcctr`-family) call. Off by default; enable with `XENIA_DISPATCH_REC=1`, + optional filters `XENIA_DISPATCH_REC_TARGETS=` / `_SITES=`, dumps to + `XENIA_DISPATCH_REC_OUT` (default `/tmp/dispatch_rec.txt`). Deterministic, observe-only. +- **Repaired static analyzer** `crates/xenia-analysis/src/vtables.rs` — the vtable extractor silently + **fragmented vtables with non-function head slots** (missed the XMV engine vtable entirely → + blocked ~6 investigations). Fixed via **vptr-write-anchoring** (find `addis/addi → stw rX,0(rThis)` + constant-vptr installs; read the fnptr run from each anchor). Result on rebuild: vtables 722→1150, + dispatch candidates 688K→1.83M, engine fully typed. (Requires the §0d DB rebuild to take effect.) +- **Probe Heisenbug FIXED** (`main.rs run_superblock`) — `--audit-pc-probe-hex` / `--mem-watch` used to + **disable superblock chaining**, which changed thread scheduling and *starved the movie subsystem* + so the probes couldn't observe it. Now probes fire *inside* the chain loop → scheduling is identical + armed-vs-unarmed (verified byte-identical golden) → the probe suite is finally usable on the movie + subsystem. Also fixed a `--quiet` bug that swallowed armed `--trace-handles`/`--dump-addr` reports. + +--- + +## 2. 
CURRENT STATE & WHERE TO CONTINUE (the video still doesn't play) +**Audio works; the intro VIDEO doesn't play yet.** Root, runtime-pinned: a 2000ms readiness timeout +(`sub_821B66B8`) abandons because the XMV engine (`0x40d101c0`, runtime vtable `0x8200a1e8`) never +**primes** — engine begin-playback `sub_825076F0` (slot 21) is **never dispatched** (0×), so the +per-frame full-start always takes its skip branch and the playback clock never starts. +- **Classification: (B) guest-side state machine.** The gate fields are the engine's *correct* reset + defaults → there is **NO honest our-side fix at the gate** (forcing them = masking, forbidden). The + defect is upstream: the guest SM reaches "create decoder (success)" but never issues begin-playback. +- **Latest narrowing (evidence, fixed probes):** ARM2-setup `sub_821B55D8` runs once, create-decoder + `sub_824F8398` succeeds, and ARM2 then calls engine-setup wrappers + **`sub_824F7778` / `sub_824F7630` / `sub_824F7558` / `sub_824F7538` / `sub_824FCB68`** (on + `[movie+104]`=engine) — the begin-playback dispatch is gated **inside one of these**. Tracing them + (now possible with the fixed probes) for the begin-playback gate + why ours never satisfies it is + **the next step**. The likely ultimate unlock is **measuring canary** (same XEX reaches begin-playback) + to find the upstream state/signal we don't produce. + +Full, evidence-grounded detail (engine/vtable/slot map, the eliminations, the investigation arc, the +method lessons) lives in the agent-memory grounding file referenced in the project memory index +(`milestone2_xma_grounding`). Key anchors: engine `0x40d101c0` vtable `0x8200a1e8` — PUMP slot19 +`sub_825078D8`, begin-playback slot21 `sub_825076F0`, submit slot27 `sub_82505C08`, full-start slot40 +`sub_825061E0`; movie host `0x40bb0440` (engine at `[host+104]`); SM ARM1 `sub_821B4C98` → ARM2 +`sub_821B55D8` → ARM3 `sub_821B5FB8` → poll `sub_821B66B8`. 
+ +### Useful commands +```bash +# Headless run to the video state (~30-40s, ~1B instr); add diagnostic flags as needed: +./target/release/xenia-rs exec sylpheed.iso -n 6000000000 --quiet +# Non-perturbing PC probes (now usable on the movie subsystem): +RUST_LOG=warn,xenia_apu=info XENIA_AUDIT_PC_PROBE=0x825078d8,0x82505c08 \ + ./target/release/xenia-rs exec sylpheed.iso -n 6000000000 --quiet +# Dispatch recorder (filtered): +XENIA_DISPATCH_REC=1 XENIA_DISPATCH_REC_TARGETS=0x825076f0,0x82505c08 \ + ./target/release/xenia-rs exec sylpheed.iso -n 6000000000 --quiet +# Golden / determinism check: +CARGO_BUILD_JOBS=4 cargo test -p xenia-app --release --test sylpheed_oracles -- --ignored sylpheed_n50m +# Visual (watch the splash; ASK a human to watch — never self-screenshot): +./target/release/xenia-rs exec sylpheed.iso --ui +``` +⚠️ Probe/run discipline: kill background runs by pid or `pkill -x xenia-rs` (NEVER `pkill -f`, it +self-matches the launcher). Runs are deterministic (instruction-count clock). + +🤖 Generated with [Claude Code](https://claude.com/claude-code) diff --git a/crates/xenia-analysis/src/vtables.rs b/crates/xenia-analysis/src/vtables.rs index 06e4219..35d19b0 100644 --- a/crates/xenia-analysis/src/vtables.rs +++ b/crates/xenia-analysis/src/vtables.rs @@ -26,6 +26,14 @@ use xenia_xex::pe::PeSection; use crate::demangle; +/// Maximum number of consecutive non-function slots tolerated inside an +/// anchor-recovered vtable before the run is considered terminated. MSVC +/// vtables can carry null / pure-virtual / unrecognised-thunk slots in their +/// head or interior; a small budget lets those through without merging two +/// physically-adjacent vtables. Kept small to avoid bridging the gap between +/// distinct tables. +const MAX_ANCHOR_GAP: usize = 2; + /// One detected vtable. 
#[derive(Debug, Clone)] pub struct Vtable { @@ -56,6 +64,35 @@ pub fn analyze( image_base: u32, sections: &[PeSection], function_starts: &std::collections::BTreeSet<u32>, +) -> Vec<Vtable> { + analyze_with_anchors(pe, image_base, sections, function_starts, &std::collections::BTreeSet::new()) +} + +/// Like [`analyze`], but additionally recovers vtables whose base address is +/// known a-priori from a constructor vptr-write store (an "anchor"). The +/// contiguity heuristic in pass 1 fragments any vtable whose head region +/// contains words that don't resolve to recognised function entries (null / +/// pure-virtual / unrecognised thunk slots); those vtables are never emitted +/// and the downstream typed-dispatch resolver can't type objects of that +/// class. An anchor is a *content-independent* vtable signal — the ctor +/// literally installs `vtable_base` into `this+0` via +/// `addis/addi (or lis/ori) → stw rX, 0(rThis)` — so for every anchor not +/// already covered by a pass-1 run we synthesise a vtable starting at that +/// base, reading the fnptr-array run while *tolerating* up to +/// [`MAX_ANCHOR_GAP`] consecutive non-function slots before terminating. +/// +/// `anchors` are absolute VAs of vtable bases (from +/// [`scan_vptr_write_constants`]). Existing pass-1 vtables are kept unchanged +/// (no regression): an anchor that already coincides with a detected vtable +/// base is skipped, and an anchor that lands *inside* an existing run is also +/// skipped (it's a sub-object pointer, not a fresh table). +#[tracing::instrument(skip_all, fields(image_base = format_args!("{:#010x}", image_base)))] +pub fn analyze_with_anchors( + pe: &[u8], + image_base: u32, + sections: &[PeSection], + function_starts: &std::collections::BTreeSet<u32>, + anchors: &std::collections::BTreeSet<u32>, ) -> Vec<Vtable> { let started = std::time::Instant::now(); // Sections we'll scan for vtable bodies. 
@@ -117,6 +154,120 @@ pub fn analyze( let _ = (va_start, va_end); } + // --- Anchor-driven recovery (vptr-write-anchored vtables) --- + // + // Build a coverage interval set from pass-1 runs so we don't re-emit a + // table for an anchor that already lies within an extracted vtable. + let mut covered: Vec<(u32, u32)> = candidates + .iter() + .map(|v| (v.address, v.address + v.length * 4)) + .collect(); + covered.sort_unstable(); + + let is_covered = |addr: u32, covered: &[(u32, u32)]| -> bool { + covered.iter().any(|&(s, e)| addr >= s && addr < e) + }; + + // Section lookup for "which scan target contains this VA?" + let scan_targets_va: Vec<(u32, u32, usize, usize)> = sections + .iter() + .filter(|s| matches!(s.name.as_str(), ".rdata" | ".data")) + .map(|s| { + let va = image_base + s.virtual_address; + ( + va, + va + s.virtual_size, + s.virtual_address as usize, + (s.virtual_address + s.virtual_size) as usize, + ) + }) + .collect(); + + // Cap a recovered run at the *next anchor* so two physically-adjacent + // anchored vtables don't merge. We deliberately do NOT cap at pass-1 + // fragments: a fragment is a sub-run the contiguity scan carved out of a + // larger table, and the anchor legitimately re-absorbs it (subsumed + // fragments are removed afterwards). + let anchor_bases: std::collections::BTreeSet<u32> = anchors.iter().copied().collect(); + + let mut recovered = 0usize; + let mut newly: Vec<Vtable> = Vec::new(); + for &anchor in anchors { + if is_covered(anchor, &covered) { continue; } + // Locate the containing .rdata/.data section. + let Some(&(va_lo, va_hi, raw_lo, raw_hi)) = + scan_targets_va.iter().find(|&&(lo, hi, _, _)| anchor >= lo && anchor < hi) + else { continue }; + if anchor % 4 != 0 { continue; } + let raw_hi = raw_hi.min(pe.len()); + // Read the fnptr-array run starting at the anchor. 
Tolerate small + // gaps of non-function slots (null / pure-virtual / unrecognised), + // but require the run to actually contain at least one real function + // (otherwise it's just data, not a vtable). + let next_base = anchor_bases.range((anchor + 4)..).next().copied(); + let mut methods: Vec<u32> = Vec::new(); + let mut gap = 0usize; + let mut real_fns = 0usize; + let mut off = (anchor - va_lo) as usize + raw_lo; + let mut va = anchor; + while off + 4 <= raw_hi && va < va_hi { + if let Some(nb) = next_base && va >= nb { break; } + let val = u32::from_be_bytes([pe[off], pe[off + 1], pe[off + 2], pe[off + 3]]); + if function_starts.contains(&val) { + methods.push(val); + real_fns += 1; + gap = 0; + } else { + // A non-function slot. Keep the slot (so downstream slot + // indexing stays aligned) but count toward the gap budget. + gap += 1; + if gap > MAX_ANCHOR_GAP { + // Drop the trailing gap slots — they belong past the + // table's end. + methods.truncate(methods.len().saturating_sub(gap - 1)); + break; + } + methods.push(val); + } + off += 4; + va += 4; + } + // Trim any trailing non-function slots (the table ends at its last + // real method). + while methods.last().is_some_and(|&m| !function_starts.contains(&m)) { + methods.pop(); + } + if real_fns == 0 || methods.is_empty() { continue; } + let length = methods.len() as u32; + newly.push(Vtable { + address: anchor, + length, + col_address: None, + class_name: synth_anon_name(&methods), + rtti_present: false, + base_classes_json: None, + methods, + }); + recovered += 1; + } + if recovered > 0 { + // Drop pass-1 fragments fully subsumed by a recovered (anchored) + // vtable — the anchor base is authoritative and the fragment was a + // contiguity-scan artifact of the same table. Keep fragments that + // only partially overlap (defensive; shouldn't happen for true + // sub-runs) so we never lose method coverage. 
+ let recovered_spans: Vec<(u32, u32)> = + newly.iter().map(|v| (v.address, v.address + v.length * 4)).collect(); + candidates.retain(|v| { + !recovered_spans + .iter() + .any(|&(s, e)| v.address >= s && v.address + v.length * 4 <= e) + }); + candidates.extend(newly); + tracing::info!(recovered, "vtables recovered from vptr-write anchors"); + } + let _ = &covered; + // RTTI walk: for each candidate, look at vtable[-1]. let pe_image_base = image_base; for v in &mut candidates { @@ -268,6 +419,98 @@ fn read_class_hierarchy( serde_json::to_string(&names).ok() } +/// Pre-pass: discover candidate vtable *bases* from constructor vptr-write +/// stores, independent of the static contiguity heuristic. A vptr install is +/// the canonical `addis/addi` (or `lis/ori`) immediate build of a constant +/// pointing into `.rdata` / `.data`, followed by `stw rX, 0(rThis)` — i.e. the +/// ctor writing the vtable pointer to `this+0`. We return the set of such +/// constants; these are fed to [`analyze_with_anchors`] so a vtable with +/// non-function head words isn't lost. +/// +/// We only consider stores at displacement 0 (the primary vptr; secondary +/// MI vptrs land at non-zero offsets and are handled by the existing +/// contiguity scan / typed-dispatch resolver well enough). The register +/// tracker mirrors the lis+addi propagation used elsewhere and is reset at +/// every basic-block boundary (`block_boundaries`). +pub fn scan_vptr_write_constants( + pe: &[u8], + image_base: u32, + functions: &std::collections::BTreeMap<u32, (u32, bool)>, // start -> (end, is_saverestore) + sections: &[PeSection], + block_boundaries: &std::collections::HashSet<u32>, +) -> std::collections::BTreeSet<u32> { + // Ranges that a vtable base may legitimately live in. 
+ let data_ranges: Vec<(u32, u32)> = sections + .iter() + .filter(|s| matches!(s.name.as_str(), ".rdata" | ".data")) + .map(|s| (image_base + s.virtual_address, image_base + s.virtual_address + s.virtual_size)) + .collect(); + let in_data = |a: u32| data_ranges.iter().any(|&(s, e)| a >= s && a < e); + + const OP_ADDI: u32 = 14; + const OP_ADDIS: u32 = 15; + const OP_ORI: u32 = 24; + const OP_STW: u32 = 36; + const OP_X_FORM: u32 = 31; + + let read = |addr: u32| -> Option<u32> { + let off = addr.wrapping_sub(image_base) as usize; + if off + 4 > pe.len() { return None; } + Some(u32::from_be_bytes([pe[off], pe[off + 1], pe[off + 2], pe[off + 3]])) + }; + + let mut anchors: std::collections::BTreeSet<u32> = std::collections::BTreeSet::new(); + for (&fn_start, &(fn_end, is_saverestore)) in functions { + if is_saverestore { continue; } + let mut reg: [Option<u32>; 32] = [None; 32]; + let mut pc = fn_start; + while pc < fn_end { + if pc != fn_start && block_boundaries.contains(&pc) { + reg = [None; 32]; + } + let Some(instr) = read(pc) else { break }; + let op = instr >> 26; + let rd = ((instr >> 21) & 0x1F) as usize; + let ra = ((instr >> 16) & 0x1F) as usize; + let simm = ((instr & 0xFFFF) as i16) as i32; + let uimm = instr & 0xFFFF; + match op { + OP_ADDIS if ra == 0 => reg[rd] = Some(uimm << 16), + OP_ADDIS => reg[rd] = reg[ra].map(|b| b.wrapping_add(uimm << 16)), + OP_ADDI if ra != 0 => reg[rd] = reg[ra].map(|b| b.wrapping_add(simm as u32)), + OP_ADDI => reg[rd] = Some(simm as u32), + OP_ORI => { + let rs = rd; + reg[ra] = reg[rs].map(|b| b | uimm); + } + OP_STW => { + // `stw rS, off(rA)` with displacement 0 = primary vptr install. 
+ if ra != 0 + && simm == 0 + && let Some(val) = reg[rd] + && in_data(val) + { + anchors.insert(val); + } + } + 32..=35 | 40..=43 | 48..=51 => reg[rd] = None, + OP_X_FORM => { + let xo = (instr >> 1) & 0x3FF; + if xo != 444 && xo != 467 { reg[rd] = None; } // keep `or`(444=mr)/`mtspr`-ish + } + 18 | 16 => { + if (instr & 1) != 0 { + for r in 0..=12 { reg[r] = None; } + } + } + _ => {} + } + pc = pc.wrapping_add(4); + } + } + anchors +} + /// Synthetic name for an RTTI-stripped vtable, derived from a stable hash of /// the sorted method-PC list. Two vtables with identical method ordering /// collapse to the same anonymous name. @@ -385,6 +628,112 @@ mod tests { assert!(!vtables[0].rtti_present); } + #[test] + fn anchor_recovers_vtable_with_nonfn_head() { + // A vtable whose head has a null + an unrecognised word, so the + // contiguity scan (≥3 contiguous known fns) fragments it. The anchor + // (from a ctor vptr-write) must recover the whole table from its base. + let image_base = 0x82000000u32; + let rdata_va = 0x1000u32; + let text_va = 0x2000u32; + let rdata_size = 0x40u32; + let text_size = 0x100u32; + let total = (text_va + text_size) as usize; + let mut pe = vec![0u8; total]; + + let f0 = image_base + text_va; + let f1 = image_base + text_va + 0x10; + let f2 = image_base + text_va + 0x20; + // Slots: [null, NONFN(0xDEAD), f0, f1, f2] + let slots: [u32; 5] = [0, 0xDEADBEEF, f0, f1, f2]; + for (i, val) in slots.iter().enumerate() { + pe[rdata_va as usize + i * 4..rdata_va as usize + (i + 1) * 4] + .copy_from_slice(&val.to_be_bytes()); + } + + let sections = vec![ + PeSection { + name: ".rdata".into(), + virtual_address: rdata_va, + virtual_size: rdata_size, + raw_offset: rdata_va, + raw_size: rdata_size, + flags: 0x4000_0040, + }, + PeSection { + name: ".text".into(), + virtual_address: text_va, + virtual_size: text_size, + raw_offset: text_va, + raw_size: text_size, + flags: 0x6000_0020, + }, + ]; + let mut function_starts = std::collections::BTreeSet::new(); 
+ for &pc in &[f0, f1, f2] { function_starts.insert(pc); } + + // Without an anchor: the head gap (null + nonfn = 2 slots) means the + // contiguous run is only [f0,f1,f2]=3 starting at +0x08, so pass-1 + // still finds it but at the WRONG base (0x...1008), not the true base. + let no_anchor = analyze(&pe, image_base, &sections, &function_starts); + assert!( + !no_anchor.iter().any(|v| v.address == image_base + rdata_va), + "without anchor the table is not recovered at its true base" + ); + + // With the anchor at the true base: + let mut anchors = std::collections::BTreeSet::new(); + anchors.insert(image_base + rdata_va); + let with_anchor = + analyze_with_anchors(&pe, image_base, &sections, &function_starts, &anchors); + let v = with_anchor + .iter() + .find(|v| v.address == image_base + rdata_va) + .expect("anchor must recover vtable at its true base"); + // length spans through f2 (slot 4): 5 slots. + assert_eq!(v.length, 5, "table spans null/nonfn head through last fn"); + assert_eq!(v.methods[2], f0); + assert_eq!(v.methods[4], f2); + } + + #[test] + fn scan_vptr_write_constants_finds_ctor_store() { + // Encode a ctor: addis r11,r0,0x8201; addi r11,r11,lo; stw r11,0(r31) + // installing vtable base 0x8200A908 into this+0. + let image_base = 0x82000000u32; + let ctor = 0x82001000u32; + let mut pe = vec![0u8; 0x4000]; + // Lay out a tiny .rdata at 0x...A900 so the constant lands in-range. + let vt_base = 0x8200A908u32; // 0x82010000 - 22264 + let addis = (15u32 << 26) | (11 << 21) | (0 << 16) | 0x8201; + let lo = (vt_base & 0xFFFF) as i16; // -22264 + let addi = (14u32 << 26) | (11 << 21) | (0 << 16) | ((lo as u16) as u32); + // addi r11,r0,lo would set r11=lo (sign-extended); we need addis+addi + // chained. Re-encode addis into r11 from r0, then addi r11,r11,lo. 
+ let addi2 = (14u32 << 26) | (11 << 21) | (11 << 16) | ((lo as u16) as u32); + let stw = (36u32 << 26) | (11 << 21) | (31 << 16) | 0; // stw r11,0(r31) + let at = (ctor - image_base) as usize; + pe[at..at + 4].copy_from_slice(&addis.to_be_bytes()); + pe[at + 4..at + 8].copy_from_slice(&addi2.to_be_bytes()); + pe[at + 8..at + 12].copy_from_slice(&stw.to_be_bytes()); + let _ = addi; + + let sections = vec![PeSection { + name: ".rdata".into(), + virtual_address: 0xA900, + virtual_size: 0x200, + raw_offset: 0xA900, + raw_size: 0x200, + flags: 0x4000_0040, + }]; + let mut funcs: std::collections::BTreeMap<u32, (u32, bool)> = std::collections::BTreeMap::new(); + funcs.insert(ctor, (ctor + 0x40, false)); + let anchors = scan_vptr_write_constants( + &pe, image_base, &funcs, &sections, &std::collections::HashSet::new(), + ); + assert!(anchors.contains(&vt_base), "ctor vptr store must yield anchor {vt_base:#x}, got {anchors:?}"); + } + #[test] fn rejects_2_method_run() { let image_base = 0x82000000u32; diff --git a/crates/xenia-app/src/main.rs b/crates/xenia-app/src/main.rs index 0e2378d..4a58505 100644 --- a/crates/xenia-app/src/main.rs +++ b/crates/xenia-app/src/main.rs @@ -415,6 +415,18 @@ fn main() -> Result<()> { // metrics summary. let _obs = observability::init(&config)?; + // Env-gated indirect-dispatch recorder (off by default). Resolve the env + // once here; a scope guard dumps the recorded (call_site -> target) table + // at end-of-run no matter how the run terminates. + xenia_cpu::dispatch_rec::install(); + struct DispatchRecGuard; + impl Drop for DispatchRecGuard { + fn drop(&mut self) { + xenia_cpu::dispatch_rec::dump(); + } + } + let _dispatch_rec_guard = DispatchRecGuard; + let result = match cli.command { Commands::Disasm { path, count, at } => cmd_disasm(&path, count, at), Commands::Exec { @@ -1437,6 +1449,45 @@ fn cmd_exec_inner( // atoms that live inside `kernel.gpu.mmio`. 
mem.add_mmio_region(xenia_gpu::build_mmio_region(kernel.gpu.mmio())); + // apu stage 1 — reserve the 320-entry XMA context array and install the + // `0x7FEA0000` register aperture (mirrors canary's `XmaDecoder::Setup`). + // + // Physical placement: canary stores a *physical* address in + // `ContextArrayAddress` (reg 0x600) — `PhysicalHeap::GetPhysicalAddress` + // returns `va - heap_base` (== `va & 0x1FFFFFFF` for the physical heaps). + // Our memory model is FLAT: `translate_virtual` is a raw `membase + addr` + // with no separate physical-window mirror, and `translate_physical` masks + // `& 0x1FFFFFFF` — so the two only coincide for low (`< 0x2000_0000`) VAs. + // `heap_alloc` returns a `0x40000000`-region VA, so `va & 0x1FFFFFFF` would + // be 0 (disagreeing with the context pointers `XMACreateContext` hands out + // at `va + i*64`). The guest reads `ContextArrayAddress` and indexes it as + // `base + i*64`; for that to equal the pointers it dereferences, the base + // MUST equal the VA. So we advertise `va` itself — self-consistent in the + // flat model (the guest reaches every context through the same VA space). + // Stage 3's decoder will read the context structs via this VA directly + // (not via `translate_physical`). The 20480-byte buffer is page-committed + // by `heap_alloc`, so the guest never faults writing the 64-byte structs. 
+ { + let array_size = + (xenia_apu::XMA_CONTEXT_COUNT as u32) * xenia_apu::XMA_CONTEXT_SIZE; // 320 * 64 + match kernel.heap_alloc(array_size, &mem) { + Some(va) => { + let phys = va; // flat model: array base == VA (see note above) + kernel.xma.lock().unwrap().init(va, phys); + mem.add_mmio_region(xenia_apu::build_mmio_region(kernel.xma.clone())); + tracing::info!( + va = format_args!("{va:#010x}"), + phys = format_args!("{phys:#010x}"), + size = format_args!("{array_size:#x}"), + "xma: context array reserved + 0x7FEA0000 aperture installed" + ); + } + None => { + tracing::error!("xma: failed to reserve context array (heap exhausted)"); + } + } + } + // Install the initial guest thread on HW slot 0. The thread handle we // hand the scheduler isn't visible to any guest API yet, but joiners // (XThreadWait-style) will see it via `find_by_tid`. @@ -2354,6 +2405,14 @@ fn coord_post_round( let _ = gpu_runs; } + // APU stage 3 — pump the XMA decoder on the CPU thread, same cadence as the + // inline GPU. Deterministic (no host thread / clock): for each context with + // a pending kick it runs one Work() pass, decoding the guest's XMA packets + // into PCM and writing it back into the output ring + context struct. + if let Ok(mut xma) = kernel.xma.try_lock() { + xma.decode_pending(mem); + } + if kernel.gpu.has_pending_interrupts() { for pi in kernel.gpu.take_pending_interrupts() { // Canary `ExecutePacketType3_INTERRUPT` dispatches the callback @@ -2445,7 +2504,7 @@ fn worker_prologue( stats: &mut ExecStats, ) -> PrologueOutcome { use xenia_cpu::interpreter::{step_cached, StepResult}; - use xenia_cpu::scheduler::{HwState, INITIAL_GUEST_TID}; + use xenia_cpu::scheduler::{BlockReason, HwState, INITIAL_GUEST_TID}; use xenia_cpu::PpcOpcode; const LR_HALT: u32 = xenia_cpu::context::LR_HALT_SENTINEL as u32; @@ -2492,12 +2551,26 @@ fn worker_prologue( // 1) Halt-sentinel check (per HW thread). 
if pc == LR_HALT { + // iterate-4A: the async audio-callback injection (`try_inject_audio_callback`) + // sets `interrupts.saved`/`injected_ref` to the dedicated audio + // worker and runs REAL guest code (`sub_824D29F0`, which calls + // blocking kernel APIs) across MANY scheduler rounds before + // returning to `LR_HALT_SENTINEL`. The restore must fire only when + // the thread that *actually* reached the sentinel is the injected + // worker itself — i.e. the FULL `ThreadRef` (hw_id AND idx), which + // `scheduler.current` holds after `begin_slot_visit`. Matching on + // `hw_id` alone let ANY OTHER thread sharing that HW slot reach + // `LR_HALT` and consume the audio worker's `saved` slot; when the + // worker later truly returned, `saved` was already `None`, the + // guard failed, and control fell through to "marking exited" — the + // worker was removed and every subsequent audio callback dropped + // (`find_by_handle` skips Exited threads). The graphics ISR path is + // fully synchronous (`dispatch_graphics_interrupts` restores inline + // and never leaves `interrupts.saved` set across rounds), so this + // restore lifecycle is exclusive to audio and graphics is + // unaffected. 
let injected_here = kernel.interrupts.saved.is_some() - && kernel - .interrupts - .injected_ref - .map(|r| r.hw_id == hw_id) - == Some(true); + && kernel.interrupts.injected_ref == kernel.scheduler.current; if injected_here && let Some(saved) = kernel.interrupts.saved.take() { @@ -2509,17 +2582,64 @@ fn worker_prologue( kernel.interrupts.delivered += 1; let source = saved.source; let mut restore_outcome = "ready"; - let current = kernel.scheduler.thread(target_ref).state.clone(); - if let HwState::ServicingIrq(reason) = current { - kernel.scheduler.thread_mut(target_ref).state = - HwState::Blocked(reason); - restore_outcome = "reblocked"; + + // iterate-4A: the dedicated audio worker's canonical resting + // state is "parked on its synthetic handle, awaiting the next + // callback injection". The callback (`sub_824D29F0`) runs real + // guest code that can be flipped `ServicingIrq -> Ready` by an + // intervening `wake_ref` (a `KeSetEvent`/timeout targeting the + // worker as a waiter mid-callback). The old re-block heuristic + // only re-parked when the state was *still* `ServicingIrq`, so + // such a wake left the worker `Ready` — it then ran its thread + // entry to the `LR_HALT` sentinel, EXITED, and every subsequent + // callback dropped (`find_by_handle` skips Exited workers), + // wedging the intro-video audio→XMA pipeline. When this restore + // is an audio callback (`source == INTERRUPT_SOURCE_AUDIO`), + // re-park the worker UNCONDITIONALLY onto its synthetic + // park-handle so it survives to receive the next fire. (Graphics + // restores keep the `ServicingIrq`-only re-block: a graphics + // victim is a borrowed real thread, not a parked worker, and the + // old behavior there must stay byte-identical.) 
+ if source == xenia_kernel::INTERRUPT_SOURCE_AUDIO { + let worker_handle = + kernel.scheduler.thread(target_ref).thread_handle; + let index = worker_handle.and_then(|h| { + kernel + .xaudio + .worker_handles + .iter() + .position(|wh| *wh == Some(h)) + }); + if let Some(index) = index { + let park = xenia_kernel::xaudio::synthetic_park_handle(index); + kernel.scheduler.thread_mut(target_ref).state = + HwState::Blocked(BlockReason::WaitAny { + handles: vec![park], + deadline: None, + }); + restore_outcome = "reparked"; + } else if let HwState::ServicingIrq(reason) = + kernel.scheduler.thread(target_ref).state.clone() + { + // Fallback (handle unresolved): preserve the legacy + // ServicingIrq-only re-block rather than leak the worker. + kernel.scheduler.thread_mut(target_ref).state = + HwState::Blocked(reason); + restore_outcome = "reblocked"; + } + } else { + let current = kernel.scheduler.thread(target_ref).state.clone(); + if let HwState::ServicingIrq(reason) = current { + kernel.scheduler.thread_mut(target_ref).state = + HwState::Blocked(reason); + restore_outcome = "reblocked"; + } } tracing::debug!( source, hw_id, outcome = restore_outcome, - "graphics interrupt: callback returned" + "interrupt: callback returned" ); return PrologueOutcome::Continue; } @@ -2905,12 +3025,55 @@ fn run_superblock( let budget = superblock_budget(); - // Probe / mem-watch / debugger-hook modes need per-block-entry - // observability; in those modes never chain (run exactly one block, - // identical to the pre-superblock behaviour). The block-cache fast - // path is only entered when hooks/DB are off anyway, but a probe or - // mem-watch can be armed alongside it. - let chain_allowed = !kernel.any_probe_active() && !mem.has_mem_watch(); + // Heisenbug fix (toolkit audit, 2026-06-21): probes and mem-watch are + // OBSERVE-ONLY diagnostics and must NOT change guest scheduling. 
The + // previous implementation disabled superblock chaining whenever any + // probe / mem-watch was armed (so the per-block-entry observation in + // `worker_prologue` was reached for every block). But chaining is what + // determines thread interleaving, so arming a probe perturbed the + // schedule — it starved the movie/XMV subsystem so it never reached the + // video state, making the probe useless on exactly the code we most + // needed to observe (`XENIA_SUPERBLOCK_BUDGET=1` reproduces the same + // starvation, confirming chaining is the lever). + // + // The fix fires the SAME per-block-entry observation INSIDE the chain + // loop, at every chained block's entry PC (see `fire_block_entry_probes` + // below), so chaining — and therefore scheduling — is byte-identical + // whether or not a probe is armed. `chain_allowed` no longer depends on + // the probe/mem-watch state. + // + // `wants_hooks()` (the interactive debugger / breakpoint path) still + // forces the per-instruction path in `worker_prologue` and never reaches + // `run_superblock`, so the only remaining reason to never chain here is + // the explicit budget==1 reproduction request. + let chain_allowed = budget > 1; + + // Per-block-entry diagnostic observation, replicating exactly what + // `worker_prologue` does at the first block of a slot visit: + // 1. the four `fire_*_if_match` probe helpers (read-only; each + // re-checks its own armed set against the live ctx PC), and + // 2. the mem-watch writer-context publish, so a watched store that + // fires mid-block is attributed to the CORRECT chained block's + // entry PC / LR (matching the single-block reporting granularity) + // instead of the stale superblock-entry PC. + // The closure is a pure function of the live scheduler context; the + // caller must ensure `ctx.pc` equals the block-entry PC before calling. 
+ let probe_hw_id = wc.hw_id; + let fire_block_entry_probes = + |kernel: &mut xenia_kernel::KernelState, mem: &xenia_memory::GuestMemory| { + let hw_id = probe_hw_id; + if kernel.any_probe_active() { + kernel.fire_ctor_probe_if_match(hw_id, mem); + kernel.fire_branch_probe_if_match(hw_id); + kernel.fire_audit_pc_probe_if_match(hw_id, mem); + kernel.fire_lr_trace_if_match(hw_id); + } + if mem.has_mem_watch() { + let ctx = kernel.scheduler.ctx(hw_id); + let tid_w = kernel.scheduler.tid(hw_id).unwrap_or(0); + xenia_memory::set_writer_ctx(tid_w, ctx.pc, ctx.lr as u32); + } + }; let mut block_ptr = first_block_ptr; let mut pc_before = first_pc_before; @@ -2955,11 +3118,20 @@ fn run_superblock( break (result, block_ptr, pc_before); } - // Chain: build/fetch the next block. Re-borrows `wc.block_cache`, - // which invalidates the previous `block_ptr` — but we've already - // finished using it (only `sync_sensitive`/diagnostics were read, - // above), so the raw-pointer aliasing rule is respected. + // Chain into the next block. `ctx.pc` now equals `next_pc` (the + // chained block's entry), so fire the per-block-entry observation + // BEFORE stepping it — identical to what `worker_prologue` did at + // the first block. This keeps the probe firing at EVERY armed + // block-entry while leaving the chaining decision (and thus the + // schedule) untouched. The first block was already observed by the + // prologue, so we only observe the newly-chained blocks here. pc_before = next_pc; + fire_block_entry_probes(kernel, mem); + + // Build/fetch the next block. Re-borrows `wc.block_cache`, which + // invalidates the previous `block_ptr` — but we've already finished + // using it (only `sync_sensitive`/diagnostics were read, above), so + // the raw-pointer aliasing rule is respected. 
block_ptr = wc.block_cache.lookup_or_build(next_pc, mem) as *const _; }; @@ -2993,6 +3165,15 @@ fn run_execution( let mut stats = ExecStats::default(); let _ = quiet; // retained for future per-kind suppression + // APU stage 3 — give the XMA decoder a stable pointer to the guest memory + // mapping `run_execution` runs against, so the kick MMIO write can run + // Work() synchronously (canary `!use_dedicated_xma_thread` semantics: the + // game observes the updated context the instant its kick store retires). + // `mem` outlives this call for both the headless and UI paths. + if let Ok(mut xma) = kernel.xma.lock() { + xma.set_memory(mem); + } + // `--halt-on-deadlock` CLI flag OR `XENIA_HALT_ON_DEADLOCK=1|true` env var: // when the scheduler next hits a hard deadlock (every live HW thread // blocked on a handle wait with no pending timer) we bail out with a @@ -4093,10 +4274,18 @@ fn dump_thread_diagnostic( ), } } - if quiet { - return; - } use xenia_kernel::objects::KernelObject; + + // Toolkit-audit fix (2026-06-21): only the ALWAYS-ON thread/waiter table + // is suppressed by `--quiet`. The explicitly-armed diagnostics below + // (`--trace-handles`, `--trace-handles-focus`, `--dump-addr`) are + // requested output — arming the flag IS the user asking for it — and + // were previously swallowed by the blanket `if quiet { return; }`, which + // made the documented headless `--quiet` invocation silently drop every + // handle/focus/dump report. They are each self-gated below (on + // `audit.enabled` / `!audit.focus.is_empty()` / `!dump_addrs.is_empty()`) + // so they only print when actually armed. 
+ if !quiet { println!("\n=== Thread diagnostics ==="); for (hw_id, slot) in kernel.scheduler.slots.iter().enumerate() { if slot.runqueue.is_empty() { @@ -4193,6 +4382,7 @@ fn dump_thread_diagnostic( println!(" cs={:#010x} waiters(tid)={:?}", cs_ptr, tids); } } + } // end `if !quiet` (always-on thread/waiter table) // Audit trails (only when --trace-handles flipped the flag). For each // tracked handle, emit a compact block: kind, creator, and the bounded @@ -4868,8 +5058,23 @@ fn cmd_dis( // pointer-validity oracle; runs over .rdata + .data. let function_starts: std::collections::BTreeSet<u32> = func_analysis.functions.keys().copied().collect(); - let vtables = xenia_analysis::vtables::analyze( - &pe_image, base, &sections, &function_starts, + // Anchor discovery: recover vtable bases from constructor vptr-write + // stores so a vtable with non-function head words (null / pure-virtual / + // unrecognised thunk slots) isn't fragmented away by the contiguity + // heuristic. (Fixes e.g. the XMV engine vtable 0x8200a908.)
+ let vptr_anchor_funcs: std::collections::BTreeMap<u32, (u32, bool)> = func_analysis + .functions + .iter() + .map(|(&s, fi)| (s, (fi.end, fi.is_saverestore))) + .collect(); + let vptr_block_boundaries: std::collections::HashSet<u32> = + xref_result.labels.keys().copied().collect(); + let vtable_anchors = xenia_analysis::vtables::scan_vptr_write_constants( + &pe_image, base, &vptr_anchor_funcs, &sections, &vptr_block_boundaries, + ); + info!(vtable_anchors = vtable_anchors.len(), "vptr-write anchor scan complete"); + let vtables = xenia_analysis::vtables::analyze_with_anchors( + &pe_image, base, &sections, &function_starts, &vtable_anchors, ); let rtti_count = vtables.iter().filter(|v| v.rtti_present).count(); info!( diff --git a/crates/xenia-app/tests/golden/sylpheed_n50m.json b/crates/xenia-app/tests/golden/sylpheed_n50m.json index 08f9714..08807bf 100644 --- a/crates/xenia-app/tests/golden/sylpheed_n50m.json +++ b/crates/xenia-app/tests/golden/sylpheed_n50m.json @@ -1,9 +1,9 @@ { - "instructions": 50000110, - "imports": 243387, + "instructions": 50000200, + "imports": 189264, "unimpl": 0, - "draws": 1279, - "swaps": 260, + "draws": 768, + "swaps": 157, "unique_render_targets": 2, "shader_blobs_live": 6, "texture_cache_entries": 1 diff --git a/crates/xenia-apu/Cargo.toml b/crates/xenia-apu/Cargo.toml index 7da119c..da6b8fa 100644 --- a/crates/xenia-apu/Cargo.toml +++ b/crates/xenia-apu/Cargo.toml @@ -6,5 +6,12 @@ license.workspace = true [dependencies] xenia-types = { workspace = true } +xenia-memory = { workspace = true } tracing = { workspace = true } thiserror = { workspace = true } + +# Raw FFmpeg FFI for the XMA2 audio decoder (stage 3). The system libs are +# FFmpeg 6.1 (libavcodec 60), so we pin the matching `6.1` series. The `build` +# feature regenerates bindings via bindgen against the installed headers, so +# the FFI matches the distro FFmpeg exactly. We only need avcodec + avutil.
+ffmpeg-sys-next = { version = "6.1", default-features = false, features = ["avcodec"] } diff --git a/crates/xenia-apu/src/lib.rs b/crates/xenia-apu/src/lib.rs index 885083c..5b772cf 100644 --- a/crates/xenia-apu/src/lib.rs +++ b/crates/xenia-apu/src/lib.rs @@ -1,3 +1,9 @@ +pub mod xma; +pub mod xma2_codec; +pub mod xma_decode; + +pub use xma::{build_mmio_region, XmaDecoder, XMA_CONTEXT_COUNT, XMA_CONTEXT_SIZE}; + /// Audio processing unit stub. Logging only for now. pub struct AudioSystem { pub enabled: bool, diff --git a/crates/xenia-apu/src/xma.rs b/crates/xenia-apu/src/xma.rs new file mode 100644 index 0000000..36df998 --- /dev/null +++ b/crates/xenia-apu/src/xma.rs @@ -0,0 +1,932 @@ +//! Register-mapped XMA context system — a faithful port of xenia-canary's +//! `apu/xma_decoder.cc` context-array + MMIO machinery, MINUS the audio +//! decoder itself (stage 3). +//! +//! The guest allocates XMA contexts via `XMACreateContext` (which hands back a +//! pointer into our 320-entry context array in physical guest memory), writes +//! the 64-byte `XMA_CONTEXT_DATA` struct, then *kicks* decode by writing the +//! per-context bit into the `0x7FEA0000` register aperture. This module +//! satisfies all of that without faulting and records which contexts the guest +//! kicked; stage 3 will consume the recorded `pending` flags to actually +//! produce PCM. +//! +//! ## Byte order +//! The guest accesses the aperture byte-reversed (`stwbrx`/`lwbrx`), so the raw +//! `u32` our MMIO boundary delivers is byte-swapped relative to the logical +//! register value — exactly the situation canary handles with `xe::byte_swap`. +//! So `write_register` swaps the incoming value before decoding and the +//! register file holds host-order values; `read_register` swaps on the way out. +//! This was proven empirically: the guest's Clear writes arrive as +//! `0x01000000`/`0x02000000`/`0x04000000`, i.e. byte-reversed `1`/`2`/`4`, +//! 
targeting contexts 0/1/2 (which it had just allocated) — NOT 24/25/26. The +//! register-index math (`(addr & 0xFFFF) / 4`) is the same as canary's. + +use std::sync::atomic::{AtomicU32, Ordering}; +use std::sync::{Arc, Mutex}; + +use xenia_memory::access::MemoryAccess; +use xenia_memory::{GuestMemory, MmioRegion}; + +use crate::xma_decode::{self, ContextDecodeState, XmaContextData}; + +/// Size in bytes of an `XMA_CONTEXT_DATA` struct (canary `xma_context.h`). +/// Stage 1 does not decode the fields — only the stride matters. +pub const XMA_CONTEXT_SIZE: u32 = 64; +/// Number of XMA contexts the hardware exposes (canary `kContextCount`). +pub const XMA_CONTEXT_COUNT: usize = 320; + +/// Register aperture base (guest physical). Canary maps the XMA decoder at +/// `0x7FEA0000` in `XmaDecoder::Setup`. +pub const APERTURE_BASE: u32 = 0x7FEA_0000; +/// Mask used by `MmioRegion::contains` so any `0x7FEAxxxx` address hits. +pub const APERTURE_MASK: u32 = 0xFFFF_0000; +/// Total aperture size in bytes (the low 16-bit register window). +pub const APERTURE_SIZE: u32 = 0x0001_0000; + +// ----- Register indices (canary `XmaRegister` enum / xma_register_table.inc). +// Indices are dword indices: byte offset = index * 4. + +/// `ContextArrayAddress` — physical base of the context array. byte 0x1800. +const REG_CONTEXT_ARRAY_ADDRESS: u32 = 0x600; +/// `CurrentContextIndex` — the context the HW is currently servicing. byte +/// 0x1818. Polled by the guest; we rotate it so a poll never sticks. +const REG_CURRENT_CONTEXT_INDEX: u32 = 0x606; + +/// First of the 10 `ContextNKick` registers (`Context0Kick`..`Context9Kick`). +/// byte 0x1940. Each register's bit N kicks context `base*32 + N`. +const REG_CONTEXT_KICK_BASE: u32 = 0x650; +/// First of the 10 `ContextNLock` registers. byte 0x1A40. +const REG_CONTEXT_LOCK_BASE: u32 = 0x690; +/// First of the 10 `ContextNClear` registers. byte 0x1A80. 
+const REG_CONTEXT_CLEAR_BASE: u32 = 0x6A0; +/// Each group spans 10 registers (320 contexts / 32-per-register). +const CONTEXT_GROUP_LEN: u32 = 10; + +/// Number of 32-bit words backing the register file. The highest index we +/// touch is `0x6A9`; round up generously so any in-aperture index is in range +/// (64 KB aperture / 4). +const REGISTER_FILE_WORDS: usize = 0x4000; + +/// Register-mapped XMA context array. Owns the allocation bitmap, the register +/// file, and the per-context kick/enable bookkeeping that stage 3 consumes. +pub struct XmaDecoder { + /// Guest virtual address of the context array (handed back by + /// `allocate_context`). + context_array_guest_va: u32, + /// Physical address stored into `ContextArrayAddress` (reg 0x600). + context_array_phys: u32, + /// 320-slot allocation bitmap, one bit per context (`bitmap[i>>6]` bit + /// `i & 63`). A set bit means *allocated*. + bitmap: [u64; (XMA_CONTEXT_COUNT + 63) / 64], + /// Flat register file, host-native values. Indexed by dword register index. + registers: Vec<u32>, + /// Per-context "decode requested" flag, set on Kick, cleared on Clear. + /// Stage 3 drains this to produce PCM. + pending: [bool; XMA_CONTEXT_COUNT], + /// Per-context enable flag. A Lock disables; a Kick (re-)enables. Mirrors + /// canary's "is_enabled" notion loosely — exact decode semantics are + /// stage 3. + enabled: [bool; XMA_CONTEXT_COUNT], + /// Total kicks observed (diagnostic; lets headless logs show progress). + kick_count: u64, + /// Rotating value served for `CurrentContextIndex` reads so a guest poll + /// can't spin forever on a fixed value. Atomic so the read path can stay + /// `&self`. + current_context_index: AtomicU32, + /// Per-context stage-3 decode state (FFmpeg codec, staged PCM frame, ring + /// bookkeeping). Lazily populated as contexts are decoded. + decode_state: Vec<ContextDecodeState>, + /// Total PCM bytes written to guest output buffers (diagnostic).
+ pcm_bytes_total: u64, + /// Stable pointer to the guest memory mapping, captured at init. Used to run + /// `Work()` SYNCHRONOUSLY inside the kick MMIO write — exactly as canary's + /// default `!use_dedicated_xma_thread` path does (`context.Work()` right in + /// `WriteRegister`), so the game sees the updated context the instant its + /// kick store retires. The mapping lives for the whole run; decode is + /// deterministic and happens on the CPU thread, so this is determinism-safe. + mem_ptr: *const GuestMemory, +} + +// The decoder is owned behind an `Arc<Mutex<XmaDecoder>>` and only ever touched from the +// CPU scheduler thread (kick MMIO writes + the per-round pump). The raw `mem_ptr` +// is a stable whole-run mapping; access is single-threaded. +unsafe impl Send for XmaDecoder {} + +impl XmaDecoder { + /// Construct an un-initialized decoder. Call [`Self::init`] once the + /// context-array memory has been reserved. + pub fn new() -> Self { + Self { + context_array_guest_va: 0, + context_array_phys: 0, + bitmap: [0; (XMA_CONTEXT_COUNT + 63) / 64], + registers: vec![0; REGISTER_FILE_WORDS], + pending: [false; XMA_CONTEXT_COUNT], + enabled: [false; XMA_CONTEXT_COUNT], + kick_count: 0, + current_context_index: AtomicU32::new(0), + decode_state: (0..XMA_CONTEXT_COUNT).map(|_| ContextDecodeState::new()).collect(), + pcm_bytes_total: 0, + mem_ptr: std::ptr::null(), + } + } + + /// Capture the stable guest-memory mapping so the kick MMIO path can run + /// `Work()` synchronously (canary semantics). Call once at boot, after the + /// final `mem` is in its long-lived location. + pub fn set_memory(&mut self, mem: &GuestMemory) { + self.mem_ptr = mem as *const GuestMemory; + } + + /// Wire in the context-array addresses (after the app reserves the buffer) + /// and publish the physical base into `ContextArrayAddress` (reg 0x600), + /// exactly as canary's `XmaDecoder::Setup` does.
+ pub fn init(&mut self, context_array_guest_va: u32, context_array_phys: u32) { + self.context_array_guest_va = context_array_guest_va; + self.context_array_phys = context_array_phys; + self.registers[REG_CONTEXT_ARRAY_ADDRESS as usize] = context_array_phys; + tracing::info!( + va = format_args!("{context_array_guest_va:#010x}"), + phys = format_args!("{context_array_phys:#010x}"), + "xma: context array initialized" + ); + } + + /// Acquire a free context slot and return its guest pointer + /// (`context_array_guest_va + i*64`), or 0 if all 320 slots are in use. + /// Mirrors canary's `XmaDecoder::AllocateContext`. + pub fn allocate_context(&mut self) -> u32 { + for i in 0..XMA_CONTEXT_COUNT { + let word = i >> 6; + let bit = 1u64 << (i & 63); + if self.bitmap[word] & bit == 0 { + self.bitmap[word] |= bit; + let ptr = self.context_array_guest_va + (i as u32) * XMA_CONTEXT_SIZE; + tracing::info!( + index = i, + ptr = format_args!("{ptr:#010x}"), + "xma: allocate_context" + ); + return ptr; + } + } + tracing::warn!("xma: allocate_context — all {} slots in use", XMA_CONTEXT_COUNT); + 0 + } + + /// Free the slot backing `guest_ptr`. Mirrors canary's + /// `XmaDecoder::ReleaseContext`. Pointers outside the array are ignored (the + /// guest never faults); an unaligned pointer releases the slot containing it. + pub fn release_context(&mut self, guest_ptr: u32) { + if guest_ptr < self.context_array_guest_va { + return; + } + let offset = guest_ptr - self.context_array_guest_va; + let i = (offset / XMA_CONTEXT_SIZE) as usize; + if i >= XMA_CONTEXT_COUNT { + return; + } + let word = i >> 6; + let bit = 1u64 << (i & 63); + self.bitmap[word] &= !bit; + self.pending[i] = false; + self.enabled[i] = false; + tracing::info!(index = i, ptr = format_args!("{guest_ptr:#010x}"), "xma: release_context"); + } + + /// Read a register. Returns the stored value, except `CurrentContextIndex` + /// (0x606) which rotates `0..XMA_CONTEXT_COUNT` per read so a polling guest + /// always sees forward progress. Out-of-range indices read 0.
+ pub fn read_register(&self, reg_index: u32) -> u32 { + // The guest accesses the aperture byte-reversed (`lwbrx`), so the + // register file holds host-order values and we swap on the way out — + // exactly as canary's `ReadRegister` returns `xe::byte_swap(reg)`. + let host = if reg_index == REG_CURRENT_CONTEXT_INDEX { + // Rotate mod context count on each read so a poll never sticks. + let prev = self.current_context_index.fetch_add(1, Ordering::Relaxed); + prev % XMA_CONTEXT_COUNT as u32 + } else { + self.registers.get(reg_index as usize).copied().unwrap_or(0) + }; + host.swap_bytes() + } + + /// Write a register, then apply the side-effect of the Kick / Lock / Clear + /// register groups. Each register in a group covers 32 contexts; bit N maps + /// to `context_id = (reg_index - group_base) * 32 + N`. We iterate set bits + /// with `trailing_zeros` + clear-lowest-bit, mirroring canary's + /// `std::countr_zero` loop. The incoming value is byte-swapped first (see + /// below). + pub fn write_register(&mut self, reg_index: u32, value: u32) { + // The guest writes the aperture byte-reversed (`stwbrx`); undo it so the + // register file holds host-order values, mirroring canary's + // `WriteRegister` which does `value = xe::byte_swap(value)` first. Proven + // by the guest's Clear writes (`0x01000000` == context 0, not 24). 
+ let value = value.swap_bytes(); + if let Some(slot) = self.registers.get_mut(reg_index as usize) { + *slot = value; + } + + if (REG_CONTEXT_KICK_BASE..REG_CONTEXT_KICK_BASE + CONTEXT_GROUP_LEN).contains(&reg_index) { + let base = (reg_index - REG_CONTEXT_KICK_BASE) * 32; + let mut bits = value; + while bits != 0 { + let b = bits.trailing_zeros(); + bits &= bits - 1; + let context_id = (base + b) as usize; + if context_id < XMA_CONTEXT_COUNT { + self.pending[context_id] = true; + self.enabled[context_id] = true; + self.kick_count += 1; + tracing::debug!( + context_id, + kick_count = self.kick_count, + "xma: kick (decode requested)" + ); + // Canary `!use_dedicated_xma_thread`: run Work() right here so + // the game observes the updated context when its kick store + // retires. Safe — `mem_ptr` is a stable whole-run mapping and + // we're on the CPU thread. + if !self.mem_ptr.is_null() { + let mem: &GuestMemory = unsafe { &*self.mem_ptr }; + self.enabled[context_id] = false; + self.work_one(mem, context_id); + } + } + } + } else if (REG_CONTEXT_LOCK_BASE..REG_CONTEXT_LOCK_BASE + CONTEXT_GROUP_LEN) + .contains(&reg_index) + { + let base = (reg_index - REG_CONTEXT_LOCK_BASE) * 32; + let mut bits = value; + while bits != 0 { + let b = bits.trailing_zeros(); + bits &= bits - 1; + let context_id = (base + b) as usize; + if context_id < XMA_CONTEXT_COUNT { + self.enabled[context_id] = false; + tracing::debug!(context_id, "xma: lock (context disabled)"); + } + } + } else if (REG_CONTEXT_CLEAR_BASE..REG_CONTEXT_CLEAR_BASE + CONTEXT_GROUP_LEN) + .contains(&reg_index) + { + let base = (reg_index - REG_CONTEXT_CLEAR_BASE) * 32; + let mut bits = value; + while bits != 0 { + let b = bits.trailing_zeros(); + bits &= bits - 1; + let context_id = (base + b) as usize; + if context_id < XMA_CONTEXT_COUNT { + self.pending[context_id] = false; + self.enabled[context_id] = false; + tracing::debug!(context_id, "xma: clear (context state reset)"); + } + } + } + } + + /// Total kicks observed so
far (diagnostic; stage 3 will consume `pending`). + pub fn kick_count(&self) -> u64 { + self.kick_count + } + + /// Whether context `i` has a pending (un-serviced) kick. Stage-3 hook. + pub fn is_pending(&self, i: usize) -> bool { + self.pending.get(i).copied().unwrap_or(false) + } + + /// Total PCM bytes the decoder has written to guest output buffers. + pub fn pcm_bytes_total(&self) -> u64 { + self.pcm_bytes_total + } + + /// Stage-3 entry point. Called once per scheduler round from the CPU + /// thread's per-round coordinator. For each context with a pending kick, + /// run one `Work()` pass (canary `XmaContextNew::Work`): read the context, + /// decode available input into PCM, drain into the output ring, and write + /// the decoder-owned fields back. Deterministic — no host thread, no clock. + pub fn decode_pending(&mut self, mem: &GuestMemory) { + if self.context_array_guest_va == 0 { + return; + } + for i in 0..XMA_CONTEXT_COUNT { + if !self.pending[i] || !self.enabled[i] { + continue; + } + // Canary `Work` clears is_enabled at entry; a fresh kick re-enables. + self.enabled[i] = false; + self.work_one(mem, i); + } + } + + /// One `Work()` pass for context `i`. Faithful to canary's orchestration but + /// uses the mainline xma2 decoder (whole-packet driven) for the actual + /// frame decode in place of canary's per-frame `Decode()`. + fn work_one(&mut self, mem: &GuestMemory, i: usize) { + let ctx_va = self.context_array_guest_va + (i as u32) * XMA_CONTEXT_SIZE; + let data = XmaContextData::read(mem, ctx_va); + let initial = data; + + if data.output_buffer_valid == 0 { + return; + } + + let mut data = data; + self.decode_into_output(mem, i, ctx_va, &mut data, &initial); + } + + /// Decode available input packets into PCM and drain into the output ring. 
+    ///
+    /// `data` is the working copy of the context that this pass mutates and
+    /// `initial` is the snapshot it was copied from; both are handed to
+    /// `store_merged_pub` for the write-back to `ctx_va`. Returns early without
+    /// any write-back when the output ring has zero capacity.
+    fn decode_into_output(
+        &mut self,
+        mem: &GuestMemory,
+        i: usize,
+        ctx_va: u32,
+        data: &mut XmaContextData,
+        initial: &XmaContextData,
+    ) {
+        use xma_decode::*;
+
+        // Ring geometry in bytes: the context stores block counts/offsets, each
+        // block being OUTPUT_BYTES_PER_BLOCK bytes.
+        let output_capacity = data.output_buffer_block_count * OUTPUT_BYTES_PER_BLOCK;
+        if output_capacity == 0 {
+            return;
+        }
+        let out_backing = xma_phys_to_backing(data.output_buffer_ptr);
+        let mut write_off = data.output_buffer_write_offset * OUTPUT_BYTES_PER_BLOCK;
+        let read_off = data.output_buffer_read_offset * OUTPUT_BYTES_PER_BLOCK;
+
+        // write_count: free space in the ring from write to read.
+        let free_bytes = ring_write_count(read_off, write_off, output_capacity);
+        self.decode_state[i].remaining_subframe_blocks_in_output =
+            (free_bytes / OUTPUT_BYTES_PER_BLOCK) as i32;
+
+        // A subframe_decode_count of 0 is treated as 1 so each pass can write
+        // at least one block.
+        let effective_sdc = data.subframe_decode_count.max(1);
+        // Minimum free blocks needed to proceed: one batch of subframes plus
+        // the configured padding.
+        let min_blocks = effective_sdc as i32 + data.output_buffer_padding as i32;
+
+        if min_blocks > self.decode_state[i].remaining_subframe_blocks_in_output {
+            // No room — write back unchanged and wait for the game to drain.
+            store_merged_pub(mem, ctx_va, data, initial);
+            return;
+        }
+
+        // Tracks whether this pass wrote any PCM; consulted after the loop.
+        let mut produced_any = false;
+
+        // Ensure codec configured for current rate/channels.
+        let rate = sample_rate_hz(data.sample_rate);
+        let channels = if data.is_stereo != 0 { 2 } else { 1 };
+        self.ensure_codec(i, rate, channels);
+
+        // Main decode loop: while there's output ring room and valid input.
+        loop {
+            if self.decode_state[i].remaining_subframe_blocks_in_output < min_blocks {
+                break;
+            }
+
+            // If we still have undrained subframes from a prior decode, consume
+            // them first (canary Consume before next Decode).
+            if self.decode_state[i].current_frame_remaining_subframes == 0 {
+                // Need a fresh decoded frame. Pull from the codec, feeding input
+                // packets as required.
+                if !self.produce_frame(mem, i, data) {
+                    break;
+                }
+            }
+
+            // Consume: write up to `effective_sdc` subframes (256B blocks) of
+            // the staged raw_frame into the output ring.
+            // Subframes in a whole staged frame: blocks per channel, doubled
+            // (shifted left by one) when `is_stereo` is 1.
+            let total_subframes =
+                ((BYTES_PER_FRAME_CHANNEL / OUTPUT_BYTES_PER_BLOCK) << data.is_stereo) as u8;
+            let remaining = self.decode_state[i].current_frame_remaining_subframes;
+            let to_write = remaining.min(effective_sdc as u8);
+            // Byte offset of the first not-yet-drained subframe in raw_frame.
+            let frame_read_off = (total_subframes - remaining) as usize * OUTPUT_BYTES_PER_BLOCK as usize;
+            let nbytes = to_write as u32 * OUTPUT_BYTES_PER_BLOCK;
+
+            // Write into the output ring (handle wrap).
+            let raw = &self.decode_state[i].raw_frame;
+            write_off = ring_write(
+                mem,
+                out_backing,
+                output_capacity,
+                write_off,
+                &raw[frame_read_off..frame_read_off + nbytes as usize],
+            );
+            self.pcm_bytes_total += nbytes as u64;
+            produced_any = true;
+
+            // Once this write finishes the staged frame, also charge the
+            // padding blocks against the remaining ring space.
+            let headroom = if remaining - to_write == 0 {
+                data.output_buffer_padding as i32
+            } else {
+                0
+            };
+            self.decode_state[i].remaining_subframe_blocks_in_output -=
+                to_write as i32 + headroom;
+            self.decode_state[i].current_frame_remaining_subframes -= to_write;
+        }
+
+        // Writeback offsets.
+        data.output_buffer_write_offset = write_off / OUTPUT_BYTES_PER_BLOCK;
+
+        // Clear the output-valid bit when no block budget is left and the write
+        // cursor has landed on the read cursor.
+        if self.decode_state[i].remaining_subframe_blocks_in_output == 0
+            && write_off == read_off
+        {
+            data.output_buffer_valid = 0;
+        }
+        // Also clear it when this pass produced nothing and neither input
+        // buffer is valid.
+        if !produced_any && !data.is_any_input_buffer_valid() {
+            data.output_buffer_valid = 0;
+        }
+
+        store_merged_pub(mem, ctx_va, data, initial);
+    }
+
+    /// Configure (or reconfigure) the FFmpeg xma2 codec for this context.
+    ///
+    /// A codec that already exists for the same `rate` and `channels` is
+    /// reused. On a failed build the slot is left empty and the recorded
+    /// rate/channels are not updated, so the next call tries again.
+    fn ensure_codec(&mut self, i: usize, rate: u32, channels: u32) {
+        let state = &mut self.decode_state[i];
+        let already_configured =
+            state.codec.is_some() && state.codec_rate == rate && state.codec_channels == channels;
+        if already_configured {
+            return;
+        }
+
+        match crate::xma2_codec::Xma2Codec::new(rate, channels) {
+            Err(e) => {
+                tracing::error!(ctx = i, rate, channels, error = %e, "xma: xma2 codec init failed");
+                state.codec = None;
+            }
+            Ok(fresh) => {
+                state.codec = Some(fresh);
+                state.codec_rate = rate;
+                state.codec_channels = channels;
+                tracing::info!(ctx = i, rate, channels, "xma: xma2 codec configured");
+            }
+        }
+    }
+
+    /// Produce one decoded 512-sample frame into `raw_frame` (interleaved S16BE).
+    ///
+    /// Input-consumption model (faithful to canary's packet/buffer contract).
+    ///
+    /// The mainline xma2 decoder consumes whole 2 KB packets via `send_packet`
+    /// and emits frames in bursts (internal FIFO + lookahead), so its intake
+    /// position can't be read per-frame. We therefore keep TWO cursors:
+    ///
+    /// 1. A private FFmpeg *feed* cursor (`feed_buffer`/`feed_packet_index`)
+    ///    that hands raw packets to FFmpeg only far enough ahead to keep the
+    ///    PCM queue stocked. This follows the same buffer ping-pong as the
+    ///    guest but is NOT what the guest observes.
+    /// 2. The guest-visible `input_buffer_read_offset`, advanced by exactly
+    ///    ONE compressed frame each time we emit a 512-sample frame to the
+    ///    guest — via `advance_read_offset_one_frame`, a faithful port of the
+    ///    offset arithmetic in canary's `Decode()`. This crosses packet and
+    ///    buffer boundaries (and fires SwapInputBuffer, clearing the drained
+    ///    buffer's valid bit) at canary's true per-frame cadence, which is
+    ///    what the WMV demuxer polls to refill ADV.wmv.
+    ///
+    /// Decoupling the two means FFmpeg's whole-packet burst framing no longer
+    /// freezes the guest-visible offset: the offset now tracks emitted output,
+    /// so the input buffer is consumed and swapped as the movie actually plays.
+ fn produce_frame(&mut self, mem: &GuestMemory, i: usize, data: &mut XmaContextData) -> bool { + use xma_decode::*; + let channels = if data.is_stereo != 0 { 2u32 } else { 1u32 }; + let frame_bytes = (BYTES_PER_FRAME_CHANNEL * channels) as usize; + + // Top up FFmpeg's internal FIFO (and our queue) just enough to satisfy + // one frame, feeding raw packets via the private feed cursor. + if self.decode_state[i].pcm_queue.len() < frame_bytes { + self.feed_codec(mem, i, data); + } + + // Pop exactly one 512-sample frame from the queue into raw_frame. + if self.decode_state[i].pcm_queue.len() < frame_bytes { + return false; + } + { + let st = &mut self.decode_state[i]; + st.raw_frame.iter_mut().for_each(|b| *b = 0); + for b in st.raw_frame[..frame_bytes].iter_mut() { + *b = st.pcm_queue.pop_front().unwrap(); + } + st.current_frame_remaining_subframes = (4u8) << data.is_stereo; + } + + // We just emitted one frame to the guest — advance its visible read + // offset by one compressed frame at canary's cadence (may swap buffer). + self.advance_read_offset_one_frame(mem, data); + true + } + + /// Feed raw 2 KB packets to FFmpeg from the private feed cursor until the + /// PCM queue holds at least one frame or the codec stops accepting input. + /// The feed cursor follows the guest's `current_buffer` ping-pong but keeps + /// its own packet index (`feed_packet_index`), so feeding ahead of the + /// guest-visible read offset is fine — the offset advances separately per + /// emitted frame. + fn feed_codec(&mut self, mem: &GuestMemory, i: usize, data: &XmaContextData) { + use xma_decode::*; + let channels = if data.is_stereo != 0 { 2u32 } else { 1u32 }; + let frame_bytes = (BYTES_PER_FRAME_CHANNEL * channels) as usize; + + // Re-sync the feed buffer to the guest's current buffer if the guest has + // swapped past us (the buffer we were feeding was consumed). 
+ if self.decode_state[i].feed_buffer != data.current_buffer + && !data.is_input_buffer_valid(self.decode_state[i].feed_buffer) + { + self.decode_state[i].feed_buffer = data.current_buffer; + self.decode_state[i].feed_packet_index = 0; + } + + const MAX_FEED: u32 = 8; + let mut fed = 0u32; + while self.decode_state[i].pcm_queue.len() < frame_bytes && fed < MAX_FEED { + let fb = self.decode_state[i].feed_buffer; + if !data.is_input_buffer_valid(fb) { + // Nothing to feed from this buffer; try the other if valid. + let other = fb ^ 1; + if data.is_input_buffer_valid(other) { + self.decode_state[i].feed_buffer = other; + self.decode_state[i].feed_packet_index = 0; + continue; + } + break; + } + let pkt_count = data.input_buffer_packet_count(fb); + let pidx = self.decode_state[i].feed_packet_index; + if pidx >= pkt_count { + // Exhausted this buffer's packets at the feed cursor; advance to + // the other buffer if it's valid (it was refilled), else wait. + let other = fb ^ 1; + if data.is_input_buffer_valid(other) { + self.decode_state[i].feed_buffer = other; + self.decode_state[i].feed_packet_index = 0; + continue; + } + break; + } + let backing = xma_phys_to_backing(data.input_buffer_address(fb)); + let pkt_va = backing + pidx * BYTES_PER_PACKET; + let mut packet = vec![0u8; BYTES_PER_PACKET as usize]; + mem.read_bytes(pkt_va, &mut packet); + let send_res = match self.decode_state[i].codec.as_mut() { + Some(codec) => codec.send_packet(&packet), + None => break, + }; + match send_res { + Ok(()) => { + self.decode_state[i].feed_packet_index += 1; + fed += 1; + self.drain_codec_frames(i); + } + // Decoder full — drain what it has and stop; re-offer this same + // packet next time (don't advance the feed cursor). 
+ Err(ref e) if e == "EAGAIN" => { + self.drain_codec_frames(i); + break; + } + Err(e) => { + tracing::warn!(ctx = i, error = %e, "xma: send_packet failed"); + break; + } + } + } + } + + /// Pull all currently-available decoded frames from the codec and append + /// their interleaved S16BE PCM to the context's queue. + fn drain_codec_frames(&mut self, i: usize) { + loop { + let out = match self.decode_state[i].codec.as_mut() { + Some(c) => c.receive_frame(), + None => None, + }; + let Some((nb, bytes)) = out else { break }; + let st = &mut self.decode_state[i]; + st.frames_decoded += 1; + if !st.first_frame_logged { + st.first_frame_logged = true; + tracing::info!( + ctx = i, + samples = nb, + pcm_bytes = bytes.len(), + "xma: first PCM frame decoded" + ); + } + st.pcm_queue.extend(bytes); + } + } + + /// Advance `input_buffer_read_offset` by exactly ONE compressed frame, + /// faithfully mirroring the offset arithmetic in canary's + /// `XmaContextNew::Decode` (frame-size parse + packet-boundary handling + + /// SwapInputBuffer when the buffer's packets are exhausted). Called once per + /// 512-sample frame we emit to the guest, so the guest-visible read offset + /// crosses packet/buffer boundaries at canary's true cadence — independent + /// of the mainline xma2 decoder's whole-packet burst framing. This is what + /// lets `input_buffer_0_valid` toggle and the WMV demuxer refill ADV.wmv. + fn advance_read_offset_one_frame(&mut self, mem: &GuestMemory, data: &mut XmaContextData) { + use xma_decode::*; + + if !data.is_any_input_buffer_valid() { + return; + } + if !data.is_current_input_buffer_valid() { + self.swap_input_buffer(data); + if !data.is_current_input_buffer_valid() { + return; + } + } + + // Clamp a header-region offset (canary's Dirt-2 guard). 
+ if data.input_buffer_read_offset < BITS_PER_PACKET_HEADER { + data.input_buffer_read_offset = BITS_PER_PACKET_HEADER; + } + + let pkt_count = data.current_input_buffer_packet_count(); + let input_size = pkt_count * BYTES_PER_PACKET; + let Some(packet_index) = packet_number(input_size, data.input_buffer_read_offset) else { + return; + }; + let buf_backing = xma_phys_to_backing(data.current_input_buffer_address()); + let pkt_va = buf_backing + packet_index * BYTES_PER_PACKET; + let mut packet = vec![0u8; BYTES_PER_PACKET as usize]; + mem.read_bytes(pkt_va, &mut packet); + + let first_frame_offset = packet_frame_offset(&packet); + let mut relative_offset = data.input_buffer_read_offset % BITS_PER_PACKET; + if relative_offset < first_frame_offset { + // Tail of a split frame — skip to this packet's first frame. + data.input_buffer_read_offset = + packet_index * BITS_PER_PACKET + first_frame_offset; + relative_offset = first_frame_offset; + } + + let skip_count = packet_skip_count(&packet); + // Full-packet skip (0xFF): no frames begin here — advance to the next + // packet that does, swapping the buffer if exhausted. + if skip_count == 0xFF { + let next_packet_index = packet_index + 1; + let next_off = + self.next_packet_read_offset(mem, data, next_packet_index, pkt_count); + if next_packet_index >= pkt_count || next_off == BITS_PER_PACKET_HEADER { + self.swap_input_buffer(data); + } + data.input_buffer_read_offset = next_off; + return; + } + + let info = get_packet_info(&packet, relative_offset); + let packet_to_skip = (skip_count as u32) + 1; + let next_packet_index = packet_index + packet_to_skip; + + // Frame size: clamp to the bits remaining in the packet stream (canary + // GetAmountOfBitsToRead over the (packet_index+1)*kBitsPerPacket stream). 
+ let stream_remaining = + ((packet_index + 1) * BITS_PER_PACKET).saturating_sub(data.input_buffer_read_offset); + let frame_size = if info.current_frame_size == 0 { + // Split header we can't resolve from this packet alone; fall back to + // advancing past the rest of this packet so we don't stall. + stream_remaining + } else { + info.current_frame_size + }; + let bits_to_copy = amount_of_bits_to_read(stream_remaining, frame_size); + + if !info.is_last_frame_in_packet() { + let next_frame_offset = + (data.input_buffer_read_offset + bits_to_copy) % BITS_PER_PACKET; + data.input_buffer_read_offset = + packet_index * BITS_PER_PACKET + next_frame_offset; + return; + } + + // Last frame in this packet: move to the next packet's first frame, or + // swap the input buffer if the packets are exhausted (canary's + // `next_packet_index >= current_input_packet_count`). + let mut next_off = + self.next_packet_read_offset(mem, data, next_packet_index, pkt_count); + if next_packet_index >= pkt_count || next_off == BITS_PER_PACKET_HEADER { + self.swap_input_buffer(data); + } + if next_off == BITS_PER_PACKET_HEADER && data.is_any_input_buffer_valid() { + // At the start of the next buffer: jump to its first frame offset. + let nb_backing = xma_phys_to_backing(data.current_input_buffer_address()); + let mut hdr = [0u8; 4]; + mem.read_bytes(nb_backing, &mut hdr); + let fo = packet_frame_offset(&hdr); + if fo <= MAX_FRAME_SIZE_IN_BITS { + next_off = fo; + } + } + data.input_buffer_read_offset = next_off; + } + + /// Scan forward from `next_packet_index` (possibly into the *next* buffer) + /// for the next packet that begins a frame and return its bit offset, or + /// `BITS_PER_PACKET_HEADER` if none (canary `GetNextPacketReadOffset`). + fn next_packet_read_offset( + &self, + mem: &GuestMemory, + data: &XmaContextData, + next_packet_index: u32, + current_input_packet_count: u32, + ) -> u32 { + use xma_decode::*; + // Resolve which buffer the packet lives in (current or the other). 
+ let (buffer_index, mut pidx) = if next_packet_index >= current_input_packet_count { + (data.current_buffer ^ 1, next_packet_index - current_input_packet_count) + } else { + (data.current_buffer, next_packet_index) + }; + if !data.is_input_buffer_valid(buffer_index) { + return BITS_PER_PACKET_HEADER; + } + let addr = data.input_buffer_address(buffer_index); + if addr == 0 { + return BITS_PER_PACKET_HEADER; + } + let pkt_count = data.input_buffer_packet_count(buffer_index); + let backing = xma_phys_to_backing(addr); + while pidx < pkt_count { + let mut hdr = [0u8; 4]; + mem.read_bytes(backing + pidx * BYTES_PER_PACKET, &mut hdr); + let fo = packet_frame_offset(&hdr); + if fo <= MAX_FRAME_SIZE_IN_BITS { + return pidx * BITS_PER_PACKET + fo; + } + pidx += 1; + } + BITS_PER_PACKET_HEADER + } + + fn swap_input_buffer(&mut self, data: &mut XmaContextData) { + use xma_decode::*; + tracing::debug!( + from = data.current_buffer, + to = data.current_buffer ^ 1, + "xma: SwapInputBuffer (input buffer consumed)" + ); + if data.current_buffer == 0 { + data.input_buffer_0_valid = 0; + } else { + data.input_buffer_1_valid = 0; + } + data.current_buffer ^= 1; + data.input_buffer_read_offset = BITS_PER_PACKET_HEADER; + } +} + +impl Default for XmaDecoder { + fn default() -> Self { + Self::new() + } +} + +/// Build the [`MmioRegion`] for the XMA register aperture at `0x7FEA0000`. +/// Mirrors the GPU's `build_region`: the closures lock the shared decoder, +/// compute the dword register index, and dispatch to `read`/`write_register`. 
+pub fn build_mmio_region(dec: Arc>) -> MmioRegion { + let read_dec = dec.clone(); + let write_dec = dec; + + MmioRegion { + base_address: APERTURE_BASE, + mask: APERTURE_MASK, + size: APERTURE_SIZE, + read_callback: Box::new(move |addr: u32| { + let reg_index = (addr & 0xFFFF) / 4; + read_dec.lock().unwrap().read_register(reg_index) + }), + write_callback: Box::new(move |addr: u32, value: u32| { + let reg_index = (addr & 0xFFFF) / 4; + write_dec.lock().unwrap().write_register(reg_index, value); + }), + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn inited() -> XmaDecoder { + let mut d = XmaDecoder::new(); + // Pick a plausible physical-window VA/phys pair. + d.init(0xA010_0000, 0x0010_0000); + d + } + + /// The guest writes/reads the aperture byte-reversed; `wire(v)` is the raw + /// bus value the guest sends to mean host-order `v` (and what a read of a + /// host-order `v` returns). Equivalent to `lwbrx`/`stwbrx` semantics. + fn wire(v: u32) -> u32 { + v.swap_bytes() + } + + /// (a) `allocate_context` hands back distinct, increasing pointers spaced by + /// the 64-byte stride, exhausts at 320, and `release_context` frees the slot. + #[test] + fn allocate_distinct_then_exhaust_then_release() { + let mut d = inited(); + let first = d.allocate_context(); + let second = d.allocate_context(); + assert_eq!(first, 0xA010_0000); + assert_eq!(second, 0xA010_0000 + XMA_CONTEXT_SIZE); + assert!(second > first); + + // Drain the remaining slots (2 already taken). + for _ in 0..(XMA_CONTEXT_COUNT - 2) { + assert_ne!(d.allocate_context(), 0); + } + // 321st allocation fails. + assert_eq!(d.allocate_context(), 0); + + // Free the first slot and re-acquire it. + d.release_context(first); + assert_eq!(d.allocate_context(), first); + } + + /// (b) A Kick to `Context0Kick` with host value `0b101` marks contexts 0 + /// and 2. The guest sends it byte-reversed (`wire`). 
+ #[test] + fn kick_context0_marks_correct_contexts() { + let mut d = inited(); + d.write_register(REG_CONTEXT_KICK_BASE, wire(0b101)); + assert!(d.is_pending(0)); + assert!(!d.is_pending(1)); + assert!(d.is_pending(2)); + assert_eq!(d.kick_count(), 2); + } + + /// (c) A Kick to `Context1Kick` (0x651) bit 0 maps to context_id 32. + #[test] + fn kick_context1_bit0_is_context_32() { + let mut d = inited(); + d.write_register(REG_CONTEXT_KICK_BASE + 1, wire(0b1)); + assert!(d.is_pending(32)); + assert!(!d.is_pending(0)); + assert_eq!(d.kick_count(), 1); + } + + /// Regression for the byte-order fix: the guest's real Clear writes were + /// `0x01000000`/`0x02000000`/`0x04000000` (bytes-reversed `1`/`2`/`4`), + /// meaning contexts 0/1/2 — NOT 24/25/26. Verify the raw bus values decode + /// to the low contexts. + #[test] + fn byte_reversed_clear_targets_low_contexts() { + let mut d = inited(); + for i in 0..3 { + d.write_register(REG_CONTEXT_KICK_BASE, wire(1 << i)); + } + assert!(d.is_pending(0) && d.is_pending(1) && d.is_pending(2)); + // The exact bus values observed from the guest. + d.write_register(REG_CONTEXT_CLEAR_BASE, 0x0100_0000); + d.write_register(REG_CONTEXT_CLEAR_BASE, 0x0200_0000); + d.write_register(REG_CONTEXT_CLEAR_BASE, 0x0400_0000); + assert!(!d.is_pending(0) && !d.is_pending(1) && !d.is_pending(2)); + } + + /// (d) `read_register(0x600)` returns the base byte-reversed (the guest + /// `lwbrx`-reverses it back to the host-order base on its side). + #[test] + fn context_array_address_reads_phys() { + let d = inited(); + assert_eq!( + d.read_register(REG_CONTEXT_ARRAY_ADDRESS), + wire(0x0010_0000) + ); + } + + /// (e) `CurrentContextIndex` rotates on each read and wraps at the count + /// (values returned byte-reversed). 
+ #[test] + fn current_context_index_rotates() { + let d = inited(); + assert_eq!(d.read_register(REG_CURRENT_CONTEXT_INDEX), wire(0)); + assert_eq!(d.read_register(REG_CURRENT_CONTEXT_INDEX), wire(1)); + assert_eq!(d.read_register(REG_CURRENT_CONTEXT_INDEX), wire(2)); + // Advance to the wrap boundary. + for _ in 3..XMA_CONTEXT_COUNT as u32 { + d.read_register(REG_CURRENT_CONTEXT_INDEX); + } + // Next read wraps back to 0. + assert_eq!(d.read_register(REG_CURRENT_CONTEXT_INDEX), wire(0)); + } + + /// Clear must drop a previously-kicked pending flag. + #[test] + fn clear_resets_pending() { + let mut d = inited(); + d.write_register(REG_CONTEXT_KICK_BASE, wire(0b1)); + assert!(d.is_pending(0)); + d.write_register(REG_CONTEXT_CLEAR_BASE, wire(0b1)); + assert!(!d.is_pending(0)); + } + + /// The MMIO region routes a guest write at `BASE + 0x600*4` to reg 0x600 + /// and a read back through the same byte address, applying the byte swap. + #[test] + fn mmio_region_round_trips_register() { + let dec = Arc::new(Mutex::new(inited())); + let region = build_mmio_region(dec.clone()); + let kick_byte = APERTURE_BASE + REG_CONTEXT_KICK_BASE * 4; + (region.write_callback)(kick_byte, wire(0b1)); + assert!(dec.lock().unwrap().is_pending(0)); + // ContextArrayAddress read-back via the bus (byte-reversed). + let addr_byte = APERTURE_BASE + REG_CONTEXT_ARRAY_ADDRESS * 4; + assert_eq!((region.read_callback)(addr_byte), wire(0x0010_0000)); + } +} diff --git a/crates/xenia-apu/src/xma2_codec.rs b/crates/xenia-apu/src/xma2_codec.rs new file mode 100644 index 0000000..ad93b2c --- /dev/null +++ b/crates/xenia-apu/src/xma2_codec.rs @@ -0,0 +1,217 @@ +//! Thin unsafe wrapper around the mainline FFmpeg `AV_CODEC_ID_XMA2` decoder. +//! +//! Unlike canary's vendored `XMAFRAMES` (one frame per packet, custom padding +//! header), the distro xma2 decoder consumes whole 2 KB XMA2 packets +//! (`block_align == 2048`), needs `extradata` declaring the channel/stream +//! 
layout, and buffers samples internally across packets. We drive it with the +//! guest's raw 2 KB packets and pull whatever 512-sample float-planar frames it +//! emits, returning them as interleaved S16 big-endian PCM (canary `ConvertFrame`). + +use std::os::raw::c_int; +use std::ptr; + +use ffmpeg_sys_next as ff; + +/// One xma2 decoder instance, configured for a fixed (sample_rate, channels). +pub struct Xma2Codec { + codec: *const ff::AVCodec, + ctx: *mut ff::AVCodecContext, + frame: *mut ff::AVFrame, + packet: *mut ff::AVPacket, + extradata: Vec, + channels: u32, +} + +// FFmpeg objects are not Send/Sync by default; the decoder is only ever touched +// on the CPU scheduler thread (decode_pending), so this is sound for our use. +unsafe impl Send for Xma2Codec {} + +impl Xma2Codec { + /// Build XMA2WAVEFORMATEX extradata (34 bytes) for a single XMA2 stream. + /// Layout (little-endian, per FFmpeg `xma_decode_init` / xma2defs.h): + /// [0..2] NumStreams (u16) = 1 + /// [2..6] ChannelMask (u32) = mono/stereo mask + /// [6..34] remaining XMA2WAVEFORMATEX fields (unused by the decoder) + fn build_extradata(channels: u32) -> Vec { + let mut e = vec![0u8; 34]; + // NumStreams = 1 + e[0..2].copy_from_slice(&1u16.to_le_bytes()); + // ChannelMask: 0x3 (FL|FR) for stereo, 0x4 (FC) for mono. + let mask: u32 = if channels >= 2 { 0x3 } else { 0x4 }; + e[2..6].copy_from_slice(&mask.to_le_bytes()); + e + } + + pub fn new(sample_rate: u32, channels: u32) -> Result { + unsafe { + let codec = ff::avcodec_find_decoder(ff::AVCodecID::AV_CODEC_ID_XMA2); + if codec.is_null() { + return Err("xma2 decoder not found in libavcodec".into()); + } + let ctx = ff::avcodec_alloc_context3(codec); + if ctx.is_null() { + return Err("avcodec_alloc_context3 failed".into()); + } + + let mut extradata = Self::build_extradata(channels); + // FFmpeg requires extradata to be allocated with av_malloc and + // padded; copy our bytes into an av_malloc'd buffer. 
+ let pad = ff::AV_INPUT_BUFFER_PADDING_SIZE as usize; + let raw = ff::av_mallocz(extradata.len() + pad) as *mut u8; + if raw.is_null() { + ff::avcodec_free_context(&mut (ctx as *mut _)); + return Err("av_mallocz extradata failed".into()); + } + ptr::copy_nonoverlapping(extradata.as_ptr(), raw, extradata.len()); + (*ctx).extradata = raw; + (*ctx).extradata_size = extradata.len() as c_int; + + (*ctx).sample_rate = sample_rate as c_int; + (*ctx).block_align = 2048; + ff::av_channel_layout_default(&mut (*ctx).ch_layout, channels as c_int); + + let ret = ff::avcodec_open2(ctx, codec, ptr::null_mut()); + if ret < 0 { + let mut ctxm = ctx; + ff::avcodec_free_context(&mut ctxm); + return Err(format!("avcodec_open2 failed: {}", av_err(ret))); + } + + let frame = ff::av_frame_alloc(); + let packet = ff::av_packet_alloc(); + if frame.is_null() || packet.is_null() { + let mut ctxm = ctx; + ff::avcodec_free_context(&mut ctxm); + return Err("av_frame_alloc/av_packet_alloc failed".into()); + } + + // keep our Vec alive as the source of truth for length + extradata.shrink_to_fit(); + + Ok(Self { + codec, + ctx, + frame, + packet, + extradata, + channels, + }) + } + } + + pub fn channels(&self) -> u32 { + self.channels + } + + /// Feed one raw 2 KB XMA2 packet (header + data) to the decoder. Returns the + /// number of bytes the decoder accepted (0 = buffered, needs no new packet + /// yet / EAGAIN). Decoded frames are pulled via [`receive_frame`]. + pub fn send_packet(&mut self, packet: &[u8]) -> Result<(), String> { + unsafe { + // av_packet_from_data takes ownership of an av_malloc buffer; simpler + // to point at our own bytes via a stack packet with a padded copy. 
+ let pad = ff::AV_INPUT_BUFFER_PADDING_SIZE as usize; + let buf = ff::av_malloc(packet.len() + pad) as *mut u8; + if buf.is_null() { + return Err("av_malloc packet failed".into()); + } + ptr::copy_nonoverlapping(packet.as_ptr(), buf, packet.len()); + ptr::write_bytes(buf.add(packet.len()), 0, pad); + ff::av_packet_unref(self.packet); + // Wrap buf so FFmpeg frees it. + let ret = ff::av_packet_from_data(self.packet, buf, packet.len() as c_int); + if ret < 0 { + ff::av_free(buf as *mut _); + return Err(format!("av_packet_from_data failed: {}", av_err(ret))); + } + let ret = ff::avcodec_send_packet(self.ctx, self.packet); + if ret == ff::AVERROR(ff::EAGAIN) { + // Decoder full — caller should drain frames first then retry. + return Err("EAGAIN".into()); + } + if ret < 0 { + return Err(format!("avcodec_send_packet failed: {}", av_err(ret))); + } + Ok(()) + } + } + + /// Signal end-of-stream so the decoder flushes its internal FIFO. + pub fn send_eof(&mut self) { + unsafe { + let _ = ff::avcodec_send_packet(self.ctx, ptr::null()); + } + } + + /// Pull one decoded frame as interleaved S16 big-endian PCM, or None if the + /// decoder needs more input (EAGAIN) or is drained (EOF). Returns + /// (samples_per_channel, interleaved_s16be_bytes). 
+ pub fn receive_frame(&mut self) -> Option<(u32, Vec)> { + unsafe { + let ret = ff::avcodec_receive_frame(self.ctx, self.frame); + if ret < 0 { + return None; + } + let nb = (*self.frame).nb_samples as u32; + if nb == 0 { + return None; + } + let ch = (*self.frame).ch_layout.nb_channels.max(1) as u32; + let out = convert_frame_planar_to_s16be(self.frame, ch, nb); + Some((nb, out)) + } + } +} + +impl Drop for Xma2Codec { + fn drop(&mut self) { + unsafe { + if !self.frame.is_null() { + ff::av_frame_free(&mut self.frame); + } + if !self.packet.is_null() { + ff::av_packet_free(&mut self.packet); + } + if !self.ctx.is_null() { + ff::avcodec_free_context(&mut self.ctx); + } + let _ = &self.codec; + let _ = &self.extradata; + } + } +} + +/// Convert FFmpeg planar-float output to interleaved S16 big-endian PCM +/// (faithful to canary `XmaContext::ConvertFrame`: saturate to [-1,1], scale by +/// 2^15-1, byte-swap each sample). `channels` planes of `nb_samples` floats. +unsafe fn convert_frame_planar_to_s16be( + frame: *mut ff::AVFrame, + channels: u32, + nb_samples: u32, +) -> Vec { + const SCALE: f32 = ((1i32 << 15) - 1) as f32; + let mut out = Vec::with_capacity((nb_samples * channels * 2) as usize); + unsafe { + // extended_data[ch] points to a plane of f32 (AV_SAMPLE_FMT_FLTP). 
+ let ext = (*frame).extended_data; + for i in 0..nb_samples as isize { + for ch in 0..channels as isize { + let plane = *ext.offset(ch) as *const f32; + let s = if plane.is_null() { 0.0 } else { *plane.offset(i) }; + let clamped = s.clamp(-1.0, 1.0) * SCALE; + let v = clamped as i16; + out.extend_from_slice(&v.to_be_bytes()); + } + } + } + out +} + +fn av_err(code: c_int) -> String { + unsafe { + let mut buf = [0i8; ff::AV_ERROR_MAX_STRING_SIZE as usize]; + ff::av_strerror(code, buf.as_mut_ptr(), buf.len()); + let cstr = std::ffi::CStr::from_ptr(buf.as_ptr()); + cstr.to_string_lossy().into_owned() + } +} diff --git a/crates/xenia-apu/src/xma_decode.rs b/crates/xenia-apu/src/xma_decode.rs new file mode 100644 index 0000000..aa839ea --- /dev/null +++ b/crates/xenia-apu/src/xma_decode.rs @@ -0,0 +1,690 @@ +//! Stage 3 — the real XMA2→PCM decoder. +//! +//! A faithful port of xenia-canary's `apu/xma_context_new.cc` decode pipeline +//! (`Work`/`Decode`/`Consume`/`StoreContextMerged`), adapted to the *mainline* +//! distro FFmpeg `AV_CODEC_ID_XMA2` decoder rather than canary's vendored +//! `AV_CODEC_ID_XMAFRAMES`. +//! +//! ## Determinism +//! There is no host decoder thread. [`super::xma::XmaDecoder::decode_pending`] +//! is invoked from the CPU scheduler's per-round coordinator +//! (`coord_post_round` in xenia-app). FFmpeg decode is itself deterministic +//! (same input bytes → same PCM), so the lockstep golden stays reproducible. +//! +//! ## FFmpeg framing — why this differs from canary +//! Canary feeds FFmpeg one *frame* at a time (it bit-extracts a single 512- +//! sample frame from the guest packet stream and hands it to the vendored +//! `XMAFRAMES` codec with a custom 1-byte padding header). The mainline +//! `xma2` decoder does NOT have `XMAFRAMES`; instead it consumes whole 2 KB +//! XMA2 *packets* (`block_align == 2048`), needs `extradata` declaring the +//! stream/channel layout, and manages frame splitting + a per-stream sample +//! FIFO internally. 
So this module keeps canary's *guest-facing* contract +//! (the `XMA_CONTEXT_DATA` packet/frame bookkeeping, the 256-byte-block output +//! ring buffer, the field writeback) but replaces canary's per-frame +//! `Decode()` body with: feed the current 2 KB packet to the xma2 decoder, +//! pull any 512-sample PCM frames it emits, convert them to interleaved S16BE, +//! and stage them as the "raw frame" that `Consume()` drains into the output +//! ring. +//! +//! See `xma2_codec.rs` for the unsafe FFmpeg wrapper. + +use std::collections::VecDeque; + +use xenia_memory::access::MemoryAccess; +use xenia_memory::GuestMemory; + +use crate::xma2_codec::Xma2Codec; + +// ---- Constants (canary `XmaContext` / `XmaContextNew`). + +pub const BYTES_PER_PACKET: u32 = 2048; +pub const BYTES_PER_PACKET_HEADER: u32 = 4; +pub const BYTES_PER_PACKET_DATA: u32 = BYTES_PER_PACKET - BYTES_PER_PACKET_HEADER; +pub const BITS_PER_PACKET: u32 = BYTES_PER_PACKET * 8; +/// Canary `kBitsPerPacketHeader` (in the *new* context) is 32. +pub const BITS_PER_PACKET_HEADER: u32 = 32; +pub const BITS_PER_FRAME_HEADER: u32 = 15; + +pub const SAMPLES_PER_FRAME: u32 = 512; +pub const BYTES_PER_SAMPLE: u32 = 2; +pub const BYTES_PER_FRAME_CHANNEL: u32 = SAMPLES_PER_FRAME * BYTES_PER_SAMPLE; // 1024 +pub const OUTPUT_BYTES_PER_BLOCK: u32 = 256; +pub const OUTPUT_MAX_SIZE_BYTES: u32 = 31 * OUTPUT_BYTES_PER_BLOCK; + +pub const MAX_FRAME_LENGTH: u32 = 0x7FFF; +pub const MAX_FRAME_SIZE_IN_BITS: u32 = 0x4000 - BITS_PER_PACKET_HEADER; + +const ID_TO_SAMPLE_RATE: [u32; 4] = [24000, 32000, 44100, 48000]; + +/// Project a bare-physical XMA buffer pointer (`0x0xxxxxxx`) to the host-backed +/// guest VA used by the rest of the emulator. Identical formula to +/// `xenia_gpu::physical_to_backing` for the physical window; the input/output +/// buffer pointers in the context are always in the low physical window. 
+#[inline] +pub fn xma_phys_to_backing(p: u32) -> u32 { + 0x4000_0000 | (p & 0x1FFF_FFFF) +} + +// ---- XMA_CONTEXT_DATA (canary `xma_context.h`, 64 bytes, 16 dwords). +// +// Stored big-endian in guest memory. We load all 16 dwords (BE) and unpack the +// bitfields exactly per the canary layout (bitfields pack LSB-first within each +// host-order dword). All fields below are kept as plain integers. + +#[derive(Clone, Copy, Debug, Default)] +pub struct XmaContextData { + // DWORD 0 + pub input_buffer_0_packet_count: u32, // :12 + pub loop_count: u32, // :8 + pub input_buffer_0_valid: u32, // :1 + pub input_buffer_1_valid: u32, // :1 + pub output_buffer_block_count: u32, // :5 + pub output_buffer_write_offset: u32, // :5 + // DWORD 1 + pub input_buffer_1_packet_count: u32, // :12 + pub loop_subframe_start: u32, // :2 + pub loop_subframe_end: u32, // :3 + pub loop_subframe_skip: u32, // :3 + pub subframe_decode_count: u32, // :4 + pub output_buffer_padding: u32, // :3 + pub sample_rate: u32, // :2 + pub is_stereo: u32, // :1 + pub unk_dword_1_c: u32, // :1 + pub output_buffer_valid: u32, // :1 + // DWORD 2 + pub input_buffer_read_offset: u32, // :26 + pub error_status: u32, // :5 + pub error_set: u32, // :1 + // DWORD 3 + pub loop_start: u32, // :26 + pub parser_error_status: u32, // :5 + pub parser_error_set: u32, // :1 + // DWORD 4 + pub loop_end: u32, // :26 + pub packet_metadata: u32, // :5 + pub current_buffer: u32, // :1 + // DWORD 5..8 + pub input_buffer_0_ptr: u32, + pub input_buffer_1_ptr: u32, + pub output_buffer_ptr: u32, + pub work_buffer_ptr: u32, + // DWORD 9 + pub output_buffer_read_offset: u32, // :5 + pub stop_when_done: u32, // :1 (bit 30) + pub interrupt_when_done: u32, // :1 (bit 31) +} + +#[inline] +fn bits(v: u32, shift: u32, width: u32) -> u32 { + (v >> shift) & ((1u32 << width) - 1) +} + +impl XmaContextData { + /// Read the 64-byte context struct from guest VA `ctx_va` (already a VA, + /// not a physical ptr). 
Each dword is read big-endian via `read_u32`. + pub fn read(mem: &GuestMemory, ctx_va: u32) -> Self { + let mut d = [0u32; 16]; + for (i, w) in d.iter_mut().enumerate() { + *w = mem.read_u32(ctx_va + (i as u32) * 4); + } + let mut c = Self::default(); + // DWORD 0 + c.input_buffer_0_packet_count = bits(d[0], 0, 12); + c.loop_count = bits(d[0], 12, 8); + c.input_buffer_0_valid = bits(d[0], 20, 1); + c.input_buffer_1_valid = bits(d[0], 21, 1); + c.output_buffer_block_count = bits(d[0], 22, 5); + c.output_buffer_write_offset = bits(d[0], 27, 5); + // DWORD 1 + c.input_buffer_1_packet_count = bits(d[1], 0, 12); + c.loop_subframe_start = bits(d[1], 12, 2); + c.loop_subframe_end = bits(d[1], 14, 3); + c.loop_subframe_skip = bits(d[1], 17, 3); + c.subframe_decode_count = bits(d[1], 20, 4); + c.output_buffer_padding = bits(d[1], 24, 3); + c.sample_rate = bits(d[1], 27, 2); + c.is_stereo = bits(d[1], 29, 1); + c.unk_dword_1_c = bits(d[1], 30, 1); + c.output_buffer_valid = bits(d[1], 31, 1); + // DWORD 2 + c.input_buffer_read_offset = bits(d[2], 0, 26); + c.error_status = bits(d[2], 26, 5); + c.error_set = bits(d[2], 31, 1); + // DWORD 3 + c.loop_start = bits(d[3], 0, 26); + c.parser_error_status = bits(d[3], 26, 5); + c.parser_error_set = bits(d[3], 31, 1); + // DWORD 4 + c.loop_end = bits(d[4], 0, 26); + c.packet_metadata = bits(d[4], 26, 5); + c.current_buffer = bits(d[4], 31, 1); + // DWORD 5..8 + c.input_buffer_0_ptr = d[5]; + c.input_buffer_1_ptr = d[6]; + c.output_buffer_ptr = d[7]; + c.work_buffer_ptr = d[8]; + // DWORD 9 + c.output_buffer_read_offset = bits(d[9], 0, 5); + c.stop_when_done = bits(d[9], 30, 1); + c.interrupt_when_done = bits(d[9], 31, 1); + c + } + + /// Repack the bitfields back into the 16 dwords (host order). Only the + /// decoder-owned fields differ from what was read; callers use + /// [`store_merged`] to write back without clobbering game-owned fields. 
+ fn pack(&self) -> [u32; 16] { + let mut d = [0u32; 16]; + d[0] = (self.input_buffer_0_packet_count & 0xFFF) + | ((self.loop_count & 0xFF) << 12) + | ((self.input_buffer_0_valid & 1) << 20) + | ((self.input_buffer_1_valid & 1) << 21) + | ((self.output_buffer_block_count & 0x1F) << 22) + | ((self.output_buffer_write_offset & 0x1F) << 27); + d[1] = (self.input_buffer_1_packet_count & 0xFFF) + | ((self.loop_subframe_start & 0x3) << 12) + | ((self.loop_subframe_end & 0x7) << 14) + | ((self.loop_subframe_skip & 0x7) << 17) + | ((self.subframe_decode_count & 0xF) << 20) + | ((self.output_buffer_padding & 0x7) << 24) + | ((self.sample_rate & 0x3) << 27) + | ((self.is_stereo & 1) << 29) + | ((self.unk_dword_1_c & 1) << 30) + | ((self.output_buffer_valid & 1) << 31); + d[2] = (self.input_buffer_read_offset & 0x3FF_FFFF) + | ((self.error_status & 0x1F) << 26) + | ((self.error_set & 1) << 31); + d[3] = (self.loop_start & 0x3FF_FFFF) + | ((self.parser_error_status & 0x1F) << 26) + | ((self.parser_error_set & 1) << 31); + d[4] = (self.loop_end & 0x3FF_FFFF) + | ((self.packet_metadata & 0x1F) << 26) + | ((self.current_buffer & 1) << 31); + d[5] = self.input_buffer_0_ptr; + d[6] = self.input_buffer_1_ptr; + d[7] = self.output_buffer_ptr; + d[8] = self.work_buffer_ptr; + d[9] = (self.output_buffer_read_offset & 0x1F) + | ((self.stop_when_done & 1) << 30) + | ((self.interrupt_when_done & 1) << 31); + d + } + + pub fn is_input_buffer_valid(&self, idx: u32) -> bool { + if idx == 0 { + self.input_buffer_0_valid != 0 + } else { + self.input_buffer_1_valid != 0 + } + } + pub fn is_current_input_buffer_valid(&self) -> bool { + self.is_input_buffer_valid(self.current_buffer) + } + pub fn is_any_input_buffer_valid(&self) -> bool { + self.input_buffer_0_valid != 0 || self.input_buffer_1_valid != 0 + } + pub fn input_buffer_address(&self, idx: u32) -> u32 { + if idx == 0 { + self.input_buffer_0_ptr + } else { + self.input_buffer_1_ptr + } + } + pub fn current_input_buffer_address(&self) -> 
u32 { + self.input_buffer_address(self.current_buffer) + } + pub fn input_buffer_packet_count(&self, idx: u32) -> u32 { + if idx == 0 { + self.input_buffer_0_packet_count + } else { + self.input_buffer_1_packet_count + } + } + pub fn current_input_buffer_packet_count(&self) -> u32 { + self.input_buffer_packet_count(self.current_buffer) + } +} + +/// Merge decoder-owned fields back into guest memory (canary `StoreContextMerged`). +/// Re-reads the current context (game may have raced an update), overwrites only +/// the fields the decoder owns, and writes all 16 dwords back BE. +fn store_merged( + mem: &GuestMemory, + ctx_va: u32, + data: &XmaContextData, + initial: &XmaContextData, +) { + let mut fresh = XmaContextData::read(mem, ctx_va); + // DWORD 0 + fresh.loop_count = data.loop_count; + fresh.output_buffer_write_offset = data.output_buffer_write_offset; + if initial.input_buffer_0_valid != 0 && data.input_buffer_0_valid == 0 { + fresh.input_buffer_0_valid = 0; + } + if initial.input_buffer_1_valid != 0 && data.input_buffer_1_valid == 0 { + fresh.input_buffer_1_valid = 0; + } + // DWORD 1 + if initial.output_buffer_valid != 0 && data.output_buffer_valid == 0 { + fresh.output_buffer_valid = 0; + } + // DWORD 2 + fresh.input_buffer_read_offset = data.input_buffer_read_offset; + fresh.error_status = data.error_status; + // DWORD 4 + fresh.current_buffer = data.current_buffer; + // DWORD 9 + fresh.output_buffer_read_offset = data.output_buffer_read_offset; + + let d = fresh.pack(); + for (i, w) in d.iter().enumerate() { + mem.write_u32(ctx_va + (i as u32) * 4, *w); + } +} + +/// Public wrapper for [`store_merged`] (called from the orchestrator in xma.rs). +pub fn store_merged_pub( + mem: &GuestMemory, + ctx_va: u32, + data: &XmaContextData, + initial: &XmaContextData, +) { + store_merged(mem, ctx_va, data, initial); +} + +/// Free byte count in a ring buffer from `write_off` to `read_off` +/// (canary `RingBuffer::write_count`). 
+pub fn ring_write_count(read_off: u32, write_off: u32, capacity: u32) -> u32 { + if read_off == write_off { + capacity + } else if write_off < read_off { + read_off - write_off + } else { + (capacity - write_off) + read_off + } +} + +/// Write `bytes` into the guest ring buffer at `backing + write_off`, wrapping +/// at `capacity`. Returns the new write offset (canary `RingBuffer::Write`). +pub fn ring_write( + mem: &GuestMemory, + backing: u32, + capacity: u32, + write_off: u32, + bytes: &[u8], +) -> u32 { + let count = (bytes.len() as u32).min(capacity); + if count == 0 { + return write_off; + } + if write_off + count < capacity { + mem.write_bytes(backing + write_off, &bytes[..count as usize]); + write_off + count + } else { + let left = capacity - write_off; + mem.write_bytes(backing + write_off, &bytes[..left as usize]); + let right = count - left; + mem.write_bytes(backing, &bytes[left as usize..(left + right) as usize]); + right + } +} + +// ---- BitStream (port of canary `base/bit_stream.cc`). Big-endian source. + +pub struct BitStream<'a> { + buf: &'a [u8], + offset_bits: usize, + size_bits: usize, +} + +impl<'a> BitStream<'a> { + pub fn new(buf: &'a [u8], size_bits: usize) -> Self { + Self { buf, offset_bits: 0, size_bits } + } + pub fn offset_bits(&self) -> usize { + self.offset_bits + } + pub fn set_offset(&mut self, off: usize) { + self.offset_bits = off.min(self.size_bits); + } + pub fn advance(&mut self, n: usize) { + self.set_offset(self.offset_bits + n); + } + pub fn bits_remaining(&self) -> usize { + self.size_bits - self.offset_bits + } + /// Peek up to 57 bits (canary contract). Reads 8 bytes BE then shifts. + pub fn peek(&self, num_bits: usize) -> u64 { + debug_assert!(num_bits <= 57); + // offset_bytes = min(offset>>3, (size-64)>>3), matching canary so an + // 8-byte load near the buffer end stays in range. 
+ let max_byte = if self.size_bits >= 64 { + (self.size_bits - 64) >> 3 + } else { + 0 + }; + let offset_bytes = (self.offset_bits >> 3).min(max_byte); + let rel = self.offset_bits - (offset_bytes << 3); + let mut tmp = [0u8; 8]; + let avail = self.buf.len().saturating_sub(offset_bytes).min(8); + tmp[..avail].copy_from_slice(&self.buf[offset_bytes..offset_bytes + avail]); + let mut value = u64::from_be_bytes(tmp); + value >>= 64 - (rel + num_bits); + value &= (1u64 << num_bits) - 1; + value + } + pub fn read(&mut self, num_bits: usize) -> u64 { + let v = self.peek(num_bits); + self.advance(num_bits); + v + } + /// Copy `num_bits` from the stream into `dest` (bit-packed, MSB-first within + /// each byte). Returns the starting bit offset within the first byte + /// (canary returns `rel_offset_bits` — the frame's intra-byte alignment). + pub fn copy(&mut self, dest: &mut [u8], num_bits: usize) -> usize { + let offset_bytes = self.offset_bits >> 3; + let rel = self.offset_bits - (offset_bytes << 3); + let mut bits_left = num_bits; + let mut out = 0usize; + + if rel != 0 { + let bits = self.peek(8 - rel) as u8; + let clear_mask = !(((1u8 << rel) - 1)) as u8; + dest[out] &= clear_mask; + dest[out] |= bits; + bits_left -= 8 - rel; + self.advance(8 - rel); + out += 1; + } + if bits_left >= 8 { + let nbytes = bits_left / 8; + let src_off = (self.offset_bits >> 3).min(self.buf.len()); + let copy = nbytes.min(self.buf.len().saturating_sub(src_off)); + dest[out..out + copy] + .copy_from_slice(&self.buf[src_off..src_off + copy]); + out += nbytes; + self.advance(nbytes * 8); + bits_left -= nbytes * 8; + } + if bits_left != 0 { + let mut b = self.peek(bits_left) as u8; + b <<= 8 - bits_left; + let clear_mask = ((1u16 << bits_left) - 1) as u8; + dest[out] &= clear_mask; + dest[out] |= b; + self.advance(bits_left); + } + rel + } +} + +// ---- XMA packet header helpers (canary `xma_helpers.h`). 
+ +#[inline] +pub fn packet_frame_count(packet: &[u8]) -> u8 { + packet[0] >> 2 +} +#[inline] +pub fn packet_metadata(packet: &[u8]) -> u8 { + packet[2] & 0x7 +} +#[inline] +pub fn is_packet_xma2(packet: &[u8]) -> bool { + packet_metadata(packet) == 1 +} +#[inline] +pub fn packet_skip_count(packet: &[u8]) -> u8 { + packet[3] +} +/// First frame offset in bits (canary `GetPacketFrameOffset`): a 15-bit value +/// across bytes 0..2, plus the 32-bit header. +#[inline] +pub fn packet_frame_offset(packet: &[u8]) -> u32 { + let val = (((packet[0] as u32 & 0x3) << 13) + | ((packet[1] as u32) << 5) + | ((packet[2] as u32) >> 3)) + & 0xFFFF; + val + 32 +} + +/// Sample-rate id → Hz. +pub fn sample_rate_hz(id: u32) -> u32 { + ID_TO_SAMPLE_RATE[id.min(3) as usize] +} + +// ---- Packet-walk for faithful input-offset advance (canary `GetPacketInfo`, +// `GetNextPacketReadOffset`, and the offset arithmetic at the tail of +// `XmaContextNew::Decode`). These let us advance `input_buffer_read_offset` one +// *frame* at a time at canary's exact cadence — independent of the mainline +// xma2 decoder's whole-packet/burst framing — so the offset crosses packet and +// buffer boundaries (and triggers SwapInputBuffer) at the true input-drain +// rate the guest's WMV demuxer polls. + +/// Info about the frame at a given bit offset within a packet (canary +/// `kPacketInfo` / `GetPacketInfo`). `frame_count_` is the number of frames +/// that begin in the packet; `current_frame_size_` is the compressed bit size +/// of the frame at `frame_offset` (0 if it can't be resolved within this +/// packet — a split header). +#[derive(Default, Clone, Copy)] +pub struct PacketInfo { + pub frame_count: u32, + pub current_frame: u32, + pub current_frame_size: u32, +} + +impl PacketInfo { + pub fn is_last_frame_in_packet(&self) -> bool { + self.current_frame + 1 == self.frame_count + } +} + +/// Faithful port of canary `XmaContextNew::GetPacketInfo`. 
+pub fn get_packet_info(packet: &[u8], frame_offset: u32) -> PacketInfo { + let mut info = PacketInfo::default(); + let first_frame_offset = packet_frame_offset(packet); + let mut stream = BitStream::new(packet, BITS_PER_PACKET as usize); + stream.set_offset(first_frame_offset as usize); + + // Split frame from previous packet. + if frame_offset < first_frame_offset { + info.current_frame = 0; + info.current_frame_size = first_frame_offset - frame_offset; + } + + loop { + if stream.bits_remaining() < BITS_PER_FRAME_HEADER as usize { + break; + } + let frame_size = stream.peek(BITS_PER_FRAME_HEADER as usize) as u32; + if frame_size == 0 || frame_size == MAX_FRAME_LENGTH { + break; + } + if stream.offset_bits() == frame_offset as usize { + info.current_frame = info.frame_count; + info.current_frame_size = frame_size; + } + info.frame_count += 1; + if frame_size as usize > stream.bits_remaining() { + // Last frame. + break; + } + stream.advance((frame_size - 1) as usize); + // Trailing continuation bit. + if stream.read(1) == 0 { + break; + } + } + + if is_packet_xma2(packet) { + let xma2_frame_count = packet_frame_count(packet) as u32; + if xma2_frame_count > info.frame_count { + if info.current_frame_size == 0 { + info.current_frame = info.frame_count; + } + info.frame_count = xma2_frame_count; + } + } + info +} + +/// Packet number for a bit offset (canary `GetPacketNumber`). Returns None when +/// the offset is in the header or past the buffer. +pub fn packet_number(size_bytes: u32, bit_offset: u32) -> Option { + if bit_offset < BITS_PER_PACKET_HEADER { + return None; + } + if bit_offset >= size_bytes * 8 { + return None; + } + Some((bit_offset >> 3) / BYTES_PER_PACKET) +} + +/// min(remaining_stream_bits, frame_size) (canary `GetAmountOfBitsToRead`). +pub fn amount_of_bits_to_read(remaining_stream_bits: u32, frame_size: u32) -> u32 { + remaining_stream_bits.min(frame_size) +} + +// ---- Per-context decode state (lives in the XmaDecoder, one per ctx). 
+ +#[derive(Default)] +pub struct ContextDecodeState { + /// FFmpeg xma2 codec for this context (lazily created / reconfigured). + pub codec: Option, + pub codec_rate: u32, + pub codec_channels: u32, + /// Staged interleaved S16BE PCM for the current decoded frame + /// (`raw_frame_`), drained by Consume in 256-byte blocks. + pub raw_frame: Vec, + /// Decoded interleaved S16BE PCM not yet split into per-frame `raw_frame`s. + /// The mainline xma2 decoder emits bursts of many 512-sample frames at once + /// (internal FIFO + 4096-sample lookahead); we queue the bytes here and + /// hand the guest exactly one 512-sample frame per `produce_frame`. + pub pcm_queue: VecDeque, + pub current_frame_remaining_subframes: u8, + pub remaining_subframe_blocks_in_output: i32, + /// Total 512-sample frames decoded for this context (diagnostic). + pub frames_decoded: u64, + /// Whether a "first frame" diagnostic has been emitted. + pub first_frame_logged: bool, + /// FFmpeg feed cursor: the next packet index (within the *current* input + /// buffer at feed time) we will hand to FFmpeg. This is the decoder's + /// internal intake position and is intentionally decoupled from the + /// guest-visible `input_buffer_read_offset` (which advances per *emitted* + /// frame via the faithful packet-walk). We feed ahead so FFmpeg always has + /// enough buffered input to satisfy the guest's drain, while the guest sees + /// the read offset move at canary's true per-frame cadence. + pub feed_packet_index: u32, + /// `current_buffer` the feed cursor is reading from; reset on swap so the + /// feed follows the same ping-pong as the guest-visible buffer. + pub feed_buffer: u32, +} + +#[cfg(test)] +mod tests { + use super::*; + + /// The bitfield unpack/pack must round-trip every decoder-relevant field at + /// the exact canary offsets (regression against a shifted bit). 
+ #[test] + fn context_bitfields_round_trip() { + let mut c = XmaContextData::default(); + c.input_buffer_0_packet_count = 632; + c.loop_count = 0; + c.input_buffer_0_valid = 1; + c.input_buffer_1_valid = 0; + c.output_buffer_block_count = 30; + c.output_buffer_write_offset = 5; + c.subframe_decode_count = 8; + c.output_buffer_padding = 1; + c.sample_rate = 3; + c.is_stereo = 1; + c.output_buffer_valid = 1; + c.input_buffer_read_offset = 16416; + c.error_status = 4; + c.current_buffer = 1; + c.input_buffer_0_ptr = 0x0b9f_d000; + c.output_buffer_ptr = 0x01f6_6e00; + c.output_buffer_read_offset = 7; + c.interrupt_when_done = 1; + + // pack → words → re-read via the same word layout. + let d = c.pack(); + // Simulate read() decode from the packed words. + let mut c2 = XmaContextData::default(); + c2.input_buffer_0_packet_count = bits(d[0], 0, 12); + c2.input_buffer_0_valid = bits(d[0], 20, 1); + c2.output_buffer_block_count = bits(d[0], 22, 5); + c2.output_buffer_write_offset = bits(d[0], 27, 5); + c2.subframe_decode_count = bits(d[1], 20, 4); + c2.output_buffer_padding = bits(d[1], 24, 3); + c2.sample_rate = bits(d[1], 27, 2); + c2.is_stereo = bits(d[1], 29, 1); + c2.output_buffer_valid = bits(d[1], 31, 1); + c2.input_buffer_read_offset = bits(d[2], 0, 26); + c2.error_status = bits(d[2], 26, 5); + c2.current_buffer = bits(d[4], 31, 1); + c2.output_buffer_read_offset = bits(d[9], 0, 5); + c2.interrupt_when_done = bits(d[9], 31, 1); + + assert_eq!(c2.input_buffer_0_packet_count, 632); + assert_eq!(c2.input_buffer_0_valid, 1); + assert_eq!(c2.output_buffer_block_count, 30); + assert_eq!(c2.output_buffer_write_offset, 5); + assert_eq!(c2.subframe_decode_count, 8); + assert_eq!(c2.output_buffer_padding, 1); + assert_eq!(c2.sample_rate, 3); + assert_eq!(c2.is_stereo, 1); + assert_eq!(c2.output_buffer_valid, 1); + assert_eq!(c2.input_buffer_read_offset, 16416); + assert_eq!(c2.error_status, 4); + assert_eq!(c2.current_buffer, 1); + assert_eq!(c2.output_buffer_read_offset, 
7); + assert_eq!(c2.interrupt_when_done, 1); + } + + #[test] + fn phys_to_backing_projects_physical_window() { + assert_eq!(xma_phys_to_backing(0x0b9f_d000), 0x4b9f_d000); + assert_eq!(xma_phys_to_backing(0x01f6_6e00), 0x41f6_6e00); + } + + #[test] + fn ring_write_count_matches_canary() { + // empty (read==write) → full capacity. + assert_eq!(ring_write_count(0, 0, 7680), 7680); + // write ahead of read. + assert_eq!(ring_write_count(0, 256, 7680), 7680 - 256); + // write wrapped behind read. + assert_eq!(ring_write_count(512, 256, 7680), 256); + } + + #[test] + fn packet_header_helpers() { + // Matches the observed first packet word 0x08000000: byte0=0x08. + let pkt = [0x08u8, 0x00, 0x00, 0x00]; + assert_eq!(packet_frame_count(&pkt), 2); // 0x08>>2 = 2 + // frame offset: ((0x08&3)<<13 | 0<<5 | 0x00>>3) + 32 = 32. + assert_eq!(packet_frame_offset(&pkt), 32); + // A non-zero byte2 shifts the offset: 0x08>>3 = 1 → +1. + let pkt2 = [0x08u8, 0x00, 0x08, 0x00]; + assert_eq!(packet_frame_offset(&pkt2), 33); + } +} + +impl ContextDecodeState { + pub fn new() -> Self { + Self { + codec: None, + codec_rate: 0, + codec_channels: 0, + raw_frame: vec![0u8; (BYTES_PER_FRAME_CHANNEL * 2) as usize], + pcm_queue: VecDeque::new(), + current_frame_remaining_subframes: 0, + remaining_subframe_blocks_in_output: 0, + frames_decoded: 0, + first_frame_logged: false, + feed_packet_index: 0, + feed_buffer: 0, + } + } +} diff --git a/crates/xenia-cpu/src/dispatch_rec.rs b/crates/xenia-cpu/src/dispatch_rec.rs new file mode 100644 index 0000000..1f2d25f --- /dev/null +++ b/crates/xenia-cpu/src/dispatch_rec.rs @@ -0,0 +1,217 @@ +//! Runtime indirect-dispatch recorder. +//! +//! A reusable, env-gated facility that captures every indirect call performed +//! through CTR (`bcctr`/`bcctrl`/`bctr`) as a unique `(call_site_pc -> +//! target_pc)` pair, together with the object register `r3` seen at the call +//! and a hit count. It exists to provide GROUND-TRUTH indirect-dispatch +//! 
resolution for reverse-engineering vtable dispatch that the static +//! analyzer fails to resolve (e.g. the Sylpheed movie engine vtable +//! `0x8200a908`). +//! +//! ## Gating & overhead +//! Recording is OFF by default. It is enabled only when the environment +//! variable `XENIA_DISPATCH_REC` is set to a non-empty, non-`0` value at +//! process start. When OFF, [`record`] is a single relaxed atomic-bool load +//! followed by an early return — no allocation, no locking, no behavior +//! change. The recorder is pure: it never reads the clock, never touches +//! scheduling, and never mutates guest/CPU state, so enabling it does not +//! perturb deterministic runs (only adds a HashMap insert behind a mutex). +//! +//! ## Focus filters (optional) +//! Two env vars narrow what is recorded (both default to "record everything"): +//! - `XENIA_DISPATCH_REC_TARGETS=0x82505c08,...` — only edges whose resolved +//! target is in the list. Answers "who calls `<target>`?": every recorded +//! edge then carries the caller `site` and `lr`. +//! - `XENIA_DISPATCH_REC_SITES=0x825078d8,...` — only edges from the listed +//! call-site PCs. +//! When both are set, an edge must satisfy BOTH. These keep a long focused +//! run (e.g. the intro-movie trace) producing a small, relevant table instead +//! of the whole program-wide dispatch set. Pure observe-only — filtering only +//! affects which edges are stored, never guest/CPU state. +//! +//! ## Output +//! On [`dump`] (call at end-of-run) the table is written to the path in +//! `XENIA_DISPATCH_REC_OUT` (default `/tmp/dispatch_rec.txt`), sorted by +//! descending hit count, one record per line after a `#` header line: +//! `callsite_pc target_pc count r3=<hex> lr=<hex>` (count is decimal). + +use std::collections::HashMap; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::Mutex; +use std::sync::OnceLock; + +/// Enabled flag, resolved once from the environment at first touch.
+static ENABLED: OnceLock = OnceLock::new(); +/// Fast-path mirror of `ENABLED` so the hot path is a single relaxed load +/// (avoids the `OnceLock` get + deref on every indirect branch when OFF). +static ENABLED_FAST: AtomicBool = AtomicBool::new(false); + +/// One observed indirect-dispatch edge. +#[derive(Default, Clone, Copy)] +struct Edge { + count: u64, + /// Last-seen object register (`r3`) at this (site,target) edge. Stable for + /// a vtable dispatch where the same call site always dispatches on the + /// same kind of object. + last_r3: u64, + /// Last-seen link register (return address) for the call. + last_lr: u64, +} + +/// (call_site_pc, target_pc) -> Edge +static TABLE: OnceLock>> = OnceLock::new(); + +/// Optional focus filters, resolved once from the environment. When either is +/// non-empty, an edge is recorded only if its `target` is in `TARGET_FILTER` +/// (when that set is non-empty) AND its `site` is in `SITE_FILTER` (when that +/// set is non-empty). Empty sets mean "no constraint on that axis". This lets +/// a long focused run (e.g. the intro-movie trace) record ONLY the dispatch +/// edges relevant to a target-set under investigation — for example "every +/// indirect call whose target is the XMV submit `sub_82505C08`", which answers +/// the milestone-2 "who calls submit on the engine" question with the caller +/// `lr` — instead of the whole (large) program-wide dispatch table. +static TARGET_FILTER: OnceLock> = OnceLock::new(); +static SITE_FILTER: OnceLock> = OnceLock::new(); + +/// Parse a comma-separated list of hex PCs (`0x` prefix optional) into a +/// sorted, deduped Vec. Empty/garbage tokens are skipped. 
+fn parse_pc_list_str(s: &str) -> Vec { + let mut v: Vec = s + .split(',') + .map(str::trim) + .filter(|t| !t.is_empty()) + .filter_map(|t| { + let hex = t.strip_prefix("0x").or_else(|| t.strip_prefix("0X")).unwrap_or(t); + u32::from_str_radix(hex, 16).ok() + }) + .collect(); + v.sort_unstable(); + v.dedup(); + v +} + +/// Parse a PC list from an env var. Missing var → empty Vec (no constraint). +fn parse_pc_list(var: &str) -> Vec { + match std::env::var(var) { + Ok(s) => parse_pc_list_str(&s), + Err(_) => Vec::new(), + } +} + +/// Resolve the enabled flag (and focus filters) from the environment exactly +/// once. +fn init_enabled() -> bool { + let on = match std::env::var("XENIA_DISPATCH_REC") { + Ok(v) => !v.is_empty() && v != "0", + Err(_) => false, + }; + ENABLED_FAST.store(on, Ordering::Relaxed); + let _ = TARGET_FILTER.set(parse_pc_list("XENIA_DISPATCH_REC_TARGETS")); + let _ = SITE_FILTER.set(parse_pc_list("XENIA_DISPATCH_REC_SITES")); + on +} + +/// Whether recording is enabled (a single relaxed atomic load). +#[inline(always)] +pub fn enabled() -> bool { + // Hot path: relaxed atomic load. ENABLED_FAST is set by `init_enabled`, + // which runs once via `install`; until then it is `false`, which is also + // the correct default. Call `install` early in startup to resolve it. + ENABLED_FAST.load(Ordering::Relaxed) +} + +/// Force the env resolution (call once early in startup). Idempotent. +pub fn install() { + let _ = ENABLED.get_or_init(init_enabled); +} + +/// Record one indirect (CTR) call edge. No-op when disabled. +/// +/// `site` = PC of the `bcctr`/`bctr` instruction, `target` = resolved CTR +/// target, `r3` = object register at the call, `lr` = link register. +#[inline(always)] +pub fn record(site: u32, target: u32, r3: u64, lr: u64) { + // Single predictable branch when OFF. + if !ENABLED_FAST.load(Ordering::Relaxed) { + return; + } + // Focus filters (only consulted when recording is ON, i.e. rare).
An empty + // filter set imposes no constraint on its axis. + if let Some(targets) = TARGET_FILTER.get() + && !targets.is_empty() + && targets.binary_search(&target).is_err() + { + return; + } + if let Some(sites) = SITE_FILTER.get() + && !sites.is_empty() + && sites.binary_search(&site).is_err() + { + return; + } + let table = TABLE.get_or_init(|| Mutex::new(HashMap::new())); + if let Ok(mut t) = table.lock() { + let e = t.entry((site, target)).or_default(); + e.count += 1; + e.last_r3 = r3; + e.last_lr = lr; + } +} + +/// Dump the recorded table to the output file. No-op when disabled or empty. +pub fn dump() { + if !enabled() { + return; + } + let path = std::env::var("XENIA_DISPATCH_REC_OUT") + .unwrap_or_else(|_| "/tmp/dispatch_rec.txt".to_string()); + let table = match TABLE.get() { + Some(t) => t, + None => return, + }; + let guard = match table.lock() { + Ok(g) => g, + Err(_) => return, + }; + let mut rows: Vec<((u32, u32), Edge)> = + guard.iter().map(|(k, v)| (*k, *v)).collect(); + // Deterministic order: count desc, then site, then target. + rows.sort_by(|a, b| { + b.1.count + .cmp(&a.1.count) + .then(a.0 .0.cmp(&b.0 .0)) + .then(a.0 .1.cmp(&b.0 .1)) + }); + let mut out = String::with_capacity(rows.len() * 48); + out.push_str("# callsite_pc target_pc count r3 lr\n"); + for ((site, target), e) in rows { + out.push_str(&format!( + "{:#010x} {:#010x} {} r3={:#018x} lr={:#018x}\n", + site, target, e.count, e.last_r3, e.last_lr + )); + } + if let Err(err) = std::fs::write(&path, out) { + eprintln!("dispatch_rec: failed to write {}: {}", path, err); + } else { + eprintln!("dispatch_rec: wrote {} edges to {}", guard.len(), path); + } +} + +#[cfg(test)] +mod tests { + use super::parse_pc_list_str; + + #[test] + fn parse_pc_list_handles_prefixes_whitespace_and_dedup() { + // Mixed 0x / bare hex, surrounding whitespace, an empty token, and a + // duplicate. Result is sorted + deduped; garbage tokens are dropped. 
+ let got = parse_pc_list_str(" 0x82505c08 , 825078d8,, 82505c08 , zzz "); + assert_eq!(got, vec![0x82505c08, 0x825078d8]); + } + + #[test] + fn parse_pc_list_empty_is_no_constraint() { + assert!(parse_pc_list_str("").is_empty()); + assert!(parse_pc_list_str(" , , ").is_empty()); + } +} diff --git a/crates/xenia-cpu/src/interpreter.rs b/crates/xenia-cpu/src/interpreter.rs index 85fb181..d1748b9 100644 --- a/crates/xenia-cpu/src/interpreter.rs +++ b/crates/xenia-cpu/src/interpreter.rs @@ -1012,7 +1012,13 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) - if cond_ok { let next_pc = ctx.pc + 4; - ctx.pc = (ctx.ctr as u32) & !3; + let target = (ctx.ctr as u32) & !3; + // Ground-truth indirect-dispatch recording (env-gated, off by + // default; pure record-only, no scheduling/state change). + if crate::dispatch_rec::enabled() { + crate::dispatch_rec::record(ctx.pc, target, ctx.gpr[3], ctx.lr); + } + ctx.pc = target; if instr.lk() { ctx.lr = next_pc as u64; } diff --git a/crates/xenia-cpu/src/lib.rs b/crates/xenia-cpu/src/lib.rs index 576a79e..b7bfa39 100644 --- a/crates/xenia-cpu/src/lib.rs +++ b/crates/xenia-cpu/src/lib.rs @@ -1,6 +1,7 @@ pub mod block_cache; pub mod context; pub mod decoder; +pub mod dispatch_rec; pub mod disasm; pub mod fpscr; pub mod interpreter; diff --git a/crates/xenia-cpu/src/scheduler.rs b/crates/xenia-cpu/src/scheduler.rs index d6b79a0..2aadc57 100644 --- a/crates/xenia-cpu/src/scheduler.rs +++ b/crates/xenia-cpu/src/scheduler.rs @@ -205,6 +205,21 @@ pub enum BlockReason { CriticalSection(u32), } +/// Floor of the **synthetic park-handle** range. 
Handles at or above this +/// value are deliberately OUTSIDE the kernel object allocator (which starts +/// at `0x1000`); they are used to park threads that must NEVER be woken by +/// the normal signal/wait machinery — currently the dedicated audio-worker +/// threads (`xenia_kernel::xaudio::XAUDIO_SYNTHETIC_HANDLE_BASE = 0xF000_0000`), +/// which are only ever un-parked by audio-callback injection. The deadlock +/// force-wake ([`Scheduler::unblock_on_deadlock`]) must skip waiters parked +/// solely on such handles: they are not deadlock participants (the guest +/// genuinely blocked on its own objects), and waking one runs its thread +/// entry to the `LR_HALT` sentinel → premature exit, which then drops every +/// subsequent injection. Kept in `xenia-cpu` (not imported from +/// `xenia-kernel`, which depends on this crate); the kernel const must stay +/// within `[SYNTHETIC_PARK_HANDLE_FLOOR, u32::MAX]`. +pub const SYNTHETIC_PARK_HANDLE_FLOOR: u32 = 0xF000_0000; + /// Sink for PCR+0x2C writes — the scheduler writes the guest-visible /// current-processor-id here at spawn and Axis 4 rewrites on affinity /// migration. Implemented by `xenia-kernel` for `GuestMemory`; keeping it @@ -1399,6 +1414,27 @@ impl Scheduler { let mut woken = Vec::new(); for (hw_id, slot) in self.slots.iter_mut().enumerate() { for (idx, t) in slot.runqueue.iter_mut().enumerate() { + // Skip threads parked SOLELY on synthetic park-handles + // (audio workers). They are not deadlock participants — the + // guest blocked on its own objects — and waking one runs its + // thread entry to the LR_HALT sentinel, exiting it and + // dropping every subsequent audio-callback injection. Only + // audio-callback injection may un-park them. A wait whose + // handle set mixes synthetic and real handles is still + // eligible (the real handle makes it a genuine waiter). + let synthetic_park = match &t.state { + HwState::Blocked(BlockReason::WaitAny { handles, .. 
}) + | HwState::Blocked(BlockReason::WaitAll { handles, .. }) => { + !handles.is_empty() + && handles + .iter() + .all(|&h| h >= SYNTHETIC_PARK_HANDLE_FLOOR) + } + _ => false, + }; + if synthetic_park { + continue; + } if matches!( t.state, HwState::Blocked(BlockReason::WaitAny { .. }) @@ -1485,6 +1521,41 @@ mod tests { } } + #[test] + fn unblock_on_deadlock_skips_synthetic_park_waiters() { + // The audio worker parks on a synthetic handle (>= FLOOR) and must + // survive the deadlock force-wake; a peer parked on a real handle + // must be woken. Regression for the milestone-2 stall where the + // force-wake destroyed the audio worker → all callbacks dropped. + let mut s = mk_scheduler_with_initial(); + s.spawn(worker_spawn_params(2, 0x2000), &mut NullPcr).unwrap(); + s.spawn(worker_spawn_params(3, 0x2010), &mut NullPcr).unwrap(); + let audio = ThreadRef { hw_id: 1, idx: 0, generation: 0 }; + let real = ThreadRef { hw_id: 2, idx: 0, generation: 0 }; + s.thread_mut(audio).state = HwState::Blocked(BlockReason::WaitAny { + handles: vec![SYNTHETIC_PARK_HANDLE_FLOOR], + deadline: None, + }); + s.thread_mut(real).state = HwState::Blocked(BlockReason::WaitAny { + handles: vec![0x1234], + deadline: None, + }); + let woken = s.unblock_on_deadlock(); + assert!( + woken.contains(&real), + "real-handle waiter must be force-woken" + ); + assert!( + !woken.contains(&audio), + "synthetic-park audio worker must NOT be force-woken" + ); + assert!(matches!( + s.thread(audio).state, + HwState::Blocked(BlockReason::WaitAny { .. 
}) + )); + assert_eq!(s.thread(real).state, HwState::Ready); + } + // ---- preserved from pre-Axis-1 (updated names and params) ---- #[test] diff --git a/crates/xenia-kernel/Cargo.toml b/crates/xenia-kernel/Cargo.toml index f9dd134..464ef54 100644 --- a/crates/xenia-kernel/Cargo.toml +++ b/crates/xenia-kernel/Cargo.toml @@ -11,6 +11,7 @@ xenia-cpu = { workspace = true } xenia-vfs = { workspace = true } xenia-hid = { workspace = true } xenia-gpu = { workspace = true } +xenia-apu = { workspace = true } tracing = { workspace = true } metrics = { workspace = true } thiserror = { workspace = true } diff --git a/crates/xenia-kernel/src/exports.rs b/crates/xenia-kernel/src/exports.rs index d33ca7c..ecec0a4 100644 --- a/crates/xenia-kernel/src/exports.rs +++ b/crates/xenia-kernel/src/exports.rs @@ -182,7 +182,7 @@ pub fn register_exports(state: &mut KernelState) { state.register_export(Xboxkrnl, 0x01F7, "XAudioGetVoiceCategoryVolumeChangeMask", stub_return_zero); state.register_export(Xboxkrnl, 0x01F8, "XAudioGetVoiceCategoryVolume", stub_success); state.register_export(Xboxkrnl, 0x0224, "XMACreateContext", xma_create_context); - state.register_export(Xboxkrnl, 0x0226, "XMAReleaseContext", stub_success); + state.register_export(Xboxkrnl, 0x0226, "XMAReleaseContext", xma_release_context); // Crypto state.register_export(Xboxkrnl, 0x0192, "XeCryptSha", stub_success); @@ -3398,6 +3398,7 @@ fn xaudio_register_render_driver(ctx: &mut PpcContext, mem: &GuestMemory, state: callback_pc, callback_arg, wrapped_callback_arg: wrapped, + submitted_frames: 0, }; let Some(index) = state.xaudio.register(client) else { tracing::warn!("XAudioRegisterRenderDriverClient: client table full"); @@ -3506,18 +3507,75 @@ fn xaudio_unregister_render_driver(ctx: &mut PpcContext, _mem: &GuestMemory, sta ctx.gpr[3] = 0; } +/// Mirrors canary `XAudioSubmitRenderDriverFrame_entry` → +/// `AudioSystem::SubmitFrame(driver_ptr & 0xFFFF, samples)`: +/// the guest render-driver mixer (`sub_824DC350`) calls 
this once per audio +/// frame with `r3 = driver_id` (`0x4155_xxxx`) and `r4 = sample buffer`. +/// Canary forwards `samples` to the client's `AudioDriver`; the driver's +/// playback-completion callback later releases the client semaphore, which is +/// the buffer-consumed pacing our XAudio callback ticker +/// (`tick_instr` + `try_inject_audio_callback`) already drives. SubmitFrame +/// returns void and the caller discards r3 / reads no field SubmitFrame +/// writes, so faithfully we validate the client index and account the frame +/// (observational; never read back by the guest). Always returns +/// `X_ERROR_SUCCESS`, matching canary. Deterministic: only this guest-driven +/// export mutates state; no wall-clock, no host thread. fn xaudio_submit_render_driver_frame( ctx: &mut PpcContext, _mem: &GuestMemory, - _state: &mut KernelState, + state: &mut KernelState, ) { + let driver_id = ctx.gpr[3] as u32; + let index = (driver_id & XAUDIO_DRIVER_INDEX_MASK) as usize; + let registered = state.xaudio.record_submit(index); + if !registered { + // Canary logs and submits silence to keep the callback chain alive + // for an unregistered/invalid index; our ticker keeps the chain + // alive independently, so a debug log suffices. 
+ tracing::debug!( + driver_id = format_args!("{driver_id:#010x}"), + index, + "XAudioSubmitRenderDriverFrame: unregistered client index" + ); + } else if state.xaudio.submitted_frames(index) == 1 { + tracing::info!( + driver_id = format_args!("{driver_id:#010x}"), + index, + "XAudioSubmitRenderDriverFrame: first frame submitted by guest mixer" + ); + } ctx.gpr[3] = 0; } -fn xma_create_context(ctx: &mut PpcContext, _mem: &GuestMemory, state: &mut KernelState) { - let handle = state.alloc_handle(); - tracing::info!("XMACreateContext: handle={:#x}", handle); - ctx.gpr[3] = handle as u64; +/// Mirrors xenia-canary `XMACreateContext_entry(lpdword_t context_out_ptr)`: +/// allocate a context from the register-mapped array, write its guest pointer +/// to `*context_out_ptr`, and return `X_STATUS_SUCCESS` (or `X_STATUS_NO_MEMORY` +/// when the 320-slot array is exhausted). +fn xma_create_context(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) { + let out_ptr = ctx.gpr[3] as u32; + let context_ptr = state.xma.lock().unwrap().allocate_context(); + if out_ptr != 0 { + mem.write_u32(out_ptr, context_ptr); + } + tracing::info!( + out_ptr = format_args!("{out_ptr:#010x}"), + context_ptr = format_args!("{context_ptr:#010x}"), + "XMACreateContext" + ); + ctx.gpr[3] = if context_ptr == 0 { + 0xC000_0017 // X_STATUS_NO_MEMORY + } else { + 0 // X_STATUS_SUCCESS + }; +} + +/// Mirrors xenia-canary `XMAReleaseContext_entry(lpvoid_t context_ptr)`: +/// free the context slot and return 0. 
+fn xma_release_context(ctx: &mut PpcContext, _mem: &GuestMemory, state: &mut KernelState) { + let context_ptr = ctx.gpr[3] as u32; + state.xma.lock().unwrap().release_context(context_ptr); + tracing::info!(context_ptr = format_args!("{context_ptr:#010x}"), "XMAReleaseContext"); + ctx.gpr[3] = 0; } // ===== Xex ===== @@ -4413,7 +4471,8 @@ fn nt_yield_execution(ctx: &mut PpcContext, _mem: &GuestMemory, _state: &mut Ker } fn ke_resume_thread(ctx: &mut PpcContext, _mem: &GuestMemory, state: &mut KernelState) { - let handle = resolve_pseudo_handle(state, ctx.gpr[3] as u32); + let raw = ctx.gpr[3] as u32; + let handle = resolve_pseudo_handle(state, raw); match state.scheduler.find_by_handle(handle) { Some(r) => { state.scheduler.resume_ref(r); @@ -4429,13 +4488,18 @@ fn nt_resume_thread(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelS // r3 = handle, r4 = prev_suspend_count_ptr let handle = ctx.gpr[3] as u32; let prev_ptr = ctx.gpr[4] as u32; - let prev = state - .scheduler - .find_by_handle(handle) - .map(|r| state.scheduler.resume_ref(r)) - .unwrap_or(0); - if prev_ptr != 0 { - mem.write_u32(prev_ptr, prev); + match state.scheduler.find_by_handle(handle) { + Some(r) => { + let prev = state.scheduler.resume_ref(r); + if prev_ptr != 0 { + mem.write_u32(prev_ptr, prev); + } + } + None => { + if prev_ptr != 0 { + mem.write_u32(prev_ptr, 0); + } + } } ctx.gpr[3] = STATUS_SUCCESS; } diff --git a/crates/xenia-kernel/src/state.rs b/crates/xenia-kernel/src/state.rs index 3967de0..50bb967 100644 --- a/crates/xenia-kernel/src/state.rs +++ b/crates/xenia-kernel/src/state.rs @@ -161,6 +161,11 @@ pub struct KernelState { /// graphics interrupts is enforced by the injector's /// `is_in_callback()` guard. pub xaudio: crate::xaudio::XAudioState, + /// Register-mapped XMA context array (apu stage 1). Shared with the + /// `0x7FEA0000` MMIO region installed by the app and with the + /// `XMACreateContext`/`XMAReleaseContext` exports, so it lives behind an + /// `Arc<Mutex<XmaDecoder>>`. 
Stage 1 records kicks; stage 3 will decode them. + pub xma: std::sync::Arc<std::sync::Mutex<xenia_apu::XmaDecoder>>, /// AUDIT-032 Plan B (default true). When true, the round prologue /// runs the XAudio ticker + `try_inject_audio_callback`. Pre-fix this /// was off by default because injection used random-victim selection @@ -449,6 +454,9 @@ impl KernelState { ui: None, interrupts: crate::interrupts::InterruptState::default(), xaudio: crate::xaudio::XAudioState::default(), + // apu stage 1 — un-initialized until the app reserves the context + // array and calls `xma.lock().init(va, phys)`. + xma: std::sync::Arc::new(std::sync::Mutex::new(xenia_apu::XmaDecoder::new())), // AUDIT-032: dedicated audio worker per client (Plan B in // `xaudio_register_render_driver`) — not victim hijack, so safe // to enable by default. Previously gated off because the diff --git a/crates/xenia-kernel/src/xaudio.rs b/crates/xenia-kernel/src/xaudio.rs index c20fe94..98704d6 100644 --- a/crates/xenia-kernel/src/xaudio.rs +++ b/crates/xenia-kernel/src/xaudio.rs @@ -35,6 +35,14 @@ pub const XAUDIO_MAX_CLIENTS: usize = 8; /// no-op anyway). pub const XAUDIO_SYNTHETIC_HANDLE_BASE: u32 = 0xF000_0000; +/// The scheduler's deadlock force-wake skips waiters parked solely on +/// handles at/above [`xenia_cpu::scheduler::SYNTHETIC_PARK_HANDLE_FLOOR`] +/// so it never destroys a parked audio worker. Keep these in lockstep: +/// every `synthetic_park_handle` must fall inside that protected range. +const _: () = assert!( + XAUDIO_SYNTHETIC_HANDLE_BASE >= xenia_cpu::scheduler::SYNTHETIC_PARK_HANDLE_FLOOR +); + /// Compute the synthetic park-handle for client slot `i`. pub const fn synthetic_park_handle(i: usize) -> u32 { XAUDIO_SYNTHETIC_HANDLE_BASE | (i as u32) @@ -68,6 +76,16 @@ pub struct XAudioClient { /// [audio_system.cc:225-228](../../../../xenia-canary/src/xenia/apu/audio_system.cc#L225-L228) /// + [audio_system.cc:139-141](../../../../xenia-canary/src/xenia/apu/audio_system.cc#L139-L141). 
pub wrapped_callback_arg: u32, + /// Count of frames the guest has handed us via + /// `XAudioSubmitRenderDriverFrame` for this client. Canary's + /// `AudioSystem::SubmitFrame` forwards the sample buffer to the client's + /// driver, whose playback completion later releases the client semaphore + /// — the pacing our callback ticker emulates. The guest mixer + /// (`sub_824DC350`) discards SubmitFrame's return and reads no field it + /// writes, so this counter is purely observational (logging / liveness), + /// never read back by the guest. Deterministic: incremented only inside + /// the guest-driven export call. + pub submitted_frames: u64, } #[derive(Debug)] @@ -138,6 +156,35 @@ impl XAudioState { self.clients.get(index).copied().flatten() } + /// Faithful counterpart to canary `AudioSystem::SubmitFrame`: the guest + /// driver client `index` handed us one frame of samples. Canary forwards + /// `samples` to the client's `AudioDriver`, whose playback-completion + /// callback later releases the client semaphore — the buffer-consumed + /// pacing our [`tick_instr`]/[`try_inject_audio_callback`] path already + /// emulates. SubmitFrame itself returns void and the guest mixer + /// (`sub_824DC350`) reads no field from it, so all we faithfully need to + /// do is validate the client and account the frame. Returns `true` iff + /// `index` is a registered client (canary submits silence / warns + /// otherwise). Deterministic — only the guest-driven export mutates this. 
+ pub fn record_submit(&mut self, index: usize) -> bool { + match self.clients.get_mut(index) { + Some(Some(c)) => { + c.submitted_frames = c.submitted_frames.saturating_add(1); + true + } + _ => false, + } + } + + pub fn submitted_frames(&self, index: usize) -> u64 { + self.clients + .get(index) + .copied() + .flatten() + .map(|c| c.submitted_frames) + .unwrap_or(0) + } + pub fn any_registered(&self) -> bool { self.clients.iter().any(|c| c.is_some()) } @@ -230,6 +277,7 @@ mod tests { callback_pc: 0x8200_0000 + arg, callback_arg: arg, wrapped_callback_arg: 0x4000_0000 + arg, + submitted_frames: 0, } }