[iterate-4A] Milestone-2: XMA audio decoder + RE tooling (dispatch recorder, analyzer vtable-fix, non-perturbing probes)

Milestone-2 (intro video dat/movie/ADV.wmv) audio path + major RE tooling. XMA AUDIO (built, working, deterministic, tested): - APU MMIO 0x7FEA0000 + 320x64B register-mapped context array; real XMACreateContext/Release (xma.rs); real FFmpeg xma2 decoder XMA_CONTEXT_DATA->S16BE PCM (xma_decode.rs, xma2_codec.rs, ffmpeg-sys-next). Decode runs synchronously on the CPU thread (deterministic, no host thread). - Audio-worker scheduler fix (main.rs LR_HALT restore + scheduler.rs): the XAudio render-callback worker was wrongly exited after ~2 deliveries; now survives -> guest drives XMA decode (70 kicks). - XAudioSubmitRenderDriverFrame made faithful. Golden sylpheed_n50m re-baselined; tests pass. RE TOOLING: - Runtime indirect-dispatch recorder (dispatch_rec.rs): records (call-site->target, r3, lr); env-gated XENIA_DISPATCH_REC, filters XENIA_DISPATCH_REC_TARGETS/_SITES; deterministic, observe-only. - Repaired static analyzer (vtables.rs): vtable extraction silently fragmented vtables with non-function head slots (missed the XMV engine vtable). Fixed via vptr-write-anchoring -> engine fully typed (vtables 722->1150 on rebuild). - Fixed probe HEISENBUG (main.rs run_superblock): --audit-pc-probe-hex/--mem-watch no longer disable superblock chaining; probes fire inside the chain loop -> scheduling identical armed-vs-unarmed, movie subsystem now observable. Fixed a --quiet bug swallowing armed trace reports. VIDEO still doesn't play (B, guest-side): the XMV engine never issues begin-playback (sub_825076F0, vtable 0x8200a1e8 slot21) -> never primes -> 2000ms timeout. Narrowed to the ARM2 engine-setup wrappers; no honest our-side gate-fix (masking forbidden). See HANDOFF-iterate-4A-milestone2.md for new-machine setup (incl. the FFmpeg apt deps + sylpheed.db regeneration) and continuation pointers. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-21 21:38:19 +02:00
parent acb29db444
commit 23189b95af
19 changed files with 3106 additions and 46 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -16,3 +16,7 @@ audit-*.md
 # working dir by the Wine canary build)
 vkd3d-proton.cache*
 *.dxvk-cache
+
+# local analysis-DB backups (regenerable; too large to track)
+*.db.bak*
+sylpheed.db.bak-*
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -418,6 +418,26 @@ version = "0.22.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6"

+[[package]]
+name = "bindgen"
+version = "0.64.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c4243e6031260db77ede97ad86c27e501d646a27ab57b59a574f725d98ab1fb4"
+dependencies = [
+ "bitflags 1.3.2",
+ "cexpr",
+ "clang-sys",
+ "lazy_static",
+ "lazycell",
+ "peeking_take_while",
+ "proc-macro2",
+ "quote",
+ "regex",
+ "rustc-hash 1.1.0",
+ "shlex",
+ "syn 1.0.109",
+]
+
 [[package]]
 name = "bit-set"
 version = "0.6.0"
@@ -600,6 +620,15 @@ dependencies = [
 "shlex",
 ]

+[[package]]
+name = "cexpr"
+version = "0.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766"
+dependencies = [
+ "nom",
+]
+
 [[package]]
 name = "cfg-if"
 version = "1.0.4"
@@ -639,6 +668,17 @@ dependencies = [
 "inout",
 ]

+[[package]]
+name = "clang-sys"
+version = "1.8.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4"
+dependencies = [
+ "glob",
+ "libc",
+ "libloading",
+]
+
 [[package]]
 name = "clap"
 version = "4.6.0"
@@ -1076,6 +1116,20 @@ version = "2.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6"

+[[package]]
+name = "ffmpeg-sys-next"
+version = "6.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c2529ad916d08c3562c754c21bc9b17a26c7882c0f5706cc2cd69472175f1620"
+dependencies = [
+ "bindgen",
+ "cc",
+ "libc",
+ "num_cpus",
+ "pkg-config",
+ "vcpkg",
+]
+
 [[package]]
 name = "filetime"
 version = "0.2.27"
@@ -1317,6 +1371,12 @@ dependencies = [
 "xml-rs",
 ]

+[[package]]
+name = "glob"
+version = "0.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280"
+
 [[package]]
 name = "glow"
 version = "0.13.1"
@@ -1898,6 +1958,12 @@ version = "1.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"

+[[package]]
+name = "lazycell"
+version = "1.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
+
 [[package]]
 name = "lexical-core"
 version = "1.0.6"
@@ -2139,6 +2205,12 @@ dependencies = [
 "sketches-ddsketch",
 ]

+[[package]]
+name = "minimal-lexical"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a"
+
 [[package]]
 name = "miniz_oxide"
 version = "0.8.9"
@@ -2262,6 +2334,16 @@ dependencies = [
 "libc",
 ]

+[[package]]
+name = "nom"
+version = "7.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a"
+dependencies = [
+ "memchr",
+ "minimal-lexical",
+]
+
 [[package]]
 name = "nu-ansi-term"
 version = "0.50.3"
@@ -2325,6 +2407,16 @@ dependencies = [
 "libm",
 ]

+[[package]]
+name = "num_cpus"
+version = "1.17.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "91df4bbde75afed763b708b7eee1e8e7651e02d97f6d5dd763e89367e957b23b"
+dependencies = [
+ "hermit-abi",
+ "libc",
+]
+
 [[package]]
 name = "num_enum"
 version = "0.7.6"
@@ -2657,6 +2749,12 @@ version = "1.0.15"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a"

+[[package]]
+name = "peeking_take_while"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099"
+
 [[package]]
 name = "percent-encoding"
 version = "2.3.2"
@@ -4961,8 +5059,10 @@ dependencies = [
 name = "xenia-apu"
 version = "0.1.0"
 dependencies = [
+ "ffmpeg-sys-next",
 "thiserror 2.0.18",
 "tracing",
+ "xenia-memory",
 "xenia-types",
 ]

@@ -5025,6 +5125,7 @@ dependencies = [
 "metrics",
 "thiserror 2.0.18",
 "tracing",
+ "xenia-apu",
 "xenia-cpu",
 "xenia-gpu",
 "xenia-hid",
--- a/HANDOFF-iterate-4A-milestone2.md
+++ b/HANDOFF-iterate-4A-milestone2.md
@@ -0,0 +1,133 @@
+# Handoff — branch `iterate-4A/apu-xma-stage1` (Milestone 2: intro-video / XMA audio + RE tooling)
+
+Reverse-engineering Project Sylpheed under this Rust Xbox-360 emulator (`xenia-rs`), using Wine
+xenia-canary as the ground-truth oracle. This branch carries **Milestone 2** work plus major
+RE-tooling improvements, on top of the (uncommitted-until-now) Milestone-1 renderer history.
+
+> Method: first-divergence vs canary · fix causes not symptoms · NO faking/masking · measure the
+> oracle, never infer · refute before believing · ground every claim in evidence.
+
+---
+
+## 0. SET UP ON A NEW MACHINE (do this first)
+
+### a) FFmpeg system libraries — **REQUIRED to build** (crate `xenia-apu` links them via pkg-config)
+The XMA audio decoder uses `ffmpeg-sys-next` (`crates/xenia-apu/Cargo.toml`:
+`ffmpeg-sys-next = { version = "6.1", default-features = false, features = ["avcodec"] }`),
+which links the **system** FFmpeg dev libraries. Install them:
+
+```bash
+sudo apt update
+sudo apt install -y libavcodec-dev libavformat-dev libavutil-dev libswresample-dev pkg-config ffmpeg
+```
+
+Verify the toolchain (the XMA path needs the `xma1`/`xma2` decoders — present in distro FFmpeg ≥ ~2015):
+```bash
+pkg-config --modversion libavcodec          # expect 60.x (this branch built against 60.31)
+ffmpeg -hide_banner -decoders | grep -iE 'xma1|xma2'   # expect: A....D xma1 / A....D xma2
+```
+(Decoder note: distro FFmpeg has **no** `AV_CODEC_ID_XMAFRAMES`; we use `AV_CODEC_ID_XMA2` — see
+`crates/xenia-apu/src/xma2_codec.rs`.) On non-Debian distros install the equivalent `-dev` packages.
+
+### b) The game ISO (gitignored — `*.iso`)
+Not in the repo. Place the Project Sylpheed ISO somewhere and create a `sylpheed.iso` symlink to it
+in the repo root (the run/test commands use `sylpheed.iso`):
+```bash
+ln -s "/path/to/Project Sylpheed - Arc of Deception (USA, Europe) (En,Ja).iso" sylpheed.iso
+```
+⚠️ For **canary** runs, point at the REAL ISO path, not the symlink (Wine can't resolve the symlink).
+
+### c) Build — **always cap parallelism** (a default `-j` build OOM-crashed a 15 GB box)
+```bash
+export CARGO_BUILD_JOBS=4        # NEVER default -j12; check `free -h` first, drop to -j2 if <4GB free
+cargo build --release
+```
+
+### d) Regenerate the static-analysis DB `sylpheed.db` (gitignored — `*.db`, ~586 MB, ~1h35m)
+Used by the RE/analysis queries (NOT needed to run the emulator). Rebuild from the ISO:
+```bash
+cargo run --release -- dis "/path/to/<the ISO>" --db sylpheed.db
+# analysis passes run in <1s; the ~1h35m is DuckDB persisting ~1.8M dispatch rows. Be patient.
+```
+This branch's analyzer fix (see §3) makes the regenerated DB include the previously-missing XMV
+engine vtables (`0x8200a1e8`/`0x8200a908`). A local pre-fix backup may exist as
+`sylpheed.db.bak-pre-vtablefix` (gitignored, not pushed).
+
+---
+
+## 1. WHAT'S ON THIS BRANCH (all in this one commit, on top of `acb29db` = iterate-3AL)
+**Milestone-1 renderer history** (publisher/dev splash renders) is in the ancestry (iterate-2x → 3M →
+3O → 3AL); pushing this branch carries it. **Milestone 2** + tooling added here:
+
+### ✅ XMA AUDIO path — BUILT, WORKING, deterministic, tested
+- `crates/xenia-apu/src/xma.rs` — register-mapped XMA context system (MMIO `0x7FEA0000`, 320×64B
+  context array, Kick/Lock/Clear decode). `xma_decode.rs` + `xma2_codec.rs` — the real FFmpeg
+  `xma2` decoder (XMA_CONTEXT_DATA bitfields, BitStream packet parse, planar-f32→S16BE PCM).
+  Decode runs synchronously on the CPU thread (deterministic, no host thread). Wired via
+  `KernelState.xma` (`state.rs`), exports (`exports.rs`), `xaudio.rs` (`XAudioSubmitRenderDriverFrame`
+  made faithful), `main.rs` (MMIO install + per-round pump).
+- **Audio-worker scheduler fix** (`main.rs` LR_HALT restore + `scheduler.rs`): the XAudio render
+  callback worker was wrongly exited after ~2 deliveries → fixed → the guest now drives XMA decode.
+- Verified: real PCM out; golden `sylpheed_n50m` **re-baselined** (`crates/xenia-app/tests/golden/`)
+  and PASSES; milestone-1 splash intact; apu/cpu/kernel tests pass.
+
+### 🛠️ RE TOOLING (this branch's lasting wins)
+- **Runtime dispatch-recorder** `crates/xenia-cpu/src/dispatch_rec.rs` — records `(call-site → target,
+  r3, lr)` for every indirect (`bcctr`-family) call. Off by default; enable with `XENIA_DISPATCH_REC=1`,
+  optional filters `XENIA_DISPATCH_REC_TARGETS=<hex,…>` / `_SITES=<hex,…>`, dumps to
+  `XENIA_DISPATCH_REC_OUT` (default `/tmp/dispatch_rec.txt`). Deterministic, observe-only.
+- **Repaired static analyzer** `crates/xenia-analysis/src/vtables.rs` — the vtable extractor silently
+  **fragmented vtables with non-function head slots** (missed the XMV engine vtable entirely →
+  blocked ~6 investigations). Fixed via **vptr-write-anchoring** (find `addis/addi → stw rX,0(rThis)`
+  constant-vptr installs; read the fnptr run from each anchor). Result on rebuild: vtables 722→1150,
+  dispatch candidates 688K→1.83M, engine fully typed. (Requires the §0d DB rebuild to take effect.)
+- **Probe Heisenbug FIXED** (`main.rs run_superblock`) — `--audit-pc-probe-hex` / `--mem-watch` used to
+  **disable superblock chaining**, which changed thread scheduling and *starved the movie subsystem*
+  so the probes couldn't observe it. Now probes fire *inside* the chain loop → scheduling is identical
+  armed-vs-unarmed (verified byte-identical golden) → the probe suite is finally usable on the movie
+  subsystem. Also fixed a `--quiet` bug that swallowed armed `--trace-handles`/`--dump-addr` reports.
+
+---
+
+## 2. CURRENT STATE & WHERE TO CONTINUE (the video still doesn't play)
+**Audio works; the intro VIDEO doesn't play yet.** Root cause (pinned at runtime): the 2000 ms
+readiness poll (`sub_821B66B8`) times out and gives up because the XMV engine (`0x40d101c0`, runtime
+vtable `0x8200a1e8`) never **primes** — engine begin-playback `sub_825076F0` (slot 21) is **never
+dispatched** (0 calls), so the per-frame full-start always takes its skip branch and the playback clock never starts.
+- **Classification: (B) guest-side state machine.** The gate fields are the engine's *correct* reset
+  defaults → there is **NO honest our-side fix at the gate** (forcing them = masking, forbidden). The
+  defect is upstream: the guest SM reaches "create decoder (success)" but never issues begin-playback.
+- **Latest narrowing (evidence, fixed probes):** ARM2-setup `sub_821B55D8` runs once, create-decoder
+  `sub_824F8398` succeeds, and ARM2 then calls engine-setup wrappers
+  **`sub_824F7778` / `sub_824F7630` / `sub_824F7558` / `sub_824F7538` / `sub_824FCB68`** (on
+  `[movie+104]`=engine) — the begin-playback dispatch is gated **inside one of these**. **The next
+  step** is to trace them (now possible with the fixed probes) to locate the begin-playback gate and
+  determine why our run never satisfies it. The likely ultimate unlock is **measuring canary** (the
+  same XEX reaches begin-playback there) to find the upstream state/signal we don't produce.
+
+Full, evidence-grounded detail (engine/vtable/slot map, the eliminations, the investigation arc, the
+method lessons) lives in the agent-memory grounding file referenced in the project memory index
+(`milestone2_xma_grounding`). Key anchors: engine `0x40d101c0` vtable `0x8200a1e8` — PUMP slot19
+`sub_825078D8`, begin-playback slot21 `sub_825076F0`, submit slot27 `sub_82505C08`, full-start slot40
+`sub_825061E0`; movie host `0x40bb0440` (engine at `[host+104]`); SM ARM1 `sub_821B4C98` → ARM2
+`sub_821B55D8` → ARM3 `sub_821B5FB8` → poll `sub_821B66B8`.
+
+### Useful commands
+```bash
+# Headless run to the video state (~30-40s, ~1B instr); add diagnostic flags as needed:
+./target/release/xenia-rs exec sylpheed.iso -n 6000000000 --quiet
+# Non-perturbing PC probes (now usable on the movie subsystem):
+RUST_LOG=warn,xenia_apu=info XENIA_AUDIT_PC_PROBE=0x825078d8,0x82505c08 \
+  ./target/release/xenia-rs exec sylpheed.iso -n 6000000000 --quiet
+# Dispatch recorder (filtered):
+XENIA_DISPATCH_REC=1 XENIA_DISPATCH_REC_TARGETS=0x825076f0,0x82505c08 \
+  ./target/release/xenia-rs exec sylpheed.iso -n 6000000000 --quiet
+# Golden / determinism check:
+CARGO_BUILD_JOBS=4 cargo test -p xenia-app --release --test sylpheed_oracles -- --ignored sylpheed_n50m
+# Visual (watch the splash; ASK a human to watch — never self-screenshot):
+./target/release/xenia-rs exec sylpheed.iso --ui
+```
+⚠️ Probe/run discipline: kill background runs by pid or `pkill -x xenia-rs` (NEVER `pkill -f`, it
+self-matches the launcher). Runs are deterministic (instruction-count clock).
+
+🤖 Generated with [Claude Code](https://claude.com/claude-code)
--- a/crates/xenia-analysis/src/vtables.rs
+++ b/crates/xenia-analysis/src/vtables.rs
@@ -26,6 +26,14 @@ use xenia_xex::pe::PeSection;

 use crate::demangle;

+/// Maximum number of consecutive non-function slots tolerated inside an
+/// anchor-recovered vtable before the run is considered terminated. MSVC
+/// vtables can carry null / pure-virtual / unrecognised-thunk slots in their
+/// head or interior; a small budget lets those through without merging two
+/// physically-adjacent vtables. Kept small to avoid bridging the gap between
+/// distinct tables.
+const MAX_ANCHOR_GAP: usize = 2;
+
 /// One detected vtable.
 #[derive(Debug, Clone)]
 pub struct Vtable {
@@ -56,6 +64,35 @@ pub fn analyze(
    image_base: u32,
    sections: &[PeSection],
    function_starts: &std::collections::BTreeSet<u32>,
+) -> Vec<Vtable> {
+    analyze_with_anchors(pe, image_base, sections, function_starts, &std::collections::BTreeSet::new())
+}
+
+/// Like [`analyze`], but additionally recovers vtables whose base address is
+/// known a-priori from a constructor vptr-write store (an "anchor"). The
+/// contiguity heuristic in pass 1 fragments any vtable whose head region
+/// contains words that don't resolve to recognised function entries (null /
+/// pure-virtual / unrecognised thunk slots); those vtables are never emitted
+/// and the downstream typed-dispatch resolver can't type objects of that
+/// class. An anchor is a *content-independent* vtable signal — the ctor
+/// literally installs `vtable_base` into `this+0` via
+/// `addis/addi (or lis/ori) → stw rX, 0(rThis)` — so for every anchor not
+/// already covered by a pass-1 run we synthesise a vtable starting at that
+/// base, reading the fnptr-array run while *tolerating* up to
+/// [`MAX_ANCHOR_GAP`] consecutive non-function slots before terminating.
+///
+/// `anchors` are absolute VAs of vtable bases (from
+/// [`scan_vptr_write_constants`]). Existing pass-1 vtables are kept unchanged
+/// (no regression): an anchor that already coincides with a detected vtable
+/// base is skipped, and an anchor that lands *inside* an existing run is also
+/// skipped (it's a sub-object pointer, not a fresh table).
+#[tracing::instrument(skip_all, fields(image_base = format_args!("{:#010x}", image_base)))]
+pub fn analyze_with_anchors(
+    pe: &[u8],
+    image_base: u32,
+    sections: &[PeSection],
+    function_starts: &std::collections::BTreeSet<u32>,
+    anchors: &std::collections::BTreeSet<u32>,
 ) -> Vec<Vtable> {
    let started = std::time::Instant::now();
    // Sections we'll scan for vtable bodies.
@@ -117,6 +154,120 @@ pub fn analyze(
        let _ = (va_start, va_end);
    }

+    // --- Anchor-driven recovery (vptr-write-anchored vtables) ---
+    //
+    // Build a coverage interval set from pass-1 runs so we don't re-emit a
+    // table for an anchor that already lies within an extracted vtable.
+    let mut covered: Vec<(u32, u32)> = candidates
+        .iter()
+        .map(|v| (v.address, v.address + v.length * 4))
+        .collect();
+    covered.sort_unstable();
+
+    let is_covered = |addr: u32, covered: &[(u32, u32)]| -> bool {
+        covered.iter().any(|&(s, e)| addr >= s && addr < e)
+    };
+
+    // Section lookup for "which scan target contains this VA?"
+    let scan_targets_va: Vec<(u32, u32, usize, usize)> = sections
+        .iter()
+        .filter(|s| matches!(s.name.as_str(), ".rdata" | ".data"))
+        .map(|s| {
+            let va = image_base + s.virtual_address;
+            (
+                va,
+                va + s.virtual_size,
+                s.virtual_address as usize,
+                (s.virtual_address + s.virtual_size) as usize,
+            )
+        })
+        .collect();
+
+    // Cap a recovered run at the *next anchor* so two physically-adjacent
+    // anchored vtables don't merge. We deliberately do NOT cap at pass-1
+    // fragments: a fragment is a sub-run the contiguity scan carved out of a
+    // larger table, and the anchor legitimately re-absorbs it (subsumed
+    // fragments are removed afterwards).
+    let anchor_bases: std::collections::BTreeSet<u32> = anchors.iter().copied().collect();
+
+    let mut recovered = 0usize;
+    let mut newly: Vec<Vtable> = Vec::new();
+    for &anchor in anchors {
+        if is_covered(anchor, &covered) { continue; }
+        // Locate the containing .rdata/.data section.
+        let Some(&(va_lo, va_hi, raw_lo, raw_hi)) =
+            scan_targets_va.iter().find(|&&(lo, hi, _, _)| anchor >= lo && anchor < hi)
+        else { continue };
+        if anchor % 4 != 0 { continue; }
+        let raw_hi = raw_hi.min(pe.len());
+        // Read the fnptr-array run starting at the anchor. Tolerate small
+        // gaps of non-function slots (null / pure-virtual / unrecognised),
+        // but require the run to actually contain at least one real function
+        // (otherwise it's just data, not a vtable).
+        let next_base = anchor_bases.range((anchor + 4)..).next().copied();
+        let mut methods: Vec<u32> = Vec::new();
+        let mut gap = 0usize;
+        let mut real_fns = 0usize;
+        let mut off = (anchor - va_lo) as usize + raw_lo;
+        let mut va = anchor;
+        while off + 4 <= raw_hi && va < va_hi {
+            if let Some(nb) = next_base && va >= nb { break; }
+            let val = u32::from_be_bytes([pe[off], pe[off + 1], pe[off + 2], pe[off + 3]]);
+            if function_starts.contains(&val) {
+                methods.push(val);
+                real_fns += 1;
+                gap = 0;
+            } else {
+                // A non-function slot. Keep the slot (so downstream slot
+                // indexing stays aligned) but count toward the gap budget.
+                gap += 1;
+                if gap > MAX_ANCHOR_GAP {
+                    // Drop the trailing gap slots — they belong past the
+                    // table's end.
+                    methods.truncate(methods.len().saturating_sub(gap - 1));
+                    break;
+                }
+                methods.push(val);
+            }
+            off += 4;
+            va += 4;
+        }
+        // Trim any trailing non-function slots (the table ends at its last
+        // real method).
+        while methods.last().is_some_and(|&m| !function_starts.contains(&m)) {
+            methods.pop();
+        }
+        if real_fns == 0 || methods.is_empty() { continue; }
+        let length = methods.len() as u32;
+        newly.push(Vtable {
+            address: anchor,
+            length,
+            col_address: None,
+            class_name: synth_anon_name(&methods),
+            rtti_present: false,
+            base_classes_json: None,
+            methods,
+        });
+        recovered += 1;
+    }
+    if recovered > 0 {
+        // Drop pass-1 fragments fully subsumed by a recovered (anchored)
+        // vtable — the anchor base is authoritative and the fragment was a
+        // contiguity-scan artifact of the same table. Keep fragments that
+        // only partially overlap (defensive; shouldn't happen for true
+        // sub-runs) so we never lose method coverage.
+        let recovered_spans: Vec<(u32, u32)> =
+            newly.iter().map(|v| (v.address, v.address + v.length * 4)).collect();
+        candidates.retain(|v| {
+            !recovered_spans
+                .iter()
+                .any(|&(s, e)| v.address >= s && v.address + v.length * 4 <= e)
+        });
+        candidates.extend(newly);
+        tracing::info!(recovered, "vtables recovered from vptr-write anchors");
+    }
+    let _ = &covered;
+
    // RTTI walk: for each candidate, look at vtable[-1].
    let pe_image_base = image_base;
    for v in &mut candidates {
@@ -268,6 +419,98 @@ fn read_class_hierarchy(
    serde_json::to_string(&names).ok()
 }

+/// Pre-pass: discover candidate vtable *bases* from constructor vptr-write
+/// stores, independent of the static contiguity heuristic. A vptr install is
+/// the canonical `addis/addi` (or `lis/ori`) immediate build of a constant
+/// pointing into `.rdata` / `.data`, followed by `stw rX, 0(rThis)` — i.e. the
+/// ctor writing the vtable pointer to `this+0`. We return the set of such
+/// constants; these are fed to [`analyze_with_anchors`] so a vtable with
+/// non-function head words isn't lost.
+///
+/// We only consider stores at displacement 0 (the primary vptr; secondary
+/// MI vptrs land at non-zero offsets and are handled by the existing
+/// contiguity scan / typed-dispatch resolver well enough). The register
+/// tracker mirrors the lis+addi propagation used elsewhere and is reset at
+/// every basic-block boundary (`block_boundaries`).
+///
+/// # Parameters
+/// - `pe`: image bytes; an instruction at VA `a` is read big-endian from
+///   offset `a - image_base`, so the buffer must be laid out by RVA.
+/// - `image_base`: VA corresponding to offset 0 of `pe`.
+/// - `functions`: `start -> (end, is_saverestore)`. Each function is walked
+///   linearly over `[start, end)`; entries flagged `is_saverestore` are skipped.
+/// - `sections`: only sections named `.rdata` / `.data` define the address
+///   ranges a stored constant must fall in to count as an anchor.
+/// - `block_boundaries`: PCs at which all tracked registers are forgotten
+///   (the function's own start PC is exempt).
+///
+/// # Register tracking
+/// A 32-entry table holds the known constant (if any) of each GPR:
+/// - `addis` / `addi` / `ori` build or extend a constant;
+/// - `or rA,rS,rS` (`mr`) copies the tracked value of `rS` into `rA`; any
+///   other `or` makes `rA` unknown;
+/// - D-form loads (opcodes 32..=35, 40..=43, 48..=51) and every other
+///   opcode-31 instruction except `mtspr` make the register named by
+///   bits 21..25 unknown;
+/// - `b` / `bc` with the LK bit set (a call) make `r0..=r12` unknown.
+///
+/// NOTE(review): other instructions that write `rA` (e.g. `rlwinm`, `andi.`,
+/// the remaining X-form logical ops, update-form loads) are not invalidated,
+/// so a stale constant can still reach a `stw`. Such a spurious anchor is
+/// only dropped downstream if the run read at that address contains no known
+/// function start — confirm this is acceptable.
+pub fn scan_vptr_write_constants(
+    pe: &[u8],
+    image_base: u32,
+    functions: &std::collections::BTreeMap<u32, (u32, bool)>, // start -> (end, is_saverestore)
+    sections: &[PeSection],
+    block_boundaries: &std::collections::HashSet<u32>,
+) -> std::collections::BTreeSet<u32> {
+    // Ranges that a vtable base may legitimately live in.
+    let data_ranges: Vec<(u32, u32)> = sections
+        .iter()
+        .filter(|s| matches!(s.name.as_str(), ".rdata" | ".data"))
+        .map(|s| (image_base + s.virtual_address, image_base + s.virtual_address + s.virtual_size))
+        .collect();
+    let in_data = |a: u32| data_ranges.iter().any(|&(s, e)| a >= s && a < e);
+
+    // Primary opcodes (instruction bits 26..31).
+    const OP_ADDI: u32 = 14;
+    const OP_ADDIS: u32 = 15;
+    const OP_ORI: u32 = 24;
+    const OP_STW: u32 = 36;
+    const OP_X_FORM: u32 = 31;
+    // Extended opcodes under primary opcode 31.
+    const XO_OR: u32 = 444;
+    const XO_MTSPR: u32 = 467;
+
+    // Fetch one big-endian instruction word; `None` once `addr` leaves `pe`.
+    let read = |addr: u32| -> Option<u32> {
+        let off = addr.wrapping_sub(image_base) as usize;
+        if off + 4 > pe.len() { return None; }
+        Some(u32::from_be_bytes([pe[off], pe[off + 1], pe[off + 2], pe[off + 3]]))
+    };
+
+    let mut anchors: std::collections::BTreeSet<u32> = std::collections::BTreeSet::new();
+    for (&fn_start, &(fn_end, is_saverestore)) in functions {
+        if is_saverestore { continue; }
+        let mut reg: [Option<u32>; 32] = [None; 32];
+        let mut pc = fn_start;
+        while pc < fn_end {
+            // Constants are only trusted within a single basic block.
+            if pc != fn_start && block_boundaries.contains(&pc) {
+                reg = [None; 32];
+            }
+            let Some(instr) = read(pc) else { break };
+            let op = instr >> 26;
+            // Bits 21..25: rD for arithmetic/loads, rS for stores and logical ops.
+            let rd = ((instr >> 21) & 0x1F) as usize;
+            let ra = ((instr >> 16) & 0x1F) as usize;
+            let simm = ((instr & 0xFFFF) as i16) as i32;
+            let uimm = instr & 0xFFFF;
+            match op {
+                // rA == 0 means the literal 0 (`lis` / `li`), not register r0.
+                OP_ADDIS if ra == 0 => reg[rd] = Some(uimm << 16),
+                OP_ADDIS => reg[rd] = reg[ra].map(|b| b.wrapping_add(uimm << 16)),
+                OP_ADDI if ra != 0 => reg[rd] = reg[ra].map(|b| b.wrapping_add(simm as u32)),
+                OP_ADDI => reg[rd] = Some(simm as u32),
+                OP_ORI => {
+                    // `ori rA, rS, UIMM`: the destination is rA.
+                    let rs = rd;
+                    reg[ra] = reg[rs].map(|b| b | uimm);
+                }
+                OP_STW => {
+                    // `stw rS, off(rA)` with displacement 0 = primary vptr install.
+                    if ra != 0
+                        && simm == 0
+                        && let Some(val) = reg[rd]
+                        && in_data(val)
+                    {
+                        anchors.insert(val);
+                    }
+                }
+                // D-form loads overwrite their target with an unknown value.
+                32..=35 | 40..=43 | 48..=51 => reg[rd] = None,
+                OP_X_FORM => {
+                    let xo = (instr >> 1) & 0x3FF;
+                    match xo {
+                        XO_OR => {
+                            // `or rA, rS, rB` writes rA. With rS == rB it is
+                            // `mr rA, rS`, so the tracked constant moves with
+                            // it; otherwise the result is unknown. Leaving rA
+                            // untouched would let a stale constant survive
+                            // the overwrite and reach a later `stw`.
+                            let rs = rd;
+                            let rb = ((instr >> 11) & 0x1F) as usize;
+                            reg[ra] = if rs == rb { reg[rs] } else { None };
+                        }
+                        // `mtspr` only reads rS; no GPR changes.
+                        XO_MTSPR => {}
+                        _ => reg[rd] = None,
+                    }
+                }
+                // `b` (18) / `bc` (16) with LK set is a call: r0..=r12 may be clobbered.
+                18 | 16 => {
+                    if (instr & 1) != 0 {
+                        for r in 0..=12 { reg[r] = None; }
+                    }
+                }
+                _ => {}
+            }
+            pc = pc.wrapping_add(4);
+        }
+    }
+    anchors
+}
+
 /// Synthetic name for an RTTI-stripped vtable, derived from a stable hash of
 /// the sorted method-PC list. Two vtables with identical method ordering
 /// collapse to the same anonymous name.
@@ -385,6 +628,112 @@ mod tests {
        assert!(!vtables[0].rtti_present);
    }

+    #[test]
+    fn anchor_recovers_vtable_with_nonfn_head() {
+        // Scenario: the first two slots of the table (a null and an
+        // unrecognised word) are not function entries, so the contiguity scan
+        // (≥3 contiguous known fns) cannot see the table at its real base.
+        // An anchor taken from a ctor vptr-write must recover it in full.
+        let image_base = 0x82000000u32;
+        let (rdata_va, rdata_size) = (0x1000u32, 0x40u32);
+        let (text_va, text_size) = (0x2000u32, 0x100u32);
+        let mut pe = vec![0u8; (text_va + text_size) as usize];
+
+        let text_base = image_base + text_va;
+        let [f0, f1, f2] = [text_base, text_base + 0x10, text_base + 0x20];
+        // Slots: [null, NONFN(0xDEAD), f0, f1, f2], written big-endian at the
+        // start of .rdata.
+        let table_bytes: Vec<u8> = [0u32, 0xDEADBEEF, f0, f1, f2]
+            .iter()
+            .flat_map(|slot| slot.to_be_bytes())
+            .collect();
+        let table_start = rdata_va as usize;
+        pe[table_start..table_start + table_bytes.len()].copy_from_slice(&table_bytes);
+
+        // Raw layout mirrors the virtual layout (raw_offset == virtual_address).
+        let section = |name: &str, va: u32, size: u32, flags: u32| PeSection {
+            name: name.into(),
+            virtual_address: va,
+            virtual_size: size,
+            raw_offset: va,
+            raw_size: size,
+            flags,
+        };
+        let sections = vec![
+            section(".rdata", rdata_va, rdata_size, 0x4000_0040),
+            section(".text", text_va, text_size, 0x6000_0020),
+        ];
+        let function_starts: std::collections::BTreeSet<u32> =
+            [f0, f1, f2].into_iter().collect();
+        let true_base = image_base + rdata_va;
+
+        // No anchor: the only contiguous run is [f0,f1,f2] beginning at
+        // +0x08, so pass-1 may emit a table there (0x...1008) but never at
+        // the true base.
+        let no_anchor = analyze(&pe, image_base, &sections, &function_starts);
+        assert!(
+            no_anchor.iter().all(|v| v.address != true_base),
+            "without anchor the table is not recovered at its true base"
+        );
+
+        // Anchor supplied at the true base.
+        let anchors: std::collections::BTreeSet<u32> = std::iter::once(true_base).collect();
+        let with_anchor =
+            analyze_with_anchors(&pe, image_base, &sections, &function_starts, &anchors);
+        let recovered = with_anchor
+            .iter()
+            .find(|v| v.address == true_base)
+            .expect("anchor must recover vtable at its true base");
+        // The table runs through f2 in slot 4, i.e. 5 slots in total.
+        assert_eq!(recovered.length, 5, "table spans null/nonfn head through last fn");
+        assert_eq!(recovered.methods[2], f0);
+        assert_eq!(recovered.methods[4], f2);
+    }
+
+    #[test]
+    fn scan_vptr_write_constants_finds_ctor_store() {
+        // Encode a ctor: addis r11,r0,0x8201; addi r11,r11,lo; stw r11,0(r31)
+        // installing vtable base 0x8200A908 into this+0.
+        let image_base = 0x82000000u32;
+        let ctor = 0x82001000u32;
+        let mut pe = vec![0u8; 0x4000];
+        let vt_base = 0x8200A908u32; // 0x82010000 - 22264
+        // The high half is 0x8201 rather than 0x8200 because `addi`
+        // sign-extends its 16-bit immediate: 0xA908 is negative as an i16, so
+        // the `addis` value must be one 64 KiB step above the target.
+        let addis = (15u32 << 26) | (11 << 21) | (0 << 16) | 0x8201;
+        let lo = (vt_base & 0xFFFF) as i16; // -22264
+        // `addi r11,r11,lo` — rA must be r11 (not r0) so the low half is
+        // added onto the `addis` result instead of replacing it.
+        let addi = (14u32 << 26) | (11 << 21) | (11 << 16) | ((lo as u16) as u32);
+        let stw = (36u32 << 26) | (11 << 21) | (31 << 16) | 0; // stw r11,0(r31)
+        let at = (ctor - image_base) as usize;
+        for (i, word) in [addis, addi, stw].iter().enumerate() {
+            pe[at + i * 4..at + (i + 1) * 4].copy_from_slice(&word.to_be_bytes());
+        }
+
+        // Declare a tiny .rdata at 0x...A900 so the constant lands in-range.
+        // Only the section bounds matter: the scan reads instruction words
+        // from `pe`, never the table itself.
+        let sections = vec![PeSection {
+            name: ".rdata".into(),
+            virtual_address: 0xA900,
+            virtual_size: 0x200,
+            raw_offset: 0xA900,
+            raw_size: 0x200,
+            flags: 0x4000_0040,
+        }];
+        let mut funcs: std::collections::BTreeMap<u32, (u32, bool)> = std::collections::BTreeMap::new();
+        funcs.insert(ctor, (ctor + 0x40, false));
+        let anchors = scan_vptr_write_constants(
+            &pe, image_base, &funcs, &sections, &std::collections::HashSet::new(),
+        );
+        assert!(anchors.contains(&vt_base), "ctor vptr store must yield anchor {vt_base:#x}, got {anchors:?}");
+    }
+
    #[test]
    fn rejects_2_method_run() {
        let image_base = 0x82000000u32;
--- a/crates/xenia-app/src/main.rs
+++ b/crates/xenia-app/src/main.rs
@@ -415,6 +415,18 @@ fn main() -> Result<()> {
    // metrics summary.
    let _obs = observability::init(&config)?;

+    // Env-gated indirect-dispatch recorder (off by default). Resolve the env
+    // once here; a scope guard dumps the recorded (call_site -> target) table
+    // at end-of-run no matter how the run terminates.
+    xenia_cpu::dispatch_rec::install();
+    struct DispatchRecGuard;
+    impl Drop for DispatchRecGuard {
+        fn drop(&mut self) {
+            xenia_cpu::dispatch_rec::dump();
+        }
+    }
+    let _dispatch_rec_guard = DispatchRecGuard;
+
    let result = match cli.command {
        Commands::Disasm { path, count, at } => cmd_disasm(&path, count, at),
        Commands::Exec {
@@ -1437,6 +1449,45 @@ fn cmd_exec_inner(
    // atoms that live inside `kernel.gpu.mmio`.
    mem.add_mmio_region(xenia_gpu::build_mmio_region(kernel.gpu.mmio()));

+    // apu stage 1 — reserve the 320-entry XMA context array and install the
+    // `0x7FEA0000` register aperture (mirrors canary's `XmaDecoder::Setup`).
+    //
+    // Physical placement: canary stores a *physical* address in
+    // `ContextArrayAddress` (reg 0x600) — `PhysicalHeap::GetPhysicalAddress`
+    // returns `va - heap_base` (== `va & 0x1FFFFFFF` for the physical heaps).
+    // Our memory model is FLAT: `translate_virtual` is a raw `membase + addr`
+    // with no separate physical-window mirror, and `translate_physical` masks
+    // `& 0x1FFFFFFF` — so the two only coincide for low (`< 0x2000_0000`) VAs.
+    // `heap_alloc` returns a `0x40000000`-region VA, so `va & 0x1FFFFFFF` would
+    // not be `va` (disagreeing with the context pointers `XMACreateContext` hands out
+    // at `va + i*64`). The guest reads `ContextArrayAddress` and indexes it as
+    // `base + i*64`; for that to equal the pointers it dereferences, the base
+    // MUST equal the VA. So we advertise `va` itself — self-consistent in the
+    // flat model (the guest reaches every context through the same VA space).
+    // Stage 3's decoder will read the context structs via this VA directly
+    // (not via `translate_physical`). The 20480-byte buffer is page-committed
+    // by `heap_alloc`, so the guest never faults writing the 64-byte structs.
+    {
+        let array_size =
+            (xenia_apu::XMA_CONTEXT_COUNT as u32) * xenia_apu::XMA_CONTEXT_SIZE; // 320 * 64
+        match kernel.heap_alloc(array_size, &mem) {
+            Some(va) => {
+                let phys = va; // flat model: array base == VA (see note above)
+                kernel.xma.lock().unwrap().init(va, phys);
+                mem.add_mmio_region(xenia_apu::build_mmio_region(kernel.xma.clone()));
+                tracing::info!(
+                    va = format_args!("{va:#010x}"),
+                    phys = format_args!("{phys:#010x}"),
+                    size = format_args!("{array_size:#x}"),
+                    "xma: context array reserved + 0x7FEA0000 aperture installed"
+                );
+            }
+            None => {
+                tracing::error!("xma: failed to reserve context array (heap exhausted)");
+            }
+        }
+    }
+
    // Install the initial guest thread on HW slot 0. The thread handle we
    // hand the scheduler isn't visible to any guest API yet, but joiners
    // (XThreadWait-style) will see it via `find_by_tid`.
@@ -2354,6 +2405,14 @@ fn coord_post_round(
        let _ = gpu_runs;
    }

+    // APU stage 3 — pump the XMA decoder on the CPU thread, same cadence as the
+    // inline GPU. Deterministic (no host thread / clock): for each context with
+    // a pending kick it runs one Work() pass, decoding the guest's XMA packets
+    // into PCM and writing it back into the output ring + context struct.
+    if let Ok(mut xma) = kernel.xma.try_lock() {
+        xma.decode_pending(mem);
+    }
+
    if kernel.gpu.has_pending_interrupts() {
        for pi in kernel.gpu.take_pending_interrupts() {
            // Canary `ExecutePacketType3_INTERRUPT` dispatches the callback
@@ -2445,7 +2504,7 @@ fn worker_prologue(
    stats: &mut ExecStats,
 ) -> PrologueOutcome {
    use xenia_cpu::interpreter::{step_cached, StepResult};
-    use xenia_cpu::scheduler::{HwState, INITIAL_GUEST_TID};
+    use xenia_cpu::scheduler::{BlockReason, HwState, INITIAL_GUEST_TID};
    use xenia_cpu::PpcOpcode;
    const LR_HALT: u32 = xenia_cpu::context::LR_HALT_SENTINEL as u32;

@@ -2492,12 +2551,26 @@ fn worker_prologue(

    // 1) Halt-sentinel check (per HW thread).
    if pc == LR_HALT {
+        // iterate-4A: the async audio-callback injection (`try_inject_audio_callback`)
+        // sets `interrupts.saved`/`injected_ref` to the dedicated audio
+        // worker and runs REAL guest code (`sub_824D29F0`, which calls
+        // blocking kernel APIs) across MANY scheduler rounds before
+        // returning to `LR_HALT_SENTINEL`. The restore must fire only when
+        // the thread that *actually* reached the sentinel is the injected
+        // worker itself — i.e. the FULL `ThreadRef` (hw_id AND idx), which
+        // `scheduler.current` holds after `begin_slot_visit`. Matching on
+        // `hw_id` alone let ANY OTHER thread sharing that HW slot reach
+        // `LR_HALT` and consume the audio worker's `saved` slot; when the
+        // worker later truly returned, `saved` was already `None`, the
+        // guard failed, and control fell through to "marking exited" — the
+        // worker was removed and every subsequent audio callback dropped
+        // (`find_by_handle` skips Exited threads). The graphics ISR path is
+        // fully synchronous (`dispatch_graphics_interrupts` restores inline
+        // and never leaves `interrupts.saved` set across rounds), so this
+        // restore lifecycle is exclusive to audio and graphics is
+        // unaffected.
        let injected_here = kernel.interrupts.saved.is_some()
-            && kernel
-                .interrupts
-                .injected_ref
-                .map(|r| r.hw_id == hw_id)
-                == Some(true);
+            && kernel.interrupts.injected_ref == kernel.scheduler.current;
        if injected_here
            && let Some(saved) = kernel.interrupts.saved.take()
        {
@@ -2509,17 +2582,64 @@ fn worker_prologue(
            kernel.interrupts.delivered += 1;
            let source = saved.source;
            let mut restore_outcome = "ready";
+
+            // iterate-4A: the dedicated audio worker's canonical resting
+            // state is "parked on its synthetic handle, awaiting the next
+            // callback injection". The callback (`sub_824D29F0`) runs real
+            // guest code that can be flipped `ServicingIrq -> Ready` by an
+            // intervening `wake_ref` (a `KeSetEvent`/timeout targeting the
+            // worker as a waiter mid-callback). The old re-block heuristic
+            // only re-parked when the state was *still* `ServicingIrq`, so
+            // such a wake left the worker `Ready` — it then ran its thread
+            // entry to the `LR_HALT` sentinel, EXITED, and every subsequent
+            // callback dropped (`find_by_handle` skips Exited workers),
+            // wedging the intro-video audio→XMA pipeline. When this restore
+            // is an audio callback (`source == INTERRUPT_SOURCE_AUDIO`),
+            // re-park the worker UNCONDITIONALLY onto its synthetic
+            // park-handle so it survives to receive the next fire. (Graphics
+            // restores keep the `ServicingIrq`-only re-block: a graphics
+            // victim is a borrowed real thread, not a parked worker, and the
+            // old behavior there must stay byte-identical.)
+            if source == xenia_kernel::INTERRUPT_SOURCE_AUDIO {
+                let worker_handle =
+                    kernel.scheduler.thread(target_ref).thread_handle;
+                let index = worker_handle.and_then(|h| {
+                    kernel
+                        .xaudio
+                        .worker_handles
+                        .iter()
+                        .position(|wh| *wh == Some(h))
+                });
+                if let Some(index) = index {
+                    let park = xenia_kernel::xaudio::synthetic_park_handle(index);
+                    kernel.scheduler.thread_mut(target_ref).state =
+                        HwState::Blocked(BlockReason::WaitAny {
+                            handles: vec![park],
+                            deadline: None,
+                        });
+                    restore_outcome = "reparked";
+                } else if let HwState::ServicingIrq(reason) =
+                    kernel.scheduler.thread(target_ref).state.clone()
+                {
+                    // Fallback (handle unresolved): preserve the legacy
+                    // ServicingIrq-only re-block rather than leak the worker.
+                    kernel.scheduler.thread_mut(target_ref).state =
+                        HwState::Blocked(reason);
+                    restore_outcome = "reblocked";
+                }
+            } else {
                let current = kernel.scheduler.thread(target_ref).state.clone();
                if let HwState::ServicingIrq(reason) = current {
                    kernel.scheduler.thread_mut(target_ref).state =
                        HwState::Blocked(reason);
                    restore_outcome = "reblocked";
                }
+            }
            tracing::debug!(
                source,
                hw_id,
                outcome = restore_outcome,
-                "graphics interrupt: callback returned"
+                "interrupt: callback returned"
            );
            return PrologueOutcome::Continue;
        }
@@ -2905,12 +3025,55 @@ fn run_superblock(

    let budget = superblock_budget();

-    // Probe / mem-watch / debugger-hook modes need per-block-entry
-    // observability; in those modes never chain (run exactly one block,
-    // identical to the pre-superblock behaviour). The block-cache fast
-    // path is only entered when hooks/DB are off anyway, but a probe or
-    // mem-watch can be armed alongside it.
-    let chain_allowed = !kernel.any_probe_active() && !mem.has_mem_watch();
+    // Heisenbug fix (toolkit audit, 2026-06-21): probes and mem-watch are
+    // OBSERVE-ONLY diagnostics and must NOT change guest scheduling. The
+    // previous implementation disabled superblock chaining whenever any
+    // probe / mem-watch was armed (so the per-block-entry observation in
+    // `worker_prologue` was reached for every block). But chaining is what
+    // determines thread interleaving, so arming a probe perturbed the
+    // schedule — it starved the movie/XMV subsystem so it never reached the
+    // video state, making the probe useless on exactly the code we most
+    // needed to observe (`XENIA_SUPERBLOCK_BUDGET=1` reproduces the same
+    // starvation, confirming chaining is the lever).
+    //
+    // The fix fires the SAME per-block-entry observation INSIDE the chain
+    // loop, at every chained block's entry PC (see `fire_block_entry_probes`
+    // below), so chaining — and therefore scheduling — is byte-identical
+    // whether or not a probe is armed. `chain_allowed` no longer depends on
+    // the probe/mem-watch state.
+    //
+    // `wants_hooks()` (the interactive debugger / breakpoint path) still
+    // forces the per-instruction path in `worker_prologue` and never reaches
+    // `run_superblock`, so the only remaining reason to never chain here is
+    // the explicit budget==1 reproduction request.
+    let chain_allowed = budget > 1;
+
+    // Per-block-entry diagnostic observation, replicating exactly what
+    // `worker_prologue` does at the first block of a slot visit:
+    //   1. the four `fire_*_if_match` probe helpers (read-only; each
+    //      re-checks its own armed set against the live ctx PC), and
+    //   2. the mem-watch writer-context publish, so a watched store that
+    //      fires mid-block is attributed to the CORRECT chained block's
+    //      entry PC / LR (matching the single-block reporting granularity)
+    //      instead of the stale superblock-entry PC.
+    // The closure is a pure function of the live scheduler context; the
+    // caller must ensure `ctx.pc` equals the block-entry PC before calling.
+    let probe_hw_id = wc.hw_id;
+    let fire_block_entry_probes =
+        |kernel: &mut xenia_kernel::KernelState, mem: &xenia_memory::GuestMemory| {
+            let hw_id = probe_hw_id;
+            if kernel.any_probe_active() {
+                kernel.fire_ctor_probe_if_match(hw_id, mem);
+                kernel.fire_branch_probe_if_match(hw_id);
+                kernel.fire_audit_pc_probe_if_match(hw_id, mem);
+                kernel.fire_lr_trace_if_match(hw_id);
+            }
+            if mem.has_mem_watch() {
+                let ctx = kernel.scheduler.ctx(hw_id);
+                let tid_w = kernel.scheduler.tid(hw_id).unwrap_or(0);
+                xenia_memory::set_writer_ctx(tid_w, ctx.pc, ctx.lr as u32);
+            }
+        };

    let mut block_ptr = first_block_ptr;
    let mut pc_before = first_pc_before;
@@ -2955,11 +3118,20 @@ fn run_superblock(
            break (result, block_ptr, pc_before);
        }

-        // Chain: build/fetch the next block. Re-borrows `wc.block_cache`,
-        // which invalidates the previous `block_ptr` — but we've already
-        // finished using it (only `sync_sensitive`/diagnostics were read,
-        // above), so the raw-pointer aliasing rule is respected.
+        // Chain into the next block. `ctx.pc` now equals `next_pc` (the
+        // chained block's entry), so fire the per-block-entry observation
+        // BEFORE stepping it — identical to what `worker_prologue` did at
+        // the first block. This keeps the probe firing at EVERY armed
+        // block-entry while leaving the chaining decision (and thus the
+        // schedule) untouched. The first block was already observed by the
+        // prologue, so we only observe the newly-chained blocks here.
        pc_before = next_pc;
+        fire_block_entry_probes(kernel, mem);
+
+        // Build/fetch the next block. Re-borrows `wc.block_cache`, which
+        // invalidates the previous `block_ptr` — but we've already finished
+        // using it (only `sync_sensitive`/diagnostics were read, above), so
+        // the raw-pointer aliasing rule is respected.
        block_ptr = wc.block_cache.lookup_or_build(next_pc, mem) as *const _;
    };

@@ -2993,6 +3165,15 @@ fn run_execution(
    let mut stats = ExecStats::default();
    let _ = quiet; // retained for future per-kind suppression

+    // APU stage 3 — give the XMA decoder a stable pointer to the guest memory
+    // mapping `run_execution` runs against, so the kick MMIO write can run
+    // Work() synchronously (canary `!use_dedicated_xma_thread` semantics: the
+    // game observes the updated context the instant its kick store retires).
+    // `mem` outlives this call for both the headless and UI paths.
+    if let Ok(mut xma) = kernel.xma.lock() {
+        xma.set_memory(mem);
+    }
+
    // `--halt-on-deadlock` CLI flag OR `XENIA_HALT_ON_DEADLOCK=1|true` env var:
    // when the scheduler next hits a hard deadlock (every live HW thread
    // blocked on a handle wait with no pending timer) we bail out with a
@@ -4093,10 +4274,18 @@ fn dump_thread_diagnostic(
            ),
        }
    }
-    if quiet {
-        return;
-    }
    use xenia_kernel::objects::KernelObject;
+
+    // Toolkit-audit fix (2026-06-21): only the ALWAYS-ON thread/waiter table
+    // is suppressed by `--quiet`. The explicitly-armed diagnostics below
+    // (`--trace-handles`, `--trace-handles-focus`, `--dump-addr`) are
+    // requested output — arming the flag IS the user asking for it — and
+    // were previously swallowed by the blanket `if quiet { return; }`, which
+    // made the documented headless `--quiet` invocation silently drop every
+    // handle/focus/dump report. They are each self-gated below (on
+    // `audit.enabled` / `!audit.focus.is_empty()` / `!dump_addrs.is_empty()`)
+    // so they only print when actually armed.
+    if !quiet {
    println!("\n=== Thread diagnostics ===");
    for (hw_id, slot) in kernel.scheduler.slots.iter().enumerate() {
        if slot.runqueue.is_empty() {
@@ -4193,6 +4382,7 @@ fn dump_thread_diagnostic(
            println!("    cs={:#010x} waiters(tid)={:?}", cs_ptr, tids);
        }
    }
+    } // end `if !quiet` (always-on thread/waiter table)

    // Audit trails (only when --trace-handles flipped the flag). For each
    // tracked handle, emit a compact block: kind, creator, and the bounded
@@ -4868,8 +5058,23 @@ fn cmd_dis(
    // pointer-validity oracle; runs over .rdata + .data.
    let function_starts: std::collections::BTreeSet<u32> =
        func_analysis.functions.keys().copied().collect();
-    let vtables = xenia_analysis::vtables::analyze(
-        &pe_image, base, &sections, &function_starts,
+    // Anchor discovery: recover vtable bases from constructor vptr-write
+    // stores so a vtable with non-function head words (null / pure-virtual /
+    // unrecognised thunk slots) isn't fragmented away by the contiguity
+    // heuristic. (Fixes e.g. the XMV engine vtable 0x8200a908.)
+    let vptr_anchor_funcs: std::collections::BTreeMap<u32, (u32, bool)> = func_analysis
+        .functions
+        .iter()
+        .map(|(&s, fi)| (s, (fi.end, fi.is_saverestore)))
+        .collect();
+    let vptr_block_boundaries: std::collections::HashSet<u32> =
+        xref_result.labels.keys().copied().collect();
+    let vtable_anchors = xenia_analysis::vtables::scan_vptr_write_constants(
+        &pe_image, base, &vptr_anchor_funcs, &sections, &vptr_block_boundaries,
+    );
+    info!(vtable_anchors = vtable_anchors.len(), "vptr-write anchor scan complete");
+    let vtables = xenia_analysis::vtables::analyze_with_anchors(
+        &pe_image, base, &sections, &function_starts, &vtable_anchors,
    );
    let rtti_count = vtables.iter().filter(|v| v.rtti_present).count();
    info!(
--- a/crates/xenia-app/tests/golden/sylpheed_n50m.json
+++ b/crates/xenia-app/tests/golden/sylpheed_n50m.json
@@ -1,9 +1,9 @@
 {
-  "instructions": 50000110,
-  "imports": 243387,
+  "instructions": 50000200,
+  "imports": 189264,
  "unimpl": 0,
-  "draws": 1279,
-  "swaps": 260,
+  "draws": 768,
+  "swaps": 157,
  "unique_render_targets": 2,
  "shader_blobs_live": 6,
  "texture_cache_entries": 1
--- a/crates/xenia-apu/Cargo.toml
+++ b/crates/xenia-apu/Cargo.toml
@@ -6,5 +6,12 @@ license.workspace = true

 [dependencies]
 xenia-types = { workspace = true }
+xenia-memory = { workspace = true }
 tracing = { workspace = true }
 thiserror = { workspace = true }
+
+# Raw FFmpeg FFI for the XMA2 audio decoder (stage 3). The system libs are
+# FFmpeg 6.1 (libavcodec 60), so we pin the matching `6.1` series. The build
+# script runs bindgen on the installed headers, so the FFI matches the distro
+# FFmpeg exactly (the from-source `build` feature is off). Needs avcodec + avutil.
+ffmpeg-sys-next = { version = "6.1", default-features = false, features = ["avcodec"] }
--- a/crates/xenia-apu/src/lib.rs
+++ b/crates/xenia-apu/src/lib.rs
@@ -1,3 +1,9 @@
+pub mod xma;
+pub mod xma2_codec;
+pub mod xma_decode;
+
+pub use xma::{build_mmio_region, XmaDecoder, XMA_CONTEXT_COUNT, XMA_CONTEXT_SIZE};
+
 /// Audio processing unit stub. Logging only for now.
 pub struct AudioSystem {
    pub enabled: bool,
--- a/crates/xenia-apu/src/xma.rs
+++ b/crates/xenia-apu/src/xma.rs
@@ -0,0 +1,932 @@
+//! Register-mapped XMA context system — a faithful port of xenia-canary's
+//! `apu/xma_decoder.cc` context-array + MMIO machinery. The stage-3 decode
+//! pass builds on `xma_decode` and is driven from this module.
+//!
+//! The guest allocates XMA contexts via `XMACreateContext` (which hands back a
+//! pointer into our 320-entry context array in physical guest memory), writes
+//! the 64-byte `XMA_CONTEXT_DATA` struct, then *kicks* decode by writing the
+//! per-context bit into the `0x7FEA0000` register aperture. This module
+//! satisfies all of that without faulting and records which contexts the guest
+//! kicked; once guest memory is wired in (`set_memory`), a kick also runs that
+//! context's decode pass synchronously.
+//!
+//! ## Byte order
+//! The guest accesses the aperture byte-reversed (`stwbrx`/`lwbrx`), so the raw
+//! `u32` our MMIO boundary delivers is byte-swapped relative to the logical
+//! register value — exactly the situation canary handles with `xe::byte_swap`.
+//! So `write_register` swaps the incoming value before decoding and the
+//! register file holds host-order values; `read_register` swaps on the way out.
+//! This was proven empirically: the guest's Clear writes arrive as
+//! `0x01000000`/`0x02000000`/`0x04000000`, i.e. byte-reversed `1`/`2`/`4`,
+//! targeting contexts 0/1/2 (which it had just allocated) — NOT 24/25/26. The
+//! register-index math (`(addr & 0xFFFF) / 4`) is the same as canary's.
+
+use std::sync::atomic::{AtomicU32, Ordering};
+use std::sync::{Arc, Mutex};
+
+use xenia_memory::access::MemoryAccess;
+use xenia_memory::{GuestMemory, MmioRegion};
+
+use crate::xma_decode::{self, ContextDecodeState, XmaContextData};
+
+/// Size in bytes of an `XMA_CONTEXT_DATA` struct (canary `xma_context.h`).
+/// Used here as the array stride: context `i` sits at `base + i * 64`.
+pub const XMA_CONTEXT_SIZE: u32 = 64;
+/// Number of XMA contexts the hardware exposes (canary `kContextCount`).
+pub const XMA_CONTEXT_COUNT: usize = 320;
+
+/// Register aperture base (guest physical). Canary maps the XMA decoder at
+/// `0x7FEA0000` in `XmaDecoder::Setup`.
+pub const APERTURE_BASE: u32 = 0x7FEA_0000;
+/// Mask used by `MmioRegion::contains` so any `0x7FEAxxxx` address hits.
+pub const APERTURE_MASK: u32 = 0xFFFF_0000;
+/// Total aperture size in bytes (the low 16-bit register window).
+pub const APERTURE_SIZE: u32 = 0x0001_0000;
+
+// ----- Register indices (canary `XmaRegister` enum / xma_register_table.inc).
+// Indices are dword indices: byte offset = index * 4.
+
+/// `ContextArrayAddress` — physical base of the context array. byte 0x1800.
+const REG_CONTEXT_ARRAY_ADDRESS: u32 = 0x600;
+/// `CurrentContextIndex` — the context the HW is currently servicing. byte
+/// 0x1818. Polled by the guest; we rotate it so a poll never sticks.
+const REG_CURRENT_CONTEXT_INDEX: u32 = 0x606;
+
+/// First of the 10 `ContextNKick` registers (`Context0Kick`..`Context9Kick`).
+/// byte 0x1940. Bit N of the K-th register kicks context `K*32 + N`.
+const REG_CONTEXT_KICK_BASE: u32 = 0x650;
+/// First of the 10 `ContextNLock` registers. byte 0x1A40.
+const REG_CONTEXT_LOCK_BASE: u32 = 0x690;
+/// First of the 10 `ContextNClear` registers. byte 0x1A80.
+const REG_CONTEXT_CLEAR_BASE: u32 = 0x6A0;
+/// Each group spans 10 registers (320 contexts / 32-per-register).
+const CONTEXT_GROUP_LEN: u32 = 10;
+
+/// Number of 32-bit words backing the register file. The highest index we
+/// touch is `0x6A9`; round up generously so any in-aperture index is in range
+/// (64 KB aperture / 4).
+const REGISTER_FILE_WORDS: usize = 0x4000;
+
+/// Register-mapped XMA context array. Owns the allocation bitmap, the register
+/// file, and the per-context kick/enable bookkeeping that stage 3 consumes.
+pub struct XmaDecoder {
+    /// Guest virtual address of the context array (handed back by
+    /// `allocate_context`).
+    context_array_guest_va: u32,
+    /// Physical address stored into `ContextArrayAddress` (reg 0x600).
+    context_array_phys: u32,
+    /// 320-slot allocation bitmap, one bit per context (`bitmap[i>>6]` bit
+    /// `i & 63`). A set bit means *allocated*.
+    bitmap: [u64; (XMA_CONTEXT_COUNT + 63) / 64],
+    /// Flat register file, host-native values. Indexed by dword register index.
+    registers: Vec<u32>,
+    /// Per-context "decode requested" flag: set on Kick, cleared on Clear and
+    /// on `release_context`. Stage 3 drains this to produce PCM.
+    pending: [bool; XMA_CONTEXT_COUNT],
+    /// Per-context enable flag. Lock / Clear / release disable it; a Kick sets
+    /// it, then resets it right before the synchronous decode pass when guest
+    /// memory is wired in. Loosely mirrors canary's "is_enabled".
+    enabled: [bool; XMA_CONTEXT_COUNT],
+    /// Total kicks observed (diagnostic; lets headless logs show progress).
+    kick_count: u64,
+    /// Rotating value served for `CurrentContextIndex` reads so a guest poll
+    /// can't spin forever on a fixed value. Atomic so the read path can stay
+    /// `&self`.
+    current_context_index: AtomicU32,
+    /// Per-context stage-3 decode state (FFmpeg codec, staged PCM frame, ring
+    /// bookkeeping). Lazily populated as contexts are decoded.
+    decode_state: Vec<ContextDecodeState>,
+    /// Total PCM bytes written to guest output buffers (diagnostic).
+    pcm_bytes_total: u64,
+    /// Raw pointer to the guest memory mapping: null until `set_memory` stores
+    /// it. Used to run `Work()` SYNCHRONOUSLY inside the kick MMIO write — as
+    /// canary's default `!use_dedicated_xma_thread` path does (`context.Work()`
+    /// right in `WriteRegister`), so the game sees the updated context the
+    /// instant its kick store retires. The caller must keep the mapping alive
+    /// while kicks can arrive; decode runs on the CPU thread (deterministic).
+    mem_ptr: *const GuestMemory,
+}
+
+// SAFETY: the decoder sits behind an `Arc<Mutex<..>>` and is meant to be touched
+// only from the CPU scheduler thread (kick MMIO writes + the per-round pump).
+// `mem_ptr` is only dereferenced as a shared `&GuestMemory` (see `write_register`).
+unsafe impl Send for XmaDecoder {}
+
+impl XmaDecoder {
+    /// Construct an un-initialized decoder: zeroed addresses, an all-free
+    /// bitmap, and a null `mem_ptr`. Call [`Self::init`] once the array exists.
+    pub fn new() -> Self {
+        Self {
+            context_array_guest_va: 0,
+            context_array_phys: 0,
+            bitmap: [0; (XMA_CONTEXT_COUNT + 63) / 64],
+            registers: vec![0; REGISTER_FILE_WORDS],
+            pending: [false; XMA_CONTEXT_COUNT],
+            enabled: [false; XMA_CONTEXT_COUNT],
+            kick_count: 0,
+            current_context_index: AtomicU32::new(0),
+            decode_state: (0..XMA_CONTEXT_COUNT).map(|_| ContextDecodeState::new()).collect(),
+            pcm_bytes_total: 0,
+            mem_ptr: std::ptr::null(),
+        }
+    }
+
+    /// Remember where guest memory lives so the kick MMIO path can run `Work()`
+    /// synchronously (canary semantics). Call once at boot, after the final
+    /// `mem` has reached its long-lived location.
+    pub fn set_memory(&mut self, mem: &GuestMemory) {
+        self.mem_ptr = std::ptr::from_ref(mem);
+    }
+
+    /// Record both addresses of the reserved context array and publish the
+    /// physical one through `ContextArrayAddress` (reg 0x600), the same step
+    /// canary performs in `XmaDecoder::Setup`.
+    pub fn init(&mut self, context_array_guest_va: u32, context_array_phys: u32) {
+        self.registers[REG_CONTEXT_ARRAY_ADDRESS as usize] = context_array_phys;
+        (self.context_array_guest_va, self.context_array_phys) =
+            (context_array_guest_va, context_array_phys);
+        tracing::info!(
+            va = format_args!("{context_array_guest_va:#010x}"),
+            phys = format_args!("{context_array_phys:#010x}"),
+            "xma: context array initialized"
+        );
+    }
+
+    /// Claim the lowest-numbered free slot and return the guest pointer of its
+    /// context (`context_array_guest_va + i*64`). Returns 0 once all 320 slots
+    /// are taken, after logging a warning. Mirrors canary's
+    /// `XmaDecoder::AllocateContext`.
+    pub fn allocate_context(&mut self) -> u32 {
+        // A clear bit in `bitmap[slot >> 6]` at position `slot & 63` means free.
+        let is_free = |slot: usize| self.bitmap[slot >> 6] & (1u64 << (slot & 63)) == 0;
+        let Some(index) = (0..XMA_CONTEXT_COUNT).find(|&slot| is_free(slot)) else {
+            tracing::warn!("xma: allocate_context — all {} slots in use", XMA_CONTEXT_COUNT);
+            return 0;
+        };
+        // Mark the slot taken before handing its address out.
+        self.bitmap[index >> 6] |= 1u64 << (index & 63);
+        let ptr = self.context_array_guest_va + (index as u32) * XMA_CONTEXT_SIZE;
+        tracing::info!(
+            index,
+            ptr = format_args!("{ptr:#010x}"),
+            "xma: allocate_context"
+        );
+        ptr
+    }
+
+    /// Free the slot backing `guest_ptr`. Mirrors canary's
+    /// `XmaDecoder::ReleaseContext`. A pointer outside the array, or one that is
+    /// not on a 64-byte context boundary, is ignored (the guest never faults).
+    pub fn release_context(&mut self, guest_ptr: u32) {
+        let Some(offset) = guest_ptr.checked_sub(self.context_array_guest_va) else {
+            return; // below the array base
+        };
+        let i = (offset / XMA_CONTEXT_SIZE) as usize;
+        // Interior pointers are rejected as well: rounding one down would free
+        // a slot the caller never named, letting a later allocation hand the
+        // still-live context out a second time.
+        if i >= XMA_CONTEXT_COUNT || offset % XMA_CONTEXT_SIZE != 0 {
+            return;
+        }
+        self.bitmap[i >> 6] &= !(1u64 << (i & 63));
+        self.pending[i] = false;
+        self.enabled[i] = false;
+        tracing::info!(index = i, ptr = format_args!("{guest_ptr:#010x}"), "xma: release_context");
+    }
+
+    /// Fetch a register as the guest sees it. `CurrentContextIndex` (0x606) is
+    /// synthesized: each read returns the next value of a counter taken modulo
+    /// `XMA_CONTEXT_COUNT`, so a polling guest always observes movement. Every
+    /// other index returns the stored word, or 0 when outside the register file.
+    pub fn read_register(&self, reg_index: u32) -> u32 {
+        let host = match reg_index {
+            REG_CURRENT_CONTEXT_INDEX => {
+                let ticket = self.current_context_index.fetch_add(1, Ordering::Relaxed);
+                ticket % (XMA_CONTEXT_COUNT as u32)
+            }
+            _ => self.registers.get(reg_index as usize).copied().unwrap_or(0),
+        };
+        // The aperture is accessed byte-reversed (`lwbrx`): the file holds
+        // host-order words, so swap on the way out (canary: `xe::byte_swap`).
+        host.swap_bytes()
+    }
+
+    /// Store `value` (byte-swapped to host order first) and apply the side-effect
+    /// of the Kick / Lock / Clear register groups. Each register in a group
+    /// covers 32 contexts; bit N maps to `context_id = (reg_index - group_base)
+    /// * 32 + N`, and ids >= 320 are skipped. Set bits are walked with
+    /// `trailing_zeros` + clear-lowest-bit, mirroring canary's `std::countr_zero`
+    /// loop. A store outside the register file is dropped.
+    pub fn write_register(&mut self, reg_index: u32, value: u32) {
+        // The guest writes the aperture byte-reversed (`stwbrx`); undo it so the
+        // register file holds host-order values, mirroring canary's
+        // `WriteRegister` which does `value = xe::byte_swap(value)` first. Proven
+        // by the guest's Clear writes (`0x01000000` == context 0, not 24).
+        let value = value.swap_bytes();
+        if let Some(slot) = self.registers.get_mut(reg_index as usize) {
+            *slot = value;
+        }
+
+        if (REG_CONTEXT_KICK_BASE..REG_CONTEXT_KICK_BASE + CONTEXT_GROUP_LEN).contains(&reg_index) {
+            let base = (reg_index - REG_CONTEXT_KICK_BASE) * 32;
+            let mut bits = value;
+            while bits != 0 {
+                let b = bits.trailing_zeros();
+                bits &= bits - 1;
+                let context_id = (base + b) as usize;
+                if context_id < XMA_CONTEXT_COUNT {
+                    self.pending[context_id] = true;
+                    self.enabled[context_id] = true;
+                    self.kick_count += 1;
+                    tracing::debug!(
+                        context_id,
+                        kick_count = self.kick_count,
+                        "xma: kick (decode requested)"
+                    );
+                    // Canary `!use_dedicated_xma_thread`: run Work() right here so
+                    // the game observes the updated context when its kick store
+                    // retires. SAFETY: non-null `mem_ptr` came from `set_memory`,
+                    // whose caller keeps the mapping alive; we're on the CPU thread.
+                    if !self.mem_ptr.is_null() {
+                        let mem: &GuestMemory = unsafe { &*self.mem_ptr };
+                        self.enabled[context_id] = false;
+                        self.work_one(mem, context_id);
+                    }
+                }
+            }
+        } else if (REG_CONTEXT_LOCK_BASE..REG_CONTEXT_LOCK_BASE + CONTEXT_GROUP_LEN)
+            .contains(&reg_index)
+        {
+            let base = (reg_index - REG_CONTEXT_LOCK_BASE) * 32;
+            let mut bits = value;
+            while bits != 0 {
+                let b = bits.trailing_zeros();
+                bits &= bits - 1;
+                let context_id = (base + b) as usize;
+                if context_id < XMA_CONTEXT_COUNT {
+                    self.enabled[context_id] = false;
+                    tracing::debug!(context_id, "xma: lock (context disabled)");
+                }
+            }
+        } else if (REG_CONTEXT_CLEAR_BASE..REG_CONTEXT_CLEAR_BASE + CONTEXT_GROUP_LEN)
+            .contains(&reg_index)
+        {
+            let base = (reg_index - REG_CONTEXT_CLEAR_BASE) * 32;
+            let mut bits = value;
+            while bits != 0 {
+                let b = bits.trailing_zeros();
+                bits &= bits - 1;
+                let context_id = (base + b) as usize;
+                if context_id < XMA_CONTEXT_COUNT {
+                    self.pending[context_id] = false;
+                    self.enabled[context_id] = false;
+                    tracing::debug!(context_id, "xma: clear (context state reset)");
+                }
+            }
+        }
+    }
+
+    /// Total number of context kicks observed so far (diagnostic counter).
+    /// Incremented once per in-range context bit set in a Kick register write.
+    pub fn kick_count(&self) -> u64 {
+        self.kick_count
+    }
+
+    /// Reports whether context `i` currently carries the pending-kick flag.
+    /// An out-of-range `i` is reported as not pending instead of panicking.
+    pub fn is_pending(&self, i: usize) -> bool {
+        matches!(self.pending.get(i), Some(&true))
+    }
+
+    /// Total PCM bytes the decoder has written to guest output buffers
+    /// (running sum over all contexts; diagnostic).
+    pub fn pcm_bytes_total(&self) -> u64 {
+        self.pcm_bytes_total
+    }
+
+    /// Stage-3 entry point, called once per scheduler round from the CPU
+    /// thread's per-round coordinator.
+    ///
+    /// Every context that is both pending and enabled gets one `Work()` pass
+    /// (canary `XmaContextNew::Work`): read the context, decode available input
+    /// into PCM, drain into the output ring, and write the decoder-owned fields
+    /// back. Does nothing until the context array address has been set.
+    /// Deterministic: no host thread, no clock.
+    pub fn decode_pending(&mut self, mem: &GuestMemory) {
+        if self.context_array_guest_va == 0 {
+            return;
+        }
+        for ctx in 0..XMA_CONTEXT_COUNT {
+            let runnable = self.pending[ctx] && self.enabled[ctx];
+            if runnable {
+                // Canary `Work` clears is_enabled at entry; a fresh kick
+                // re-enables.
+                self.enabled[ctx] = false;
+                self.work_one(mem, ctx);
+            }
+        }
+    }
+
+    /// Runs a single `Work()` pass for context `i`.
+    ///
+    /// Follows canary's orchestration, but the frame decode itself is done by
+    /// the mainline xma2 decoder (whole-packet driven) instead of canary's
+    /// per-frame `Decode()`. A context whose output buffer is not marked valid
+    /// is left untouched.
+    fn work_one(&mut self, mem: &GuestMemory, i: usize) {
+        let ctx_va = self.context_array_guest_va + (i as u32) * XMA_CONTEXT_SIZE;
+
+        // Snapshot as read from guest memory; kept unmodified for write-back.
+        let initial = XmaContextData::read(mem, ctx_va);
+        if initial.output_buffer_valid == 0 {
+            return;
+        }
+
+        let mut working = initial;
+        self.decode_into_output(mem, i, ctx_va, &mut working, &initial);
+    }
+
+    /// Decode available input packets into PCM and drain into the output ring.
+    ///
+    /// * `i` — context index, used to select `decode_state[i]`.
+    /// * `ctx_va` — guest address of the context, used for the write-back.
+    /// * `data` — working copy of the context; its output write offset, input
+    ///   read offset / buffer-valid bits and `output_buffer_valid` may change.
+    /// * `initial` — the context as originally read; handed to
+    ///   `store_merged_pub` together with `data` on write-back.
+    ///
+    /// Returns without writing back when the output ring has zero capacity.
+    /// When the ring lacks room for one subframe batch plus padding, the
+    /// context is written back as-is so the guest can drain first.
+    fn decode_into_output(
+        &mut self,
+        mem: &GuestMemory,
+        i: usize,
+        ctx_va: u32,
+        data: &mut XmaContextData,
+        initial: &XmaContextData,
+    ) {
+        use xma_decode::*;
+
+        // All ring offsets below are in bytes; the context stores them in
+        // `OUTPUT_BYTES_PER_BLOCK`-sized blocks.
+        let output_capacity = data.output_buffer_block_count * OUTPUT_BYTES_PER_BLOCK;
+        if output_capacity == 0 {
+            return;
+        }
+        let out_backing = xma_phys_to_backing(data.output_buffer_ptr);
+        let mut write_off = data.output_buffer_write_offset * OUTPUT_BYTES_PER_BLOCK;
+        let read_off = data.output_buffer_read_offset * OUTPUT_BYTES_PER_BLOCK;
+
+        // write_count: free space in the ring from write to read.
+        let free_bytes = ring_write_count(read_off, write_off, output_capacity);
+        self.decode_state[i].remaining_subframe_blocks_in_output =
+            (free_bytes / OUTPUT_BYTES_PER_BLOCK) as i32;
+
+        // A subframe_decode_count of 0 is treated as 1 so every pass can make
+        // progress.
+        let effective_sdc = data.subframe_decode_count.max(1);
+        // Minimum free blocks required before writing: one batch plus padding.
+        let min_blocks = effective_sdc as i32 + data.output_buffer_padding as i32;
+
+        if min_blocks > self.decode_state[i].remaining_subframe_blocks_in_output {
+            // No room — write back unchanged and wait for the game to drain.
+            store_merged_pub(mem, ctx_va, data, initial);
+            return;
+        }
+
+        // Set once any PCM was written to the ring during this pass.
+        let mut produced_any = false;
+
+        // Ensure codec configured for current rate/channels.
+        let rate = sample_rate_hz(data.sample_rate);
+        let channels = if data.is_stereo != 0 { 2 } else { 1 };
+        self.ensure_codec(i, rate, channels);
+
+        // Main decode loop: while there's output ring room and valid input.
+        loop {
+            if self.decode_state[i].remaining_subframe_blocks_in_output < min_blocks {
+                break;
+            }
+
+            // If we still have undrained subframes from a prior decode, consume
+            // them first (canary Consume before next Decode).
+            if self.decode_state[i].current_frame_remaining_subframes == 0 {
+                // Need a fresh decoded frame. Pull from the codec, feeding input
+                // packets as required.
+                if !self.produce_frame(mem, i, data) {
+                    break;
+                }
+            }
+
+            // Consume: write up to `effective_sdc` subframes (256B blocks) of
+            // the staged raw_frame into the output ring.
+            let total_subframes =
+                ((BYTES_PER_FRAME_CHANNEL / OUTPUT_BYTES_PER_BLOCK) << data.is_stereo) as u8;
+            let remaining = self.decode_state[i].current_frame_remaining_subframes;
+            // NOTE(review): `effective_sdc as u8` truncates. This assumes
+            // `subframe_decode_count` always fits in a byte; a multiple of 256
+            // would give `to_write == 0` and the loop would stop making
+            // progress — confirm against the field's bit width.
+            let to_write = remaining.min(effective_sdc as u8);
+            // NOTE(review): assumes `remaining <= total_subframes`, i.e. that
+            // `is_stereo` has not changed since the staged frame was produced —
+            // otherwise this `u8` subtraction underflows.
+            let frame_read_off = (total_subframes - remaining) as usize * OUTPUT_BYTES_PER_BLOCK as usize;
+            let nbytes = to_write as u32 * OUTPUT_BYTES_PER_BLOCK;
+
+            // Write into the output ring (handle wrap).
+            let raw = &self.decode_state[i].raw_frame;
+            write_off = ring_write(
+                mem,
+                out_backing,
+                output_capacity,
+                write_off,
+                &raw[frame_read_off..frame_read_off + nbytes as usize],
+            );
+            self.pcm_bytes_total += nbytes as u64;
+            produced_any = true;
+
+            // The padding blocks are charged against the free-block budget
+            // once per frame, on the write that finishes the staged frame.
+            let headroom = if remaining - to_write == 0 {
+                data.output_buffer_padding as i32
+            } else {
+                0
+            };
+            self.decode_state[i].remaining_subframe_blocks_in_output -=
+                to_write as i32 + headroom;
+            self.decode_state[i].current_frame_remaining_subframes -= to_write;
+        }
+
+        // Writeback offsets.
+        data.output_buffer_write_offset = write_off / OUTPUT_BYTES_PER_BLOCK;
+
+        // Ring completely filled (write caught up with read, no free blocks
+        // left): mark the output buffer invalid.
+        if self.decode_state[i].remaining_subframe_blocks_in_output == 0
+            && write_off == read_off
+        {
+            data.output_buffer_valid = 0;
+        }
+        // Nothing was written and no input buffer is valid: also invalidate
+        // the output buffer.
+        if !produced_any && !data.is_any_input_buffer_valid() {
+            data.output_buffer_valid = 0;
+        }
+
+        store_merged_pub(mem, ctx_va, data, initial);
+    }
+
+    /// Makes sure context `i` owns an FFmpeg xma2 codec matching `rate` and
+    /// `channels`.
+    ///
+    /// An existing codec with the same parameters is reused. Otherwise a new
+    /// one is created and replaces the old instance. If creation fails the
+    /// error is logged and the context is left without a codec, so the next
+    /// call tries again.
+    fn ensure_codec(&mut self, i: usize, rate: u32, channels: u32) {
+        let state = &mut self.decode_state[i];
+
+        let up_to_date = state.codec.is_some()
+            && state.codec_rate == rate
+            && state.codec_channels == channels;
+        if up_to_date {
+            return;
+        }
+
+        let created = crate::xma2_codec::Xma2Codec::new(rate, channels);
+        match created {
+            Err(e) => {
+                tracing::error!(ctx = i, rate, channels, error = %e, "xma: xma2 codec init failed");
+                state.codec = None;
+            }
+            Ok(codec) => {
+                state.codec = Some(codec);
+                state.codec_rate = rate;
+                state.codec_channels = channels;
+                tracing::info!(ctx = i, rate, channels, "xma: xma2 codec configured");
+            }
+        }
+    }
+
+    /// Stages one decoded 512-sample frame (interleaved S16BE) in `raw_frame`.
+    ///
+    /// Returns `true` when a frame was staged and `false` when the PCM queue
+    /// could not be topped up to a full frame.
+    ///
+    /// Input-consumption model (faithful to canary's packet/buffer contract):
+    /// the mainline xma2 decoder takes whole 2 KB packets via `send_packet`
+    /// and emits frames in bursts (internal FIFO + lookahead), so its intake
+    /// position cannot be read per frame. Two cursors are therefore kept:
+    ///
+    ///  1. The private FFmpeg *feed* cursor (`feed_buffer` /
+    ///     `feed_packet_index`), which hands raw packets to FFmpeg only far
+    ///     enough ahead to keep the PCM queue stocked. It follows the same
+    ///     buffer ping-pong as the guest but is NOT guest-visible.
+    ///  2. The guest-visible `input_buffer_read_offset`, moved by exactly ONE
+    ///     compressed frame per 512-sample frame emitted, through
+    ///     `advance_read_offset_one_frame` (a port of the offset arithmetic in
+    ///     canary's `Decode()`). It crosses packet and buffer boundaries, and
+    ///     fires SwapInputBuffer (clearing the drained buffer's valid bit), at
+    ///     canary's per-frame cadence, which the WMV demuxer polls to refill
+    ///     ADV.wmv.
+    ///
+    /// Keeping the cursors separate stops FFmpeg's whole-packet burst framing
+    /// from freezing the guest-visible offset: it tracks emitted output, so
+    /// input buffers are consumed and swapped as the movie actually plays.
+    fn produce_frame(&mut self, mem: &GuestMemory, i: usize, data: &mut XmaContextData) -> bool {
+        use xma_decode::*;
+        let channel_count: u32 = if data.is_stereo != 0 { 2 } else { 1 };
+        let frame_bytes = (BYTES_PER_FRAME_CHANNEL * channel_count) as usize;
+
+        // Top up FFmpeg's internal FIFO (and our queue) only when the queue
+        // cannot already satisfy one frame.
+        let queue_short = self.decode_state[i].pcm_queue.len() < frame_bytes;
+        if queue_short {
+            self.feed_codec(mem, i, data);
+        }
+        let still_short = self.decode_state[i].pcm_queue.len() < frame_bytes;
+        if still_short {
+            return false;
+        }
+
+        // Move exactly one frame from the queue into a zeroed raw_frame.
+        let state = &mut self.decode_state[i];
+        state.raw_frame.fill(0);
+        for dst in state.raw_frame[..frame_bytes].iter_mut() {
+            *dst = state.pcm_queue.pop_front().unwrap();
+        }
+        state.current_frame_remaining_subframes = (4u8) << data.is_stereo;
+
+        // One frame went out to the guest: advance its visible read offset by
+        // one compressed frame at canary's cadence (may swap buffer).
+        self.advance_read_offset_one_frame(mem, data);
+        true
+    }
+
+    /// Feed raw 2 KB packets to FFmpeg from the private feed cursor until the
+    /// PCM queue holds at least one frame, `MAX_FEED` packets were accepted, or
+    /// nothing more can be fed (no valid buffer with unread packets, no codec,
+    /// or the codec refused the packet).
+    ///
+    /// The feed cursor follows the guest's `current_buffer` ping-pong but keeps
+    /// its own packet index (`feed_packet_index`), so feeding ahead of the
+    /// guest-visible read offset is fine — the offset advances separately per
+    /// emitted frame.
+    ///
+    /// The cursor hops to the other input buffer at most once between two
+    /// accepted packets. Without that limit, two valid buffers that both report
+    /// zero packets would bounce the cursor back and forth forever (hops do not
+    /// count against `MAX_FEED`), and a single zero-packet buffer would restart
+    /// the other buffer from packet 0.
+    fn feed_codec(&mut self, mem: &GuestMemory, i: usize, data: &XmaContextData) {
+        use xma_decode::*;
+        let channels = if data.is_stereo != 0 { 2u32 } else { 1u32 };
+        let frame_bytes = (BYTES_PER_FRAME_CHANNEL * channels) as usize;
+
+        // Re-sync the feed buffer to the guest's current buffer if the guest has
+        // swapped past us (the buffer we were feeding was consumed).
+        if self.decode_state[i].feed_buffer != data.current_buffer
+            && !data.is_input_buffer_valid(self.decode_state[i].feed_buffer)
+        {
+            self.decode_state[i].feed_buffer = data.current_buffer;
+            self.decode_state[i].feed_packet_index = 0;
+        }
+
+        const MAX_FEED: u32 = 8;
+        let mut fed = 0u32;
+        // True after a buffer hop until the codec accepts the next packet.
+        let mut hopped_since_feed = false;
+        while self.decode_state[i].pcm_queue.len() < frame_bytes && fed < MAX_FEED {
+            let fb = self.decode_state[i].feed_buffer;
+            // Nothing to feed from this buffer: it is either invalid or the
+            // feed cursor already passed its last packet.
+            let unfeedable = !data.is_input_buffer_valid(fb)
+                || self.decode_state[i].feed_packet_index >= data.input_buffer_packet_count(fb);
+            if unfeedable {
+                // Try the other buffer if it is valid (it was refilled), but
+                // never hop twice in a row: a second hop without an accepted
+                // packet means neither buffer has anything to offer.
+                let other = fb ^ 1;
+                if hopped_since_feed || !data.is_input_buffer_valid(other) {
+                    break;
+                }
+                self.decode_state[i].feed_buffer = other;
+                self.decode_state[i].feed_packet_index = 0;
+                hopped_since_feed = true;
+                continue;
+            }
+            let pidx = self.decode_state[i].feed_packet_index;
+            let backing = xma_phys_to_backing(data.input_buffer_address(fb));
+            let pkt_va = backing + pidx * BYTES_PER_PACKET;
+            let mut packet = vec![0u8; BYTES_PER_PACKET as usize];
+            mem.read_bytes(pkt_va, &mut packet);
+            let send_res = match self.decode_state[i].codec.as_mut() {
+                Some(codec) => codec.send_packet(&packet),
+                None => break,
+            };
+            match send_res {
+                Ok(()) => {
+                    self.decode_state[i].feed_packet_index += 1;
+                    fed += 1;
+                    hopped_since_feed = false;
+                    self.drain_codec_frames(i);
+                }
+                // Decoder full — drain what it has and stop; re-offer this same
+                // packet next time (don't advance the feed cursor).
+                Err(ref e) if e == "EAGAIN" => {
+                    self.drain_codec_frames(i);
+                    break;
+                }
+                // NOTE(review): the feed cursor is not advanced here either, so
+                // a packet the codec permanently rejects is re-offered on every
+                // call — confirm that retrying is the intended policy.
+                Err(e) => {
+                    tracing::warn!(ctx = i, error = %e, "xma: send_packet failed");
+                    break;
+                }
+            }
+        }
+    }
+
+    /// Empties the codec of every frame it can currently deliver, appending
+    /// each frame's interleaved S16BE PCM bytes to the context's queue.
+    ///
+    /// Counts frames in `frames_decoded` and logs the very first one. Returns
+    /// immediately when the context has no codec.
+    fn drain_codec_frames(&mut self, i: usize) {
+        loop {
+            let next = self.decode_state[i]
+                .codec
+                .as_mut()
+                .and_then(|codec| codec.receive_frame());
+            let Some((nb, bytes)) = next else { break };
+
+            let state = &mut self.decode_state[i];
+            state.frames_decoded += 1;
+            if !state.first_frame_logged {
+                state.first_frame_logged = true;
+                tracing::info!(
+                    ctx = i,
+                    samples = nb,
+                    pcm_bytes = bytes.len(),
+                    "xma: first PCM frame decoded"
+                );
+            }
+            state.pcm_queue.extend(bytes);
+        }
+    }
+
+    /// Advance `input_buffer_read_offset` by exactly ONE compressed frame,
+    /// faithfully mirroring the offset arithmetic in canary's
+    /// `XmaContextNew::Decode` (frame-size parse + packet-boundary handling +
+    /// SwapInputBuffer when the buffer's packets are exhausted). Called once per
+    /// 512-sample frame we emit to the guest, so the guest-visible read offset
+    /// crosses packet/buffer boundaries at canary's true cadence — independent
+    /// of the mainline xma2 decoder's whole-packet burst framing. This is what
+    /// lets `input_buffer_0_valid` toggle and the WMV demuxer refill ADV.wmv.
+    ///
+    /// Only `data` is modified (read offset, current buffer, buffer-valid
+    /// bits); guest memory is only read. The offset is a bit offset into the
+    /// current input buffer. Does nothing when no input buffer is valid or the
+    /// offset does not map to a packet of the current buffer.
+    fn advance_read_offset_one_frame(&mut self, mem: &GuestMemory, data: &mut XmaContextData) {
+        use xma_decode::*;
+
+        if !data.is_any_input_buffer_valid() {
+            return;
+        }
+        // Current buffer already drained: move to the other one, and give up
+        // if that one is not valid either.
+        if !data.is_current_input_buffer_valid() {
+            self.swap_input_buffer(data);
+            if !data.is_current_input_buffer_valid() {
+                return;
+            }
+        }
+
+        // Clamp a header-region offset (canary's Dirt-2 guard).
+        if data.input_buffer_read_offset < BITS_PER_PACKET_HEADER {
+            data.input_buffer_read_offset = BITS_PER_PACKET_HEADER;
+        }
+
+        // Locate and fetch the packet the read offset currently points into.
+        let pkt_count = data.current_input_buffer_packet_count();
+        let input_size = pkt_count * BYTES_PER_PACKET;
+        let Some(packet_index) = packet_number(input_size, data.input_buffer_read_offset) else {
+            return;
+        };
+        let buf_backing = xma_phys_to_backing(data.current_input_buffer_address());
+        let pkt_va = buf_backing + packet_index * BYTES_PER_PACKET;
+        let mut packet = vec![0u8; BYTES_PER_PACKET as usize];
+        mem.read_bytes(pkt_va, &mut packet);
+
+        // `relative_offset` is the read position within this packet, in bits.
+        let first_frame_offset = packet_frame_offset(&packet);
+        let mut relative_offset = data.input_buffer_read_offset % BITS_PER_PACKET;
+        if relative_offset < first_frame_offset {
+            // Tail of a split frame — skip to this packet's first frame.
+            data.input_buffer_read_offset =
+                packet_index * BITS_PER_PACKET + first_frame_offset;
+            relative_offset = first_frame_offset;
+        }
+
+        let skip_count = packet_skip_count(&packet);
+        // Full-packet skip (0xFF): no frames begin here — advance to the next
+        // packet that does, swapping the buffer if exhausted.
+        if skip_count == 0xFF {
+            let next_packet_index = packet_index + 1;
+            // Resolved before the swap: the swap resets the read offset, and
+            // `next_off` is assigned afterwards so it takes precedence.
+            let next_off =
+                self.next_packet_read_offset(mem, data, next_packet_index, pkt_count);
+            if next_packet_index >= pkt_count || next_off == BITS_PER_PACKET_HEADER {
+                self.swap_input_buffer(data);
+            }
+            data.input_buffer_read_offset = next_off;
+            return;
+        }
+
+        let info = get_packet_info(&packet, relative_offset);
+        // The packet header's skip count tells how many packets to step over to
+        // reach the next packet with a frame start.
+        let packet_to_skip = (skip_count as u32) + 1;
+        let next_packet_index = packet_index + packet_to_skip;
+
+        // Frame size: clamp to the bits remaining in the packet stream (canary
+        // GetAmountOfBitsToRead over the (packet_index+1)*kBitsPerPacket stream).
+        let stream_remaining =
+            ((packet_index + 1) * BITS_PER_PACKET).saturating_sub(data.input_buffer_read_offset);
+        let frame_size = if info.current_frame_size == 0 {
+            // Split header we can't resolve from this packet alone; fall back to
+            // advancing past the rest of this packet so we don't stall.
+            stream_remaining
+        } else {
+            info.current_frame_size
+        };
+        let bits_to_copy = amount_of_bits_to_read(stream_remaining, frame_size);
+
+        // More frames follow in this packet: step over the current frame and
+        // stay in the same packet.
+        if !info.is_last_frame_in_packet() {
+            let next_frame_offset =
+                (data.input_buffer_read_offset + bits_to_copy) % BITS_PER_PACKET;
+            data.input_buffer_read_offset =
+                packet_index * BITS_PER_PACKET + next_frame_offset;
+            return;
+        }
+
+        // Last frame in this packet: move to the next packet's first frame, or
+        // swap the input buffer if the packets are exhausted (canary's
+        // `next_packet_index >= current_input_packet_count`).
+        let mut next_off =
+            self.next_packet_read_offset(mem, data, next_packet_index, pkt_count);
+        if next_packet_index >= pkt_count || next_off == BITS_PER_PACKET_HEADER {
+            self.swap_input_buffer(data);
+        }
+        if next_off == BITS_PER_PACKET_HEADER && data.is_any_input_buffer_valid() {
+            // At the start of the next buffer: jump to its first frame offset.
+            let nb_backing = xma_phys_to_backing(data.current_input_buffer_address());
+            let mut hdr = [0u8; 4];
+            mem.read_bytes(nb_backing, &mut hdr);
+            let fo = packet_frame_offset(&hdr);
+            // An offset above the maximum frame size marks a packet with no
+            // frame start; keep the header offset in that case.
+            if fo <= MAX_FRAME_SIZE_IN_BITS {
+                next_off = fo;
+            }
+        }
+        data.input_buffer_read_offset = next_off;
+    }
+
+    /// Finds the bit offset of the next frame start at or after packet
+    /// `next_packet_index` (canary `GetNextPacketReadOffset`).
+    ///
+    /// An index at or beyond `current_input_packet_count` refers to the *other*
+    /// input buffer, rebased to that buffer's packet 0. The returned offset is
+    /// relative to the buffer the packet lives in. `BITS_PER_PACKET_HEADER` is
+    /// returned when that buffer is invalid, has a null address, or holds no
+    /// further packet that begins a frame.
+    fn next_packet_read_offset(
+        &self,
+        mem: &GuestMemory,
+        data: &XmaContextData,
+        next_packet_index: u32,
+        current_input_packet_count: u32,
+    ) -> u32 {
+        use xma_decode::*;
+
+        // Resolve which buffer the packet lives in (current or the other).
+        let spills_over = next_packet_index >= current_input_packet_count;
+        let buffer_index = if spills_over {
+            data.current_buffer ^ 1
+        } else {
+            data.current_buffer
+        };
+        let first_candidate = if spills_over {
+            next_packet_index - current_input_packet_count
+        } else {
+            next_packet_index
+        };
+
+        if !data.is_input_buffer_valid(buffer_index) {
+            return BITS_PER_PACKET_HEADER;
+        }
+        let addr = data.input_buffer_address(buffer_index);
+        if addr == 0 {
+            return BITS_PER_PACKET_HEADER;
+        }
+
+        let pkt_count = data.input_buffer_packet_count(buffer_index);
+        let backing = xma_phys_to_backing(addr);
+        (first_candidate..pkt_count)
+            .find_map(|pidx| {
+                let mut hdr = [0u8; 4];
+                mem.read_bytes(backing + pidx * BYTES_PER_PACKET, &mut hdr);
+                let fo = packet_frame_offset(&hdr);
+                // Packets whose frame offset exceeds the maximum frame size
+                // begin no frame and are skipped.
+                (fo <= MAX_FRAME_SIZE_IN_BITS).then(|| pidx * BITS_PER_PACKET + fo)
+            })
+            .unwrap_or(BITS_PER_PACKET_HEADER)
+    }
+
+    /// Retires the current input buffer (canary `SwapInputBuffer`): clears its
+    /// valid bit, makes the other buffer current, and resets the read offset to
+    /// just past the packet header.
+    fn swap_input_buffer(&mut self, data: &mut XmaContextData) {
+        use xma_decode::*;
+        let drained = data.current_buffer;
+        let next = drained ^ 1;
+        tracing::debug!(
+            from = drained,
+            to = next,
+            "xma: SwapInputBuffer (input buffer consumed)"
+        );
+        match drained {
+            0 => data.input_buffer_0_valid = 0,
+            _ => data.input_buffer_1_valid = 0,
+        }
+        data.current_buffer = next;
+        data.input_buffer_read_offset = BITS_PER_PACKET_HEADER;
+    }
+}
+
+impl Default for XmaDecoder {
+    /// Equivalent to [`XmaDecoder::new`].
+    fn default() -> Self {
+        XmaDecoder::new()
+    }
+}
+
+/// Creates the [`MmioRegion`] covering the XMA register aperture at
+/// `0x7FEA0000`.
+///
+/// Same shape as the GPU's `build_region`: each callback locks the shared
+/// decoder, turns the byte address into a dword register index, and forwards
+/// to `read_register` / `write_register`.
+pub fn build_mmio_region(dec: Arc<Mutex<XmaDecoder>>) -> MmioRegion {
+    /// Dword register index addressed by the low 16 bits of `addr`.
+    fn register_index(addr: u32) -> u32 {
+        (addr & 0xFFFF) / 4
+    }
+
+    let reader = Arc::clone(&dec);
+    let writer = dec;
+
+    MmioRegion {
+        base_address: APERTURE_BASE,
+        mask: APERTURE_MASK,
+        size: APERTURE_SIZE,
+        read_callback: Box::new(move |addr: u32| {
+            let reg = register_index(addr);
+            reader.lock().unwrap().read_register(reg)
+        }),
+        write_callback: Box::new(move |addr: u32, value: u32| {
+            let reg = register_index(addr);
+            writer.lock().unwrap().write_register(reg, value);
+        }),
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Guest VA / physical address pair used for the context array in tests
+    /// (a plausible physical-window mapping).
+    const ARRAY_VA: u32 = 0xA010_0000;
+    const ARRAY_PHYS: u32 = 0x0010_0000;
+
+    /// A decoder whose context array has been placed at the test addresses.
+    fn inited() -> XmaDecoder {
+        let mut decoder = XmaDecoder::new();
+        decoder.init(ARRAY_VA, ARRAY_PHYS);
+        decoder
+    }
+
+    /// The guest accesses the aperture byte-reversed (`lwbrx`/`stwbrx`):
+    /// `wire(v)` is the raw bus value that stands for host-order `v`, both for
+    /// what the guest writes and for what a read of `v` returns.
+    fn wire(v: u32) -> u32 {
+        v.swap_bytes()
+    }
+
+    /// (a) `allocate_context` returns distinct, increasing pointers one
+    /// 64-byte stride apart, runs dry after 320 contexts, and
+    /// `release_context` makes a slot available again.
+    #[test]
+    fn allocate_distinct_then_exhaust_then_release() {
+        let mut d = inited();
+        let first = d.allocate_context();
+        let second = d.allocate_context();
+        assert_eq!(first, ARRAY_VA);
+        assert_eq!(second, ARRAY_VA + XMA_CONTEXT_SIZE);
+        assert!(second > first);
+
+        // Two slots are taken; every remaining one must still be handed out.
+        let remaining = XMA_CONTEXT_COUNT - 2;
+        for _ in 0..remaining {
+            assert_ne!(d.allocate_context(), 0);
+        }
+        // The array is full now: allocation 321 fails.
+        assert_eq!(d.allocate_context(), 0);
+
+        // Releasing the first slot makes it the next one handed out.
+        d.release_context(first);
+        assert_eq!(d.allocate_context(), first);
+    }
+
+    /// (b) Kicking `Context0Kick` with host value `0b101` (sent byte-reversed
+    /// via `wire`) marks contexts 0 and 2 only.
+    #[test]
+    fn kick_context0_marks_correct_contexts() {
+        let mut d = inited();
+        d.write_register(REG_CONTEXT_KICK_BASE, wire(0b101));
+        assert!(d.is_pending(0));
+        assert!(!d.is_pending(1));
+        assert!(d.is_pending(2));
+        assert_eq!(d.kick_count(), 2);
+    }
+
+    /// (c) Bit 0 of `Context1Kick` (0x651) addresses context_id 32.
+    #[test]
+    fn kick_context1_bit0_is_context_32() {
+        let mut d = inited();
+        d.write_register(REG_CONTEXT_KICK_BASE + 1, wire(0b1));
+        assert!(d.is_pending(32));
+        assert!(!d.is_pending(0));
+        assert_eq!(d.kick_count(), 1);
+    }
+
+    /// Regression for the byte-order fix: the guest's real Clear writes were
+    /// `0x01000000`/`0x02000000`/`0x04000000` (byte-reversed `1`/`2`/`4`),
+    /// i.e. contexts 0/1/2 and NOT 24/25/26. The raw bus values must reach the
+    /// low contexts.
+    #[test]
+    fn byte_reversed_clear_targets_low_contexts() {
+        let mut d = inited();
+        for ctx in 0..3 {
+            d.write_register(REG_CONTEXT_KICK_BASE, wire(1 << ctx));
+        }
+        assert!(d.is_pending(0) && d.is_pending(1) && d.is_pending(2));
+
+        // The exact bus values observed from the guest.
+        for bus_value in [0x0100_0000, 0x0200_0000, 0x0400_0000] {
+            d.write_register(REG_CONTEXT_CLEAR_BASE, bus_value);
+        }
+        assert!(!d.is_pending(0) && !d.is_pending(1) && !d.is_pending(2));
+    }
+
+    /// (d) `read_register(0x600)` yields the base byte-reversed (the guest's
+    /// `lwbrx` turns it back into the host-order base).
+    #[test]
+    fn context_array_address_reads_phys() {
+        let d = inited();
+        let raw = d.read_register(REG_CONTEXT_ARRAY_ADDRESS);
+        assert_eq!(raw, wire(ARRAY_PHYS));
+    }
+
+    /// (e) `CurrentContextIndex` advances on every read and wraps at the
+    /// context count (values come back byte-reversed).
+    #[test]
+    fn current_context_index_rotates() {
+        let d = inited();
+        for expected in 0..3u32 {
+            assert_eq!(d.read_register(REG_CURRENT_CONTEXT_INDEX), wire(expected));
+        }
+        // Burn reads up to the wrap boundary.
+        for _ in 3..XMA_CONTEXT_COUNT as u32 {
+            d.read_register(REG_CURRENT_CONTEXT_INDEX);
+        }
+        // The following read starts over at 0.
+        assert_eq!(d.read_register(REG_CURRENT_CONTEXT_INDEX), wire(0));
+    }
+
+    /// A Clear drops the pending flag left by an earlier Kick.
+    #[test]
+    fn clear_resets_pending() {
+        let mut d = inited();
+        d.write_register(REG_CONTEXT_KICK_BASE, wire(0b1));
+        assert!(d.is_pending(0));
+        d.write_register(REG_CONTEXT_CLEAR_BASE, wire(0b1));
+        assert!(!d.is_pending(0));
+    }
+
+    /// The MMIO region maps a guest write at `BASE + reg * 4` onto register
+    /// `reg`, and a read through the same byte addressing returns the register
+    /// with the byte swap applied.
+    #[test]
+    fn mmio_region_round_trips_register() {
+        let dec = Arc::new(Mutex::new(inited()));
+        let region = build_mmio_region(dec.clone());
+
+        let kick_byte = APERTURE_BASE + REG_CONTEXT_KICK_BASE * 4;
+        (region.write_callback)(kick_byte, wire(0b1));
+        assert!(dec.lock().unwrap().is_pending(0));
+
+        // ContextArrayAddress read back over the bus (byte-reversed).
+        let addr_byte = APERTURE_BASE + REG_CONTEXT_ARRAY_ADDRESS * 4;
+        assert_eq!((region.read_callback)(addr_byte), wire(ARRAY_PHYS));
+    }
+}
--- a/crates/xenia-apu/src/xma2_codec.rs
+++ b/crates/xenia-apu/src/xma2_codec.rs
@@ -0,0 +1,217 @@
+//! Thin unsafe wrapper around the mainline FFmpeg `AV_CODEC_ID_XMA2` decoder.
+//!
+//! Unlike canary's vendored `XMAFRAMES` (one frame per packet, custom padding
+//! header), the distro xma2 decoder consumes whole 2 KB XMA2 packets
+//! (`block_align == 2048`), needs `extradata` declaring the channel/stream
+//! layout, and buffers samples internally across packets. We drive it with the
+//! guest's raw 2 KB packets and pull whatever 512-sample float-planar frames it
+//! emits, returning them as interleaved S16 big-endian PCM (canary `ConvertFrame`).
+
+use std::os::raw::c_int;
+use std::ptr;
+
+use ffmpeg_sys_next as ff;
+
+/// One xma2 decoder instance, configured for a fixed (sample_rate, channels).
+pub struct Xma2Codec {
+    codec: *const ff::AVCodec, // decoder descriptor from `avcodec_find_decoder`; never freed here
+    ctx: *mut ff::AVCodecContext, // opened codec context; released in `Drop`
+    frame: *mut ff::AVFrame, // reusable output frame for `avcodec_receive_frame`
+    packet: *mut ff::AVPacket, // reusable input packet for `avcodec_send_packet`
+    extradata: Vec<u8>, // Rust-side copy of the extradata bytes copied into `ctx`
+    channels: u32, // channel count requested at construction
+}
+
+// SAFETY: raw FFmpeg pointers are not Send by default. Per the original note the
+// decoder is only ever touched on the CPU scheduler thread (decode_pending) — TODO confirm.
+unsafe impl Send for Xma2Codec {}
+
+impl Xma2Codec {
+    /// Build XMA2WAVEFORMATEX extradata (34 bytes) for a single XMA2 stream.
+    /// Layout (little-endian, per FFmpeg `xma_decode_init` / xma2defs.h):
+    ///   [0..2]  NumStreams (u16)         = 1
+    ///   [2..6]  ChannelMask (u32)        = mono/stereo mask
+    ///   [6..34] remaining XMA2WAVEFORMATEX fields (unused by the decoder)
+    fn build_extradata(channels: u32) -> Vec<u8> {
+        let mut e = vec![0u8; 34];
+        // NumStreams = 1
+        e[0..2].copy_from_slice(&1u16.to_le_bytes());
+        // ChannelMask: 0x3 (FL|FR) for stereo, 0x4 (FC) for mono.
+        let mask: u32 = if channels >= 2 { 0x3 } else { 0x4 };
+        e[2..6].copy_from_slice(&mask.to_le_bytes());
+        e
+    }
+
+    /// Open an xma2 decoder for `sample_rate` Hz / `channels` channels. On any
+    /// failure everything allocated so far is released and `Err` describes it.
+    pub fn new(sample_rate: u32, channels: u32) -> Result<Self, String> {
+        unsafe {
+            let codec = ff::avcodec_find_decoder(ff::AVCodecID::AV_CODEC_ID_XMA2);
+            if codec.is_null() {
+                return Err("xma2 decoder not found in libavcodec".into());
+            }
+            let mut ctx = ff::avcodec_alloc_context3(codec);
+            if ctx.is_null() {
+                return Err("avcodec_alloc_context3 failed".into());
+            }
+
+            let extradata = Self::build_extradata(channels);
+            // FFmpeg requires extradata to be allocated with av_malloc and
+            // padded; the context takes ownership of the av_malloc'd copy.
+            let pad = ff::AV_INPUT_BUFFER_PADDING_SIZE as usize;
+            let raw = ff::av_mallocz(extradata.len() + pad) as *mut u8;
+            if raw.is_null() {
+                ff::avcodec_free_context(&mut ctx);
+                return Err("av_mallocz extradata failed".into());
+            }
+            ptr::copy_nonoverlapping(extradata.as_ptr(), raw, extradata.len());
+            (*ctx).extradata = raw;
+            (*ctx).extradata_size = extradata.len() as c_int;
+
+            (*ctx).sample_rate = sample_rate as c_int;
+            (*ctx).block_align = 2048;
+            ff::av_channel_layout_default(&mut (*ctx).ch_layout, channels as c_int);
+
+            let ret = ff::avcodec_open2(ctx, codec, ptr::null_mut());
+            if ret < 0 {
+                ff::avcodec_free_context(&mut ctx);
+                return Err(format!("avcodec_open2 failed: {}", av_err(ret)));
+            }
+
+            let mut frame = ff::av_frame_alloc();
+            let mut packet = ff::av_packet_alloc();
+            if frame.is_null() || packet.is_null() {
+                // Free whichever of the two succeeded (both calls accept null).
+                ff::av_frame_free(&mut frame);
+                ff::av_packet_free(&mut packet);
+                ff::avcodec_free_context(&mut ctx);
+                return Err("av_frame_alloc/av_packet_alloc failed".into());
+            }
+
+            Ok(Self {
+                codec,
+                ctx,
+                frame,
+                packet,
+                extradata,
+                channels,
+            })
+        }
+    }
+
+    pub fn channels(&self) -> u32 {
+        self.channels
+    }
+
+    /// Feed one raw 2 KB XMA2 packet (header + data) to the decoder. Returns
+    /// `Err("EAGAIN")` when the decoder is full (drain frames, then retry) and another
+    /// `Err` on failure. Decoded frames are pulled via [`Self::receive_frame`].
+    pub fn send_packet(&mut self, packet: &[u8]) -> Result<(), String> {
+        unsafe {
+            // FFmpeg needs a refcounted, zero-padded buffer: copy the bytes into
+            // an av_malloc'd block and hand its ownership to `self.packet`.
+            let pad = ff::AV_INPUT_BUFFER_PADDING_SIZE as usize;
+            let buf = ff::av_malloc(packet.len() + pad) as *mut u8;
+            if buf.is_null() {
+                return Err("av_malloc packet failed".into());
+            }
+            ptr::copy_nonoverlapping(packet.as_ptr(), buf, packet.len());
+            ptr::write_bytes(buf.add(packet.len()), 0, pad);
+            ff::av_packet_unref(self.packet);
+            // Wrap buf so FFmpeg frees it.
+            let ret = ff::av_packet_from_data(self.packet, buf, packet.len() as c_int);
+            if ret < 0 {
+                ff::av_free(buf as *mut _);
+                return Err(format!("av_packet_from_data failed: {}", av_err(ret)));
+            }
+            let ret = ff::avcodec_send_packet(self.ctx, self.packet);
+            if ret == ff::AVERROR(ff::EAGAIN) {
+                // Decoder full — caller should drain frames first then retry.
+                return Err("EAGAIN".into());
+            }
+            if ret < 0 {
+                return Err(format!("avcodec_send_packet failed: {}", av_err(ret)));
+            }
+            Ok(())
+        }
+    }
+
+    /// Signal end-of-stream so the decoder flushes its internal FIFO.
+    pub fn send_eof(&mut self) {
+        unsafe {
+            let _ = ff::avcodec_send_packet(self.ctx, ptr::null());
+        }
+    }
+
+    /// Pull one decoded frame as interleaved S16 big-endian PCM, or None if the
+    /// decoder needs more input (EAGAIN) or is drained (EOF). Returns
+    /// (samples_per_channel, interleaved_s16be_bytes).
+    pub fn receive_frame(&mut self) -> Option<(u32, Vec<u8>)> {
+        unsafe {
+            let ret = ff::avcodec_receive_frame(self.ctx, self.frame);
+            if ret < 0 {
+                return None;
+            }
+            let nb = (*self.frame).nb_samples as u32;
+            if nb == 0 {
+                return None;
+            }
+            let ch = (*self.frame).ch_layout.nb_channels.max(1) as u32;
+            let out = convert_frame_planar_to_s16be(self.frame, ch, nb);
+            Some((nb, out))
+        }
+    }
+}
+
+impl Drop for Xma2Codec {
+    fn drop(&mut self) { // releases the frame, the packet and the codec context, in that order
+        unsafe {
+            if !self.frame.is_null() {
+                ff::av_frame_free(&mut self.frame);
+            }
+            if !self.packet.is_null() {
+                ff::av_packet_free(&mut self.packet);
+            }
+            if !self.ctx.is_null() {
+                ff::avcodec_free_context(&mut self.ctx);
+            }
+            let _ = &self.codec; // no-op read; `codec` is not freed here
+            let _ = &self.extradata; // no-op read; the Vec is dropped normally afterwards
+        }
+    }
+}
+
+/// Convert FFmpeg planar-float output to interleaved S16 big-endian PCM (per
+/// canary `XmaContext::ConvertFrame`: clamp to [-1,1], scale by 2^15-1, emit BE).
+/// Safety: `frame` must be valid with `channels` planes of `nb_samples` f32 each.
+unsafe fn convert_frame_planar_to_s16be(
+    frame: *mut ff::AVFrame,
+    channels: u32,
+    nb_samples: u32,
+) -> Vec<u8> {
+    const SCALE: f32 = ((1i32 << 15) - 1) as f32; // 32767.0
+    let mut out = Vec::with_capacity((nb_samples * channels * 2) as usize); // 2 bytes per sample
+    unsafe {
+        // extended_data[ch] points to a plane of f32 (AV_SAMPLE_FMT_FLTP).
+        let ext = (*frame).extended_data;
+        for i in 0..nb_samples as isize { // sample-major order => channels end up interleaved
+            for ch in 0..channels as isize {
+                let plane = *ext.offset(ch) as *const f32;
+                let s = if plane.is_null() { 0.0 } else { *plane.offset(i) }; // null plane => silence
+                let clamped = s.clamp(-1.0, 1.0) * SCALE;
+                let v = clamped as i16; // float->int `as` truncates toward zero; NaN becomes 0
+                out.extend_from_slice(&v.to_be_bytes());
+            }
+        }
+    }
+    out
+}
+
+fn av_err(code: c_int) -> String {
+    unsafe {
+        let mut buf = [0i8; ff::AV_ERROR_MAX_STRING_SIZE as usize];
+        ff::av_strerror(code, buf.as_mut_ptr(), buf.len());
+        let cstr = std::ffi::CStr::from_ptr(buf.as_ptr());
+        cstr.to_string_lossy().into_owned()
+    }
+}
--- a/crates/xenia-apu/src/xma_decode.rs
+++ b/crates/xenia-apu/src/xma_decode.rs
@@ -0,0 +1,690 @@
+//! Stage 3 — the real XMA2→PCM decoder.
+//!
+//! A faithful port of xenia-canary's `apu/xma_context_new.cc` decode pipeline
+//! (`Work`/`Decode`/`Consume`/`StoreContextMerged`), adapted to the *mainline*
+//! distro FFmpeg `AV_CODEC_ID_XMA2` decoder rather than canary's vendored
+//! `AV_CODEC_ID_XMAFRAMES`.
+//!
+//! ## Determinism
+//! There is no host decoder thread. [`super::xma::XmaDecoder::decode_pending`]
+//! is invoked from the CPU scheduler's per-round coordinator
+//! (`coord_post_round` in xenia-app). FFmpeg decode is itself deterministic
+//! (same input bytes → same PCM), so the lockstep golden stays reproducible.
+//!
+//! ## FFmpeg framing — why this differs from canary
+//! Canary feeds FFmpeg one *frame* at a time (it bit-extracts a single 512-
+//! sample frame from the guest packet stream and hands it to the vendored
+//! `XMAFRAMES` codec with a custom 1-byte padding header). Mainline FFmpeg
+//! has no `XMAFRAMES` codec; its `xma2` decoder instead consumes whole 2 KB
+//! XMA2 *packets* (`block_align == 2048`), needs `extradata` declaring the
+//! stream/channel layout, and manages frame splitting + a per-stream sample
+//! FIFO internally. So this module keeps canary's *guest-facing* contract
+//! (the `XMA_CONTEXT_DATA` packet/frame bookkeeping, the 256-byte-block output
+//! ring buffer, the field writeback) but replaces canary's per-frame
+//! `Decode()` body with: feed the current 2 KB packet to the xma2 decoder,
+//! pull any 512-sample PCM frames it emits, convert them to interleaved S16BE,
+//! and stage them as the "raw frame" that `Consume()` drains into the output
+//! ring.
+//!
+//! See `xma2_codec.rs` for the unsafe FFmpeg wrapper.
+
+use std::collections::VecDeque;
+
+use xenia_memory::access::MemoryAccess;
+use xenia_memory::GuestMemory;
+
+use crate::xma2_codec::Xma2Codec;
+
+// ---- Constants (canary `XmaContext` / `XmaContextNew`).
+
+pub const BYTES_PER_PACKET: u32 = 2048;
+pub const BYTES_PER_PACKET_HEADER: u32 = 4;
+pub const BYTES_PER_PACKET_DATA: u32 = BYTES_PER_PACKET - BYTES_PER_PACKET_HEADER; // 2044
+pub const BITS_PER_PACKET: u32 = BYTES_PER_PACKET * 8; // 16384
+/// Canary `kBitsPerPacketHeader` (in the *new* context) is 32.
+pub const BITS_PER_PACKET_HEADER: u32 = 32;
+pub const BITS_PER_FRAME_HEADER: u32 = 15;
+
+pub const SAMPLES_PER_FRAME: u32 = 512;
+pub const BYTES_PER_SAMPLE: u32 = 2;
+pub const BYTES_PER_FRAME_CHANNEL: u32 = SAMPLES_PER_FRAME * BYTES_PER_SAMPLE; // 1024
+pub const OUTPUT_BYTES_PER_BLOCK: u32 = 256;
+pub const OUTPUT_MAX_SIZE_BYTES: u32 = 31 * OUTPUT_BYTES_PER_BLOCK; // 7936
+
+pub const MAX_FRAME_LENGTH: u32 = 0x7FFF; // all 15 frame-header bits set; ends the walk in `get_packet_info`
+pub const MAX_FRAME_SIZE_IN_BITS: u32 = 0x4000 - BITS_PER_PACKET_HEADER; // 16352
+
+const ID_TO_SAMPLE_RATE: [u32; 4] = [24000, 32000, 44100, 48000]; // Hz, indexed by the 2-bit `sample_rate` id
+
+/// Map a bare-physical XMA buffer pointer (`0x0xxxxxxx`) onto the host-backed
+/// guest VA window: keep the low 29 address bits and set bit 30. Per the
+/// original note this is the same formula as `xenia_gpu::physical_to_backing`,
+/// and context buffer pointers always fall in the low physical window.
+#[inline]
+pub fn xma_phys_to_backing(p: u32) -> u32 {
+    (p & 0x1FFF_FFFF) | 0x4000_0000
+}
+
+// ---- XMA_CONTEXT_DATA (canary `xma_context.h`, 64 bytes, 16 dwords).
+//
+// Stored big-endian in guest memory. We load all 16 dwords (BE) and unpack the
+// bitfields exactly per the canary layout (bitfields pack LSB-first within each
+// host-order dword). All fields below are kept as plain integers.
+
+#[derive(Clone, Copy, Debug, Default)]
+pub struct XmaContextData {
+    // DWORD 0
+    pub input_buffer_0_packet_count: u32, // :12
+    pub loop_count: u32,                  // :8
+    pub input_buffer_0_valid: u32,        // :1
+    pub input_buffer_1_valid: u32,        // :1
+    pub output_buffer_block_count: u32,   // :5
+    pub output_buffer_write_offset: u32,  // :5
+    // DWORD 1
+    pub input_buffer_1_packet_count: u32, // :12
+    pub loop_subframe_start: u32,         // :2
+    pub loop_subframe_end: u32,           // :3
+    pub loop_subframe_skip: u32,          // :3
+    pub subframe_decode_count: u32,       // :4
+    pub output_buffer_padding: u32,       // :3
+    pub sample_rate: u32,                 // :2
+    pub is_stereo: u32,                   // :1
+    pub unk_dword_1_c: u32,               // :1
+    pub output_buffer_valid: u32,         // :1
+    // DWORD 2
+    pub input_buffer_read_offset: u32, // :26
+    pub error_status: u32,             // :5
+    pub error_set: u32,                // :1
+    // DWORD 3
+    pub loop_start: u32,          // :26
+    pub parser_error_status: u32, // :5
+    pub parser_error_set: u32,    // :1
+    // DWORD 4
+    pub loop_end: u32,        // :26
+    pub packet_metadata: u32, // :5
+    pub current_buffer: u32,  // :1
+    // DWORD 5..8
+    pub input_buffer_0_ptr: u32,
+    pub input_buffer_1_ptr: u32,
+    pub output_buffer_ptr: u32,
+    pub work_buffer_ptr: u32,
+    // DWORD 9 (bits 5..29 are not modelled; neither are DWORDs 10..15)
+    pub output_buffer_read_offset: u32, // :5
+    pub stop_when_done: u32,            // :1 (bit 30)
+    pub interrupt_when_done: u32,       // :1 (bit 31)
+}
+
+#[inline]
+fn bits(v: u32, shift: u32, width: u32) -> u32 { // extract `width` bits of `v` starting at bit `shift`
+    (v >> shift) & ((1u64 << width) - 1) as u32 // 64-bit mask so a full `width == 32` does not overflow
+}
+
+impl XmaContextData {
+    /// Read the 64-byte context struct from guest VA `ctx_va` (already a VA,
+    /// not a physical ptr). Each dword is read big-endian via `read_u32`.
+    pub fn read(mem: &GuestMemory, ctx_va: u32) -> Self {
+        let mut d = [0u32; 16];
+        for (i, w) in d.iter_mut().enumerate() {
+            *w = mem.read_u32(ctx_va + (i as u32) * 4);
+        }
+        let mut c = Self::default();
+        // DWORD 0
+        c.input_buffer_0_packet_count = bits(d[0], 0, 12);
+        c.loop_count = bits(d[0], 12, 8);
+        c.input_buffer_0_valid = bits(d[0], 20, 1);
+        c.input_buffer_1_valid = bits(d[0], 21, 1);
+        c.output_buffer_block_count = bits(d[0], 22, 5);
+        c.output_buffer_write_offset = bits(d[0], 27, 5);
+        // DWORD 1
+        c.input_buffer_1_packet_count = bits(d[1], 0, 12);
+        c.loop_subframe_start = bits(d[1], 12, 2);
+        c.loop_subframe_end = bits(d[1], 14, 3);
+        c.loop_subframe_skip = bits(d[1], 17, 3);
+        c.subframe_decode_count = bits(d[1], 20, 4);
+        c.output_buffer_padding = bits(d[1], 24, 3);
+        c.sample_rate = bits(d[1], 27, 2);
+        c.is_stereo = bits(d[1], 29, 1);
+        c.unk_dword_1_c = bits(d[1], 30, 1);
+        c.output_buffer_valid = bits(d[1], 31, 1);
+        // DWORD 2
+        c.input_buffer_read_offset = bits(d[2], 0, 26);
+        c.error_status = bits(d[2], 26, 5);
+        c.error_set = bits(d[2], 31, 1);
+        // DWORD 3
+        c.loop_start = bits(d[3], 0, 26);
+        c.parser_error_status = bits(d[3], 26, 5);
+        c.parser_error_set = bits(d[3], 31, 1);
+        // DWORD 4
+        c.loop_end = bits(d[4], 0, 26);
+        c.packet_metadata = bits(d[4], 26, 5);
+        c.current_buffer = bits(d[4], 31, 1);
+        // DWORD 5..8
+        c.input_buffer_0_ptr = d[5];
+        c.input_buffer_1_ptr = d[6];
+        c.output_buffer_ptr = d[7];
+        c.work_buffer_ptr = d[8];
+        // DWORD 9 (bits 5..29 and dwords 10..15 are read but discarded)
+        c.output_buffer_read_offset = bits(d[9], 0, 5);
+        c.stop_when_done = bits(d[9], 30, 1);
+        c.interrupt_when_done = bits(d[9], 31, 1);
+        c
+    }
+
+    /// Repack the bitfields into 16 host-order dwords. Dwords 10..15 and the
+    /// unmodelled bits 5..29 of dword 9 come out as zero, so this is not a
+    /// lossless inverse of `read`; [`store_merged`] is the write-back path.
+    fn pack(&self) -> [u32; 16] {
+        let mut d = [0u32; 16];
+        d[0] = (self.input_buffer_0_packet_count & 0xFFF)
+            | ((self.loop_count & 0xFF) << 12)
+            | ((self.input_buffer_0_valid & 1) << 20)
+            | ((self.input_buffer_1_valid & 1) << 21)
+            | ((self.output_buffer_block_count & 0x1F) << 22)
+            | ((self.output_buffer_write_offset & 0x1F) << 27);
+        d[1] = (self.input_buffer_1_packet_count & 0xFFF)
+            | ((self.loop_subframe_start & 0x3) << 12)
+            | ((self.loop_subframe_end & 0x7) << 14)
+            | ((self.loop_subframe_skip & 0x7) << 17)
+            | ((self.subframe_decode_count & 0xF) << 20)
+            | ((self.output_buffer_padding & 0x7) << 24)
+            | ((self.sample_rate & 0x3) << 27)
+            | ((self.is_stereo & 1) << 29)
+            | ((self.unk_dword_1_c & 1) << 30)
+            | ((self.output_buffer_valid & 1) << 31);
+        d[2] = (self.input_buffer_read_offset & 0x3FF_FFFF)
+            | ((self.error_status & 0x1F) << 26)
+            | ((self.error_set & 1) << 31);
+        d[3] = (self.loop_start & 0x3FF_FFFF)
+            | ((self.parser_error_status & 0x1F) << 26)
+            | ((self.parser_error_set & 1) << 31);
+        d[4] = (self.loop_end & 0x3FF_FFFF)
+            | ((self.packet_metadata & 0x1F) << 26)
+            | ((self.current_buffer & 1) << 31);
+        d[5] = self.input_buffer_0_ptr;
+        d[6] = self.input_buffer_1_ptr;
+        d[7] = self.output_buffer_ptr;
+        d[8] = self.work_buffer_ptr;
+        d[9] = (self.output_buffer_read_offset & 0x1F)
+            | ((self.stop_when_done & 1) << 30)
+            | ((self.interrupt_when_done & 1) << 31);
+        d
+    }
+
+    pub fn is_input_buffer_valid(&self, idx: u32) -> bool { // any non-zero `idx` selects buffer 1
+        if idx == 0 {
+            self.input_buffer_0_valid != 0
+        } else {
+            self.input_buffer_1_valid != 0
+        }
+    }
+    pub fn is_current_input_buffer_valid(&self) -> bool {
+        self.is_input_buffer_valid(self.current_buffer)
+    }
+    pub fn is_any_input_buffer_valid(&self) -> bool {
+        self.input_buffer_0_valid != 0 || self.input_buffer_1_valid != 0
+    }
+    pub fn input_buffer_address(&self, idx: u32) -> u32 { // returns the pointer field as stored, untranslated
+        if idx == 0 {
+            self.input_buffer_0_ptr
+        } else {
+            self.input_buffer_1_ptr
+        }
+    }
+    pub fn current_input_buffer_address(&self) -> u32 {
+        self.input_buffer_address(self.current_buffer)
+    }
+    pub fn input_buffer_packet_count(&self, idx: u32) -> u32 {
+        if idx == 0 {
+            self.input_buffer_0_packet_count
+        } else {
+            self.input_buffer_1_packet_count
+        }
+    }
+    pub fn current_input_buffer_packet_count(&self) -> u32 {
+        self.input_buffer_packet_count(self.current_buffer)
+    }
+}
+
+/// Merge decoder-owned fields back into guest memory (canary `StoreContextMerged`).
+/// Re-reads the current context (game may have raced an update), overwrites only
+/// the decoder-owned fields, and writes back only dwords 0, 1, 2, 4 and 9: `pack`
+/// zeroes dwords 10..15 and dword 9 bits 5..29, so those are never written.
+fn store_merged(
+    mem: &GuestMemory,
+    ctx_va: u32,
+    data: &XmaContextData,
+    initial: &XmaContextData,
+) {
+    let mut fresh = XmaContextData::read(mem, ctx_va);
+    // DWORD 0
+    fresh.loop_count = data.loop_count;
+    fresh.output_buffer_write_offset = data.output_buffer_write_offset;
+    if initial.input_buffer_0_valid != 0 && data.input_buffer_0_valid == 0 {
+        fresh.input_buffer_0_valid = 0;
+    }
+    if initial.input_buffer_1_valid != 0 && data.input_buffer_1_valid == 0 {
+        fresh.input_buffer_1_valid = 0;
+    }
+    // DWORD 1
+    if initial.output_buffer_valid != 0 && data.output_buffer_valid == 0 {
+        fresh.output_buffer_valid = 0;
+    }
+    // DWORD 2
+    fresh.input_buffer_read_offset = data.input_buffer_read_offset;
+    fresh.error_status = data.error_status;
+    // DWORD 4
+    fresh.current_buffer = data.current_buffer;
+    let d = fresh.pack();
+    for i in [0u32, 1, 2, 4] {
+        mem.write_u32(ctx_va + i * 4, d[i as usize]);
+    }
+    let raw9 = mem.read_u32(ctx_va + 9 * 4); // DWORD 9: patch only the 5-bit read offset
+    mem.write_u32(ctx_va + 9 * 4, (raw9 & !0x1F) | (data.output_buffer_read_offset & 0x1F));
+}
+
+/// Public entry point that forwards all four arguments unchanged to [`store_merged`] (used from xma.rs per the original note).
+pub fn store_merged_pub(
+    mem: &GuestMemory,
+    ctx_va: u32,
+    data: &XmaContextData,
+    initial: &XmaContextData,
+) {
+    store_merged(mem, ctx_va, data, initial);
+}
+
+/// Number of bytes that can be written before the write cursor catches up with
+/// the read cursor; equal cursors count as an empty ring (canary `write_count`).
+pub fn ring_write_count(read_off: u32, write_off: u32, capacity: u32) -> u32 {
+    use std::cmp::Ordering;
+    match write_off.cmp(&read_off) {
+        Ordering::Equal => capacity,
+        Ordering::Less => read_off - write_off,
+        // Writer is ahead: free space runs to the end, then wraps to the reader.
+        Ordering::Greater => (capacity - write_off) + read_off,
+    }
+}
+
+/// Store `bytes` (at most `capacity` of them) into the guest ring at `backing`
+/// from `write_off`, wrapping at `capacity`; returns the new offset (canary `RingBuffer::Write`).
+pub fn ring_write(
+    mem: &GuestMemory,
+    backing: u32,
+    capacity: u32,
+    write_off: u32,
+    bytes: &[u8],
+) -> u32 {
+    let payload = &bytes[..(bytes.len() as u32).min(capacity) as usize];
+    if payload.is_empty() {
+        return write_off;
+    }
+    let end = write_off + payload.len() as u32;
+    if end < capacity {
+        mem.write_bytes(backing + write_off, payload);
+        return end;
+    }
+    // Reaches the end of the ring: fill the tail, then continue at offset 0.
+    let (tail, head) = payload.split_at((capacity - write_off) as usize);
+    mem.write_bytes(backing + write_off, tail);
+    mem.write_bytes(backing, head);
+    head.len() as u32
+}
+
+// ---- BitStream (port of canary `base/bit_stream.cc`). Big-endian source.
+
+pub struct BitStream<'a> {
+    buf: &'a [u8], // source bytes; `peek` loads them big-endian
+    offset_bits: usize, // read cursor in bits; `set_offset` keeps it <= `size_bits`
+    size_bits: usize, // stream length in bits
+}
+
+impl<'a> BitStream<'a> {
+    /// View `buf` as a bit stream; `size_bits` is clamped to the slice length so
+    /// a short buffer can never make `peek`/`copy` index past its end.
+    pub fn new(buf: &'a [u8], size_bits: usize) -> Self {
+        Self { buf, offset_bits: 0, size_bits: size_bits.min(buf.len() * 8) }
+    }
+    pub fn offset_bits(&self) -> usize {
+        self.offset_bits
+    }
+    pub fn set_offset(&mut self, off: usize) {
+        self.offset_bits = off.min(self.size_bits);
+    }
+    pub fn advance(&mut self, n: usize) {
+        self.set_offset(self.offset_bits + n);
+    }
+    pub fn bits_remaining(&self) -> usize {
+        self.size_bits - self.offset_bits
+    }
+    /// Peek up to 57 bits (canary contract) without advancing. Loads 8 bytes
+    /// big-endian (zero-padded if the buffer is shorter) and shifts the field
+    /// down; asking for more than `bits_remaining()` can overflow the shift.
+    pub fn peek(&self, num_bits: usize) -> u64 {
+        debug_assert!(num_bits <= 57);
+        // offset_bytes = min(offset>>3, (size-64)>>3), matching canary so an
+        // 8-byte load near the buffer end stays in range.
+        let max_byte = self.size_bits.saturating_sub(64) >> 3;
+        let offset_bytes = (self.offset_bits >> 3).min(max_byte);
+        let rel = self.offset_bits - (offset_bytes << 3);
+        let mut tmp = [0u8; 8];
+        let avail = self.buf.len().saturating_sub(offset_bytes).min(8);
+        tmp[..avail].copy_from_slice(&self.buf[offset_bytes..offset_bytes + avail]);
+        let mut value = u64::from_be_bytes(tmp);
+        value >>= 64 - (rel + num_bits);
+        value &= (1u64 << num_bits) - 1;
+        value
+    }
+    pub fn read(&mut self, num_bits: usize) -> u64 {
+        let v = self.peek(num_bits);
+        self.advance(num_bits);
+        v
+    }
+    /// Copy `num_bits` from the stream into `dest` (bit-packed, MSB-first within
+    /// each byte). Returns the starting bit offset within the first byte
+    /// (canary returns `rel_offset_bits` — the frame's intra-byte alignment).
+    pub fn copy(&mut self, dest: &mut [u8], num_bits: usize) -> usize {
+        let offset_bytes = self.offset_bits >> 3;
+        let rel = self.offset_bits - (offset_bytes << 3);
+        let mut bits_left = num_bits;
+        let mut out = 0usize;
+
+        if rel != 0 {
+            let bits = self.peek(8 - rel) as u8;
+            let clear_mask = !((1u8 << rel) - 1);
+            dest[out] &= clear_mask;
+            dest[out] |= bits;
+            bits_left -= 8 - rel;
+            self.advance(8 - rel);
+            out += 1;
+        }
+        if bits_left >= 8 {
+            let nbytes = bits_left / 8;
+            let src_off = (self.offset_bits >> 3).min(self.buf.len());
+            let copy = nbytes.min(self.buf.len().saturating_sub(src_off));
+            dest[out..out + copy]
+                .copy_from_slice(&self.buf[src_off..src_off + copy]);
+            out += nbytes;
+            self.advance(nbytes * 8);
+            bits_left -= nbytes * 8;
+        }
+        if bits_left != 0 {
+            let mut b = self.peek(bits_left) as u8;
+            b <<= 8 - bits_left;
+            let clear_mask = ((1u16 << bits_left) - 1) as u8;
+            dest[out] &= clear_mask;
+            dest[out] |= b;
+            self.advance(bits_left);
+        }
+        rel
+    }
+}
+
+// ---- XMA packet header helpers (canary `xma_helpers.h`).
+
+#[inline]
+pub fn packet_frame_count(packet: &[u8]) -> u8 {
+    packet[0] >> 2 // top 6 bits of header byte 0
+}
+#[inline]
+pub fn packet_metadata(packet: &[u8]) -> u8 {
+    packet[2] & 0x7 // low 3 bits of header byte 2
+}
+#[inline]
+pub fn is_packet_xma2(packet: &[u8]) -> bool {
+    packet_metadata(packet) == 1 // metadata value 1 is treated as XMA2
+}
+#[inline]
+pub fn packet_skip_count(packet: &[u8]) -> u8 {
+    packet[3] // header byte 3, returned as-is
+}
+/// Bit offset of the packet's first frame (canary `GetPacketFrameOffset`): the
+/// 15-bit field spread over header bytes 0..2, plus the 32-bit packet header.
+#[inline]
+pub fn packet_frame_offset(packet: &[u8]) -> u32 {
+    let hi = (u32::from(packet[0]) & 0x3) << 13;
+    let mid = u32::from(packet[1]) << 5;
+    let lo = u32::from(packet[2]) >> 3;
+    // The mask is a no-op for a 15-bit value; kept to mirror the original.
+    ((hi | mid | lo) & 0xFFFF) + 32
+}
+
+/// Sample-rate id → Hz; ids above 3 are clamped to the last table entry (48000).
+pub fn sample_rate_hz(id: u32) -> u32 {
+    ID_TO_SAMPLE_RATE[id.min(3) as usize]
+}
+
+// ---- Packet-walk for faithful input-offset advance (canary `GetPacketInfo`,
+// `GetNextPacketReadOffset`, and the offset arithmetic at the tail of
+// `XmaContextNew::Decode`). These let us advance `input_buffer_read_offset` one
+// *frame* at a time at canary's exact cadence — independent of the mainline
+// xma2 decoder's whole-packet/burst framing — so the offset crosses packet and
+// buffer boundaries (and triggers SwapInputBuffer) at the true input-drain
+// rate the guest's WMV demuxer polls.
+
+/// Info about the frame at a given bit offset within a packet (canary
+/// `kPacketInfo` / `GetPacketInfo`). `frame_count` is the number of frames
+/// that begin in the packet; `current_frame_size` is the compressed bit size
+/// of the frame at `frame_offset` (0 if it can't be resolved within this
+/// packet — a split header); `current_frame` is that frame's index.
+#[derive(Default, Clone, Copy)]
+pub struct PacketInfo {
+    pub frame_count: u32,
+    pub current_frame: u32,
+    pub current_frame_size: u32,
+}
+
+impl PacketInfo {
+    pub fn is_last_frame_in_packet(&self) -> bool {
+        self.current_frame + 1 == self.frame_count // true when `current_frame` is the final index
+    }
+}
+
+/// Port of canary `XmaContextNew::GetPacketInfo`; panics if `packet` has fewer than 3 bytes.
+pub fn get_packet_info(packet: &[u8], frame_offset: u32) -> PacketInfo {
+    let mut info = PacketInfo::default();
+    let first_frame_offset = packet_frame_offset(packet);
+    let mut stream = BitStream::new(packet, BITS_PER_PACKET as usize);
+    stream.set_offset(first_frame_offset as usize);
+
+    // Offset lies before the first frame header: tail of a frame split across packets.
+    if frame_offset < first_frame_offset {
+        info.current_frame = 0;
+        info.current_frame_size = first_frame_offset - frame_offset;
+    }
+
+    loop {
+        if stream.bits_remaining() < BITS_PER_FRAME_HEADER as usize {
+            break;
+        }
+        let frame_size = stream.peek(BITS_PER_FRAME_HEADER as usize) as u32;
+        if frame_size == 0 || frame_size == MAX_FRAME_LENGTH {
+            break;
+        }
+        if stream.offset_bits() == frame_offset as usize {
+            info.current_frame = info.frame_count;
+            info.current_frame_size = frame_size;
+        }
+        info.frame_count += 1;
+        if frame_size as usize > stream.bits_remaining() {
+            // Last frame: it runs past the end of this packet.
+            break;
+        }
+        stream.advance((frame_size - 1) as usize);
+        // Trailing continuation bit: 0 means no further frame follows.
+        if stream.read(1) == 0 {
+            break;
+        }
+    }
+
+    if is_packet_xma2(packet) {
+        let xma2_frame_count = packet_frame_count(packet) as u32;
+        if xma2_frame_count > info.frame_count {
+            if info.current_frame_size == 0 {
+                info.current_frame = info.frame_count;
+            }
+            info.frame_count = xma2_frame_count;
+        }
+    }
+    info
+}
+
+/// Index of the 2 KB packet containing `bit_offset` (canary `GetPacketNumber`).
+/// `None` when the offset is inside the leading header or at/after the buffer end.
+pub fn packet_number(size_bytes: u32, bit_offset: u32) -> Option<u32> {
+    // `||` short-circuits exactly like the original pair of early returns.
+    let out_of_range = bit_offset < BITS_PER_PACKET_HEADER || bit_offset >= size_bytes * 8;
+    if out_of_range {
+        return None;
+    }
+    let byte_offset = bit_offset >> 3;
+    Some(byte_offset / BYTES_PER_PACKET)
+}
+
+/// Smaller of the stream's remaining bits and `frame_size` (canary `GetAmountOfBitsToRead`).
+pub fn amount_of_bits_to_read(remaining_stream_bits: u32, frame_size: u32) -> u32 {
+    std::cmp::min(remaining_stream_bits, frame_size)
+}
+
+// ---- Per-context decode state (lives in the XmaDecoder, one per ctx).
+
+#[derive(Default)]
+pub struct ContextDecodeState {
+    /// FFmpeg xma2 codec for this context (lazily created / reconfigured).
+    pub codec: Option<Xma2Codec>,
+    pub codec_rate: u32, // presumably the sample rate `codec` was opened with — TODO confirm
+    pub codec_channels: u32, // presumably the channel count `codec` was opened with — TODO confirm
+    /// Staged interleaved S16BE PCM for the current decoded frame
+    /// (`raw_frame_`), drained by Consume in 256-byte blocks.
+    pub raw_frame: Vec<u8>, // empty under the derived `Default`; `new()` pre-sizes it
+    /// Decoded interleaved S16BE PCM not yet split into per-frame `raw_frame`s.
+    /// The mainline xma2 decoder emits bursts of many 512-sample frames at once
+    /// (internal FIFO + 4096-sample lookahead); we queue the bytes here and
+    /// hand the guest exactly one 512-sample frame per `produce_frame`.
+    pub pcm_queue: VecDeque<u8>,
+    pub current_frame_remaining_subframes: u8,
+    pub remaining_subframe_blocks_in_output: i32,
+    /// Total 512-sample frames decoded for this context (diagnostic).
+    pub frames_decoded: u64,
+    /// Whether a "first frame" diagnostic has been emitted.
+    pub first_frame_logged: bool,
+    /// FFmpeg feed cursor: the next packet index (within the *current* input
+    /// buffer at feed time) we will hand to FFmpeg. This is the decoder's
+    /// internal intake position and is intentionally decoupled from the
+    /// guest-visible `input_buffer_read_offset` (which advances per *emitted*
+    /// frame via the faithful packet-walk). We feed ahead so FFmpeg always has
+    /// enough buffered input to satisfy the guest's drain, while the guest sees
+    /// the read offset move at canary's true per-frame cadence.
+    pub feed_packet_index: u32,
+    /// `current_buffer` the feed cursor is reading from; reset on swap so the
+    /// feed follows the same ping-pong as the guest-visible buffer.
+    pub feed_buffer: u32,
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// `pack` must place each field asserted below at the bit offset `read`
+    /// decodes it from (regression against a shifted bit).
+    #[test]
+    fn context_bitfields_round_trip() {
+        let mut c = XmaContextData::default();
+        c.input_buffer_0_packet_count = 632;
+        c.loop_count = 0;
+        c.input_buffer_0_valid = 1;
+        c.input_buffer_1_valid = 0;
+        c.output_buffer_block_count = 30;
+        c.output_buffer_write_offset = 5;
+        c.subframe_decode_count = 8;
+        c.output_buffer_padding = 1;
+        c.sample_rate = 3;
+        c.is_stereo = 1;
+        c.output_buffer_valid = 1;
+        c.input_buffer_read_offset = 16416;
+        c.error_status = 4;
+        c.current_buffer = 1;
+        c.input_buffer_0_ptr = 0x0b9f_d000;
+        c.output_buffer_ptr = 0x01f6_6e00;
+        c.output_buffer_read_offset = 7;
+        c.interrupt_when_done = 1;
+
+        // pack → words → re-decode with the same shifts/widths as `read()`.
+        let d = c.pack();
+        // Guest memory is not involved; the pointer dwords and loop_count are set above but not re-checked.
+        let mut c2 = XmaContextData::default();
+        c2.input_buffer_0_packet_count = bits(d[0], 0, 12);
+        c2.input_buffer_0_valid = bits(d[0], 20, 1);
+        c2.output_buffer_block_count = bits(d[0], 22, 5);
+        c2.output_buffer_write_offset = bits(d[0], 27, 5);
+        c2.subframe_decode_count = bits(d[1], 20, 4);
+        c2.output_buffer_padding = bits(d[1], 24, 3);
+        c2.sample_rate = bits(d[1], 27, 2);
+        c2.is_stereo = bits(d[1], 29, 1);
+        c2.output_buffer_valid = bits(d[1], 31, 1);
+        c2.input_buffer_read_offset = bits(d[2], 0, 26);
+        c2.error_status = bits(d[2], 26, 5);
+        c2.current_buffer = bits(d[4], 31, 1);
+        c2.output_buffer_read_offset = bits(d[9], 0, 5);
+        c2.interrupt_when_done = bits(d[9], 31, 1);
+
+        assert_eq!(c2.input_buffer_0_packet_count, 632);
+        assert_eq!(c2.input_buffer_0_valid, 1);
+        assert_eq!(c2.output_buffer_block_count, 30);
+        assert_eq!(c2.output_buffer_write_offset, 5);
+        assert_eq!(c2.subframe_decode_count, 8);
+        assert_eq!(c2.output_buffer_padding, 1);
+        assert_eq!(c2.sample_rate, 3);
+        assert_eq!(c2.is_stereo, 1);
+        assert_eq!(c2.output_buffer_valid, 1);
+        assert_eq!(c2.input_buffer_read_offset, 16416);
+        assert_eq!(c2.error_status, 4);
+        assert_eq!(c2.current_buffer, 1);
+        assert_eq!(c2.output_buffer_read_offset, 7);
+        assert_eq!(c2.interrupt_when_done, 1);
+    }
+
+    #[test]
+    fn phys_to_backing_projects_physical_window() {
+        assert_eq!(xma_phys_to_backing(0x0b9f_d000), 0x4b9f_d000);
+        assert_eq!(xma_phys_to_backing(0x01f6_6e00), 0x41f6_6e00);
+    }
+
+    #[test]
+    fn ring_write_count_matches_canary() {
+        // empty (read==write) → full capacity.
+        assert_eq!(ring_write_count(0, 0, 7680), 7680);
+        // write ahead of read.
+        assert_eq!(ring_write_count(0, 256, 7680), 7680 - 256);
+        // write wrapped behind read.
+        assert_eq!(ring_write_count(512, 256, 7680), 256);
+    }
+
+    #[test]
+    fn packet_header_helpers() {
+        // Matches the observed first packet word 0x08000000: byte0=0x08.
+        let pkt = [0x08u8, 0x00, 0x00, 0x00];
+        assert_eq!(packet_frame_count(&pkt), 2); // 0x08>>2 = 2
+        // frame offset: ((0x08&3)<<13 | 0<<5 | 0x00>>3) + 32 = 32.
+        assert_eq!(packet_frame_offset(&pkt), 32);
+        // A non-zero byte2 shifts the offset: byte2 = 0x08, 0x08>>3 = 1 → +1.
+        let pkt2 = [0x08u8, 0x00, 0x08, 0x00];
+        assert_eq!(packet_frame_offset(&pkt2), 33);
+    }
+}
+
+impl ContextDecodeState {
+    /// Fresh per-context decode state.
+    ///
+    /// No codec is attached yet (`codec_rate`/`codec_channels` are 0), the PCM
+    /// queue is empty, and every counter, flag and feed cursor starts at zero.
+    ///
+    /// This matches the derived `Default` field for field except `raw_frame`,
+    /// which is pre-sized to `BYTES_PER_FRAME_CHANNEL * 2` zero bytes here
+    /// instead of starting empty.
+    pub fn new() -> Self {
+        let raw_frame = vec![0u8; (BYTES_PER_FRAME_CHANNEL * 2) as usize];
+        Self {
+            raw_frame,
+            ..Self::default()
+        }
+    }
+}
--- a/crates/xenia-cpu/src/dispatch_rec.rs
+++ b/crates/xenia-cpu/src/dispatch_rec.rs
@@ -0,0 +1,217 @@
+//! Runtime indirect-dispatch recorder.
+//!
+//! A reusable, env-gated facility that captures every indirect call performed
+//! through CTR (`bcctr`/`bcctrl`/`bctr`) as a unique `(call_site_pc ->
+//! target_pc)` pair, together with the last-seen object register `r3`, the
+//! last-seen `lr`, and a hit count. It exists to provide GROUND-TRUTH indirect-dispatch
+//! resolution for reverse-engineering vtable dispatch that the static
+//! analyzer fails to resolve (e.g. the Sylpheed movie engine vtable
+//! `0x8200a908`).
+//!
+//! ## Gating & overhead
+//! Recording is OFF by default. It is enabled only when the environment
+//! variable `XENIA_DISPATCH_REC` is set to a non-empty, non-`0` value at
+//! process start. When OFF, [`record`] is a single relaxed atomic-bool load
+//! followed by an early return — no allocation, no locking, no behavior
+//! change. The recorder is pure: it never reads the clock, never touches
+//! scheduling, and never mutates guest/CPU state, so enabling it does not
+//! perturb deterministic runs (only adds a HashMap insert behind a mutex).
+//!
+//! ## Focus filters (optional)
+//! Two env vars narrow what is recorded (both default to "record everything"):
+//! - `XENIA_DISPATCH_REC_TARGETS=0x82505c08,...` — only edges whose resolved
+//!   target is in the list. Answers "who calls `<target>`": every recorded
+//!   edge then carries the caller `site` and `lr`.
+//! - `XENIA_DISPATCH_REC_SITES=0x825078d8,...` — only edges from the listed
+//!   call-site PCs.
+//! When both are set, an edge must satisfy BOTH. These keep a long focused
+//! run (e.g. the intro-movie trace) producing a small, relevant table instead
+//! of the whole program-wide dispatch set. Pure observe-only — filtering only
+//! affects which edges are stored, never guest/CPU state.
+//!
+//! ## Output
+//! On [`dump`] (call at end-of-run) the table is written to the path in
+//! `XENIA_DISPATCH_REC_OUT` (default `/tmp/dispatch_rec.txt`): a `#` header
+//! line, then one record per line sorted by descending hit count, as
+//! `callsite_pc target_pc count r3=<obj> lr=<lr>` (hex; `count` is decimal).
+
+use std::collections::HashMap;
+use std::sync::atomic::{AtomicBool, Ordering};
+use std::sync::Mutex;
+use std::sync::OnceLock;
+
+/// Enabled flag, resolved once from the environment by the first [`install`] call.
+static ENABLED: OnceLock<bool> = OnceLock::new();
+/// Fast-path mirror of `ENABLED` so the hot path is a single relaxed load
+/// (avoids the `OnceLock` get + deref on every indirect branch when OFF).
+static ENABLED_FAST: AtomicBool = AtomicBool::new(false);
+
+/// One observed indirect-dispatch edge.
+#[derive(Default, Clone, Copy)]
+struct Edge {
+    count: u64, // number of times this (site, target) pair was recorded
+    /// Last-seen object register (`r3`) at this (site,target) edge. Stable for
+    /// a vtable dispatch where the same call site always dispatches on the
+    /// same kind of object.
+    last_r3: u64,
+    /// Last-seen `lr` value passed to [`record`] for this edge.
+    last_lr: u64,
+}
+
+/// (call_site_pc, target_pc) -> Edge; created lazily when the first edge is stored.
+static TABLE: OnceLock<Mutex<HashMap<(u32, u32), Edge>>> = OnceLock::new();
+
+/// Optional focus filters, resolved once from the environment. When either is
+/// non-empty, an edge is recorded only if its `target` is in `TARGET_FILTER`
+/// (when that set is non-empty) AND its `site` is in `SITE_FILTER` (when that
+/// set is non-empty). Empty sets mean "no constraint on that axis". This lets
+/// a long focused run (e.g. the intro-movie trace) record ONLY the dispatch
+/// edges relevant to a target-set under investigation — for example "every
+/// indirect call whose target is the XMV submit `sub_82505C08`", which answers
+/// the milestone-2 "who calls submit on the engine" question with the caller
+/// `lr` — instead of the whole (large) program-wide dispatch table.
+static TARGET_FILTER: OnceLock<Vec<u32>> = OnceLock::new(); // sorted + deduped
+static SITE_FILTER: OnceLock<Vec<u32>> = OnceLock::new(); // sorted + deduped
+
+/// Parse a comma-separated list of hex PCs (`0x`/`0X` prefix optional) into a
+/// sorted, deduped Vec. Empty, non-hex and out-of-`u32`-range tokens are skipped.
+fn parse_pc_list_str(s: &str) -> Vec<u32> {
+    let mut v: Vec<u32> = s
+        .split(',')
+        .map(str::trim) // tolerate whitespace around each token
+        .filter(|t| !t.is_empty())
+        .filter_map(|t| {
+            let hex = t.strip_prefix("0x").or_else(|| t.strip_prefix("0X")).unwrap_or(t);
+            u32::from_str_radix(hex, 16).ok() // parse failure → token dropped
+        })
+        .collect();
+    v.sort_unstable(); // sorted so `record` can binary_search the filter
+    v.dedup(); // removes adjacent duplicates, which is all of them after the sort
+    v
+}
+
+/// Parse a PC list from an env var. Missing or unreadable var → empty Vec (no constraint).
+fn parse_pc_list(var: &str) -> Vec<u32> {
+    match std::env::var(var) {
+        Ok(s) => parse_pc_list_str(&s),
+        Err(_) => Vec::new(), // any `VarError` is treated the same as "unset"
+    }
+}
+
+/// Resolve the focus filters and the enabled flag from the environment
+/// exactly once (driven by [`install`]). Returns the resolved flag.
+fn init_enabled() -> bool {
+    // Install the filters BEFORE raising the flag: `record` treats a filter
+    // that is not yet set as "no constraint", so raising the flag first would
+    // open a window in which a concurrent recorder stores unfiltered edges.
+    let _ = TARGET_FILTER.set(parse_pc_list("XENIA_DISPATCH_REC_TARGETS"));
+    let _ = SITE_FILTER.set(parse_pc_list("XENIA_DISPATCH_REC_SITES"));
+    let on = matches!(std::env::var("XENIA_DISPATCH_REC"), Ok(v) if !v.is_empty() && v != "0");
+    ENABLED_FAST.store(on, Ordering::Relaxed);
+    on
+}
+
+/// Whether recording is enabled. Always `false` until [`install`] has run.
+#[inline(always)]
+pub fn enabled() -> bool {
+    // Hot path: relaxed atomic load. ENABLED_FAST is set by `init_enabled`,
+    // which runs on the first `install` call; until then it is `false`, which
+    // is also the correct default. Startup is expected to call `install` early.
+    ENABLED_FAST.load(Ordering::Relaxed)
+}
+
+/// Force the env resolution (call once early in startup). Idempotent.
+pub fn install() {
+    let _ = ENABLED.get_or_init(init_enabled); // later calls reuse the cached result
+}
+
+/// Record one indirect (CTR) call edge. No-op when disabled.
+///
+/// `site` = PC of the `bcctr`/`bctr` instruction, `target` = resolved CTR
+/// target, `r3` = object register, `lr` = link register as sampled by the caller.
+#[inline(always)]
+pub fn record(site: u32, target: u32, r3: u64, lr: u64) {
+    // Single predictable branch when OFF (also taken until `install` has run).
+    if !ENABLED_FAST.load(Ordering::Relaxed) {
+        return;
+    }
+    // Focus filters (only consulted when recording is ON, i.e. rare). An empty
+    // or not-yet-set filter imposes no constraint on its axis.
+    if let Some(targets) = TARGET_FILTER.get()
+        && !targets.is_empty()
+        && targets.binary_search(&target).is_err()
+    {
+        return;
+    }
+    if let Some(sites) = SITE_FILTER.get()
+        && !sites.is_empty()
+        && sites.binary_search(&site).is_err()
+    {
+        return;
+    }
+    let table = TABLE.get_or_init(|| Mutex::new(HashMap::new()));
+    if let Ok(mut t) = table.lock() { // a poisoned lock drops this sample silently
+        let e = t.entry((site, target)).or_default();
+        e.count += 1;
+        e.last_r3 = r3; // overwritten on every hit: only the latest value is kept
+        e.last_lr = lr;
+    }
+}
+
+/// Write the recorded table to the output file. No-op when disabled or nothing was recorded.
+pub fn dump() {
+    if !enabled() {
+        return;
+    }
+    let path = std::env::var("XENIA_DISPATCH_REC_OUT")
+        .unwrap_or_else(|_| "/tmp/dispatch_rec.txt".to_string());
+    let table = match TABLE.get() {
+        Some(t) => t,
+        None => return, // no edge was ever stored: no file is written
+    };
+    let guard = match table.lock() {
+        Ok(g) => g,
+        Err(_) => return, // poisoned lock: skip the dump silently
+    };
+    let mut rows: Vec<((u32, u32), Edge)> =
+        guard.iter().map(|(k, v)| (*k, *v)).collect();
+    // Deterministic order: count desc, then site, then target.
+    rows.sort_by(|a, b| {
+        b.1.count
+            .cmp(&a.1.count)
+            .then(a.0 .0.cmp(&b.0 .0))
+            .then(a.0 .1.cmp(&b.0 .1))
+    });
+    let mut out = String::with_capacity(rows.len() * 48); // size hint only; String grows
+    out.push_str("# callsite_pc target_pc count r3 lr\n");
+    for ((site, target), e) in rows {
+        out.push_str(&format!(
+            "{:#010x} {:#010x} {} r3={:#018x} lr={:#018x}\n",
+            site, target, e.count, e.last_r3, e.last_lr
+        ));
+    }
+    if let Err(err) = std::fs::write(&path, out) {
+        eprintln!("dispatch_rec: failed to write {}: {}", path, err);
+    } else {
+        eprintln!("dispatch_rec: wrote {} edges to {}", guard.len(), path);
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::parse_pc_list_str;
+
+    #[test]
+    fn parse_pc_list_handles_prefixes_whitespace_and_dedup() {
+        // Mixed 0x / bare hex, surrounding whitespace, an empty token, and a
+        // duplicate. Result is sorted + deduped; the non-hex token `zzz` is dropped.
+        let got = parse_pc_list_str(" 0x82505c08 , 825078d8,, 82505c08 , zzz ");
+        assert_eq!(got, vec![0x82505c08, 0x825078d8]);
+    }
+
+    #[test]
+    fn parse_pc_list_empty_is_no_constraint() {
+        assert!(parse_pc_list_str("").is_empty());
+        assert!(parse_pc_list_str("  ,  , ").is_empty()); // whitespace-only tokens
+    }
+}
--- a/crates/xenia-cpu/src/interpreter.rs
+++ b/crates/xenia-cpu/src/interpreter.rs
@@ -1012,7 +1012,13 @@ fn execute(ctx: &mut PpcContext, mem: &dyn MemoryAccess, instr: &DecodedInstr) -

            if cond_ok {
                let next_pc = ctx.pc + 4;
-                ctx.pc = (ctx.ctr as u32) & !3;
+                let target = (ctx.ctr as u32) & !3;
+                // Ground-truth indirect-dispatch recording (env-gated, off by
+                // default; pure record-only, no scheduling/state change).
+                if crate::dispatch_rec::enabled() {
+                    crate::dispatch_rec::record(ctx.pc, target, ctx.gpr[3], ctx.lr);
+                }
+                ctx.pc = target;
                if instr.lk() {
                    ctx.lr = next_pc as u64;
                }
--- a/crates/xenia-cpu/src/lib.rs
+++ b/crates/xenia-cpu/src/lib.rs
@@ -1,6 +1,7 @@
 pub mod block_cache;
 pub mod context;
 pub mod decoder;
+pub mod dispatch_rec;
 pub mod disasm;
 pub mod fpscr;
 pub mod interpreter;
--- a/crates/xenia-cpu/src/scheduler.rs
+++ b/crates/xenia-cpu/src/scheduler.rs
@@ -205,6 +205,21 @@ pub enum BlockReason {
    CriticalSection(u32),
 }

+/// Floor of the **synthetic park-handle** range. Handles at or above this
+/// value are deliberately OUTSIDE the kernel object allocator (which starts
+/// at `0x1000`); they are used to park threads that must NEVER be woken by
+/// the normal signal/wait machinery — currently the dedicated audio-worker
+/// threads (`xenia_kernel::xaudio::XAUDIO_SYNTHETIC_HANDLE_BASE = 0xF000_0000`),
+/// which are only ever un-parked by audio-callback injection. The deadlock
+/// force-wake ([`Scheduler::unblock_on_deadlock`]) must skip waiters parked
+/// solely on such handles: they are not deadlock participants (the guest
+/// genuinely blocked on its own objects), and waking one runs its thread
+/// entry to the `LR_HALT` sentinel → premature exit, which then drops every
+/// subsequent injection. Kept in `xenia-cpu` (not imported from
+/// `xenia-kernel`, which depends on this crate); the kernel const must stay
+/// within `[SYNTHETIC_PARK_HANDLE_FLOOR, u32::MAX]`.
+pub const SYNTHETIC_PARK_HANDLE_FLOOR: u32 = 0xF000_0000;
+
 /// Sink for PCR+0x2C writes — the scheduler writes the guest-visible
 /// current-processor-id here at spawn and Axis 4 rewrites on affinity
 /// migration. Implemented by `xenia-kernel` for `GuestMemory`; keeping it
@@ -1399,6 +1414,27 @@ impl Scheduler {
        let mut woken = Vec::new();
        for (hw_id, slot) in self.slots.iter_mut().enumerate() {
            for (idx, t) in slot.runqueue.iter_mut().enumerate() {
+                // Skip threads parked SOLELY on synthetic park-handles
+                // (audio workers). They are not deadlock participants — the
+                // guest blocked on its own objects — and waking one runs its
+                // thread entry to the LR_HALT sentinel, exiting it and
+                // dropping every subsequent audio-callback injection. Only
+                // audio-callback injection may un-park them. A wait whose
+                // handle set mixes synthetic and real handles is still
+                // eligible (the real handle makes it a genuine waiter).
+                let synthetic_park = match &t.state {
+                    HwState::Blocked(BlockReason::WaitAny { handles, .. })
+                    | HwState::Blocked(BlockReason::WaitAll { handles, .. }) => {
+                        !handles.is_empty()
+                            && handles
+                                .iter()
+                                .all(|&h| h >= SYNTHETIC_PARK_HANDLE_FLOOR)
+                    }
+                    _ => false,
+                };
+                if synthetic_park {
+                    continue;
+                }
                if matches!(
                    t.state,
                    HwState::Blocked(BlockReason::WaitAny { .. })
@@ -1485,6 +1521,41 @@ mod tests {
        }
    }

+    #[test]
+    fn unblock_on_deadlock_skips_synthetic_park_waiters() {
+        // The audio worker parks on a synthetic handle (>= FLOOR) and must
+        // survive the deadlock force-wake; a peer parked on a real handle
+        // must be woken. Regression for the milestone-2 stall where the
+        // force-wake destroyed the audio worker → all callbacks dropped.
+        let mut s = mk_scheduler_with_initial();
+        s.spawn(worker_spawn_params(2, 0x2000), &mut NullPcr).unwrap();
+        s.spawn(worker_spawn_params(3, 0x2010), &mut NullPcr).unwrap();
+        // NOTE(review): assumes the two spawns land on hw slots 1 and 2 — confirm.
+        let audio = ThreadRef { hw_id: 1, idx: 0, generation: 0 };
+        let real = ThreadRef { hw_id: 2, idx: 0, generation: 0 };
+        s.thread_mut(audio).state = HwState::Blocked(BlockReason::WaitAny {
+            handles: vec![SYNTHETIC_PARK_HANDLE_FLOOR],
+            deadline: None,
+        });
+        s.thread_mut(real).state = HwState::Blocked(BlockReason::WaitAny {
+            handles: vec![0x1234],
+            deadline: None,
+        });
+        let woken = s.unblock_on_deadlock();
+        assert!(
+            woken.contains(&real),
+            "real-handle waiter must be force-woken"
+        );
+        assert!(
+            !woken.contains(&audio),
+            "synthetic-park audio worker must NOT be force-woken"
+        );
+        assert!(matches!(
+            s.thread(audio).state,
+            HwState::Blocked(BlockReason::WaitAny { .. })
+        ));
+        assert_eq!(s.thread(real).state, HwState::Ready);
+    }
+
    // ---- preserved from pre-Axis-1 (updated names and params) ----

    #[test]
--- a/crates/xenia-kernel/Cargo.toml
+++ b/crates/xenia-kernel/Cargo.toml
@@ -11,6 +11,7 @@ xenia-cpu = { workspace = true }
 xenia-vfs = { workspace = true }
 xenia-hid = { workspace = true }
 xenia-gpu = { workspace = true }
+xenia-apu = { workspace = true }
 tracing = { workspace = true }
 metrics = { workspace = true }
 thiserror = { workspace = true }
--- a/crates/xenia-kernel/src/exports.rs
+++ b/crates/xenia-kernel/src/exports.rs
@@ -182,7 +182,7 @@ pub fn register_exports(state: &mut KernelState) {
    state.register_export(Xboxkrnl, 0x01F7, "XAudioGetVoiceCategoryVolumeChangeMask", stub_return_zero);
    state.register_export(Xboxkrnl, 0x01F8, "XAudioGetVoiceCategoryVolume", stub_success);
    state.register_export(Xboxkrnl, 0x0224, "XMACreateContext", xma_create_context);
-    state.register_export(Xboxkrnl, 0x0226, "XMAReleaseContext", stub_success);
+    state.register_export(Xboxkrnl, 0x0226, "XMAReleaseContext", xma_release_context);

    // Crypto
    state.register_export(Xboxkrnl, 0x0192, "XeCryptSha", stub_success);
@@ -3398,6 +3398,7 @@ fn xaudio_register_render_driver(ctx: &mut PpcContext, mem: &GuestMemory, state:
        callback_pc,
        callback_arg,
        wrapped_callback_arg: wrapped,
+        submitted_frames: 0,
    };
    let Some(index) = state.xaudio.register(client) else {
        tracing::warn!("XAudioRegisterRenderDriverClient: client table full");
@@ -3506,18 +3507,75 @@ fn xaudio_unregister_render_driver(ctx: &mut PpcContext, _mem: &GuestMemory, sta
    ctx.gpr[3] = 0;
 }

+/// Mirrors canary `XAudioSubmitRenderDriverFrame_entry` →
+/// `AudioSystem::SubmitFrame(driver_ptr & 0xFFFF, samples)`:
+/// the guest render-driver mixer (`sub_824DC350`) calls this once per audio
+/// frame with `r3 = driver_id` (`0x4155_xxxx`) and `r4 = sample buffer`.
+/// Canary forwards `samples` to the client's `AudioDriver`; the driver's
+/// playback-completion callback later releases the client semaphore, which is
+/// the buffer-consumed pacing our XAudio callback ticker
+/// (`tick_instr` + `try_inject_audio_callback`) already drives. SubmitFrame
+/// returns void and the caller discards r3 / reads no field SubmitFrame
+/// writes, so faithfully we validate the client index and account the frame
+/// (observational; never read back by the guest). Always returns
+/// `X_ERROR_SUCCESS`, matching canary. Deterministic: only this guest-driven
+/// export mutates state; no wall-clock, no host thread.
 fn xaudio_submit_render_driver_frame(
     ctx: &mut PpcContext,
     _mem: &GuestMemory,
-    _state: &mut KernelState,
+    state: &mut KernelState,
 ) {
+    let driver_id = ctx.gpr[3] as u32;
+    let index = (driver_id & XAUDIO_DRIVER_INDEX_MASK) as usize; // r4 (samples) is not read
+    let registered = state.xaudio.record_submit(index); // false: no client at `index`
+    if !registered {
+        // Canary logs and submits silence to keep the callback chain alive
+        // for an unregistered/invalid index; our ticker keeps the chain
+        // alive independently, so a debug log suffices.
+        tracing::debug!(
+            driver_id = format_args!("{driver_id:#010x}"),
+            index,
+            "XAudioSubmitRenderDriverFrame: unregistered client index"
+        );
+    } else if state.xaudio.submitted_frames(index) == 1 {
+        tracing::info!(
+            driver_id = format_args!("{driver_id:#010x}"),
+            index,
+            "XAudioSubmitRenderDriverFrame: first frame submitted by guest mixer"
+        );
+    }
     ctx.gpr[3] = 0;
 }

-fn xma_create_context(ctx: &mut PpcContext, _mem: &GuestMemory, state: &mut KernelState) {
-    let handle = state.alloc_handle();
-    tracing::info!("XMACreateContext: handle={:#x}", handle);
-    ctx.gpr[3] = handle as u64;
+/// Mirrors xenia-canary `XMACreateContext_entry(lpdword_t context_out_ptr)`:
+/// allocate a context from the register-mapped array, write its guest pointer
+/// to `*context_out_ptr`, and return `X_STATUS_SUCCESS` (or `X_STATUS_NO_MEMORY`
+/// when the 320-slot array is exhausted). Panics if the XMA mutex is poisoned.
+fn xma_create_context(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelState) {
+    let out_ptr = ctx.gpr[3] as u32;
+    let context_ptr = state.xma.lock().unwrap().allocate_context(); // 0 = failure (see below)
+    if out_ptr != 0 { // NOTE(review): a null out_ptr still allocates a slot — confirm intended
+        mem.write_u32(out_ptr, context_ptr); // on failure this stores 0 to the guest
+    }
+    tracing::info!(
+        out_ptr = format_args!("{out_ptr:#010x}"),
+        context_ptr = format_args!("{context_ptr:#010x}"),
+        "XMACreateContext"
+    );
+    ctx.gpr[3] = if context_ptr == 0 {
+        0xC000_0017 // X_STATUS_NO_MEMORY
+    } else {
+        0 // X_STATUS_SUCCESS
+    };
+}
+
+/// Mirrors xenia-canary `XMAReleaseContext_entry(lpvoid_t context_ptr)`:
+/// free the context slot and return 0. Panics if the XMA mutex is poisoned.
+fn xma_release_context(ctx: &mut PpcContext, _mem: &GuestMemory, state: &mut KernelState) {
+    let context_ptr = ctx.gpr[3] as u32;
+    state.xma.lock().unwrap().release_context(context_ptr);
+    tracing::info!(context_ptr = format_args!("{context_ptr:#010x}"), "XMAReleaseContext");
+    ctx.gpr[3] = 0; // always reports 0, whatever `release_context` did with the pointer
+}

 // ===== Xex =====
@@ -4413,7 +4471,8 @@ fn nt_yield_execution(ctx: &mut PpcContext, _mem: &GuestMemory, _state: &mut Ker
 }

 fn ke_resume_thread(ctx: &mut PpcContext, _mem: &GuestMemory, state: &mut KernelState) {
-    let handle = resolve_pseudo_handle(state, ctx.gpr[3] as u32);
+    let raw = ctx.gpr[3] as u32;
+    let handle = resolve_pseudo_handle(state, raw);
    match state.scheduler.find_by_handle(handle) {
        Some(r) => {
            state.scheduler.resume_ref(r);
@@ -4429,14 +4488,19 @@ fn nt_resume_thread(ctx: &mut PpcContext, mem: &GuestMemory, state: &mut KernelS
    // r3 = handle, r4 = prev_suspend_count_ptr
    let handle = ctx.gpr[3] as u32;
    let prev_ptr = ctx.gpr[4] as u32;
-    let prev = state
-        .scheduler
-        .find_by_handle(handle)
-        .map(|r| state.scheduler.resume_ref(r))
-        .unwrap_or(0);
+    match state.scheduler.find_by_handle(handle) {
+        Some(r) => {
+            let prev = state.scheduler.resume_ref(r);
            if prev_ptr != 0 {
                mem.write_u32(prev_ptr, prev);
            }
+        }
+        None => {
+            if prev_ptr != 0 {
+                mem.write_u32(prev_ptr, 0);
+            }
+        }
+    }
    ctx.gpr[3] = STATUS_SUCCESS;
 }

--- a/crates/xenia-kernel/src/state.rs
+++ b/crates/xenia-kernel/src/state.rs
@@ -161,6 +161,11 @@ pub struct KernelState {
    /// graphics interrupts is enforced by the injector's
    /// `is_in_callback()` guard.
    pub xaudio: crate::xaudio::XAudioState,
+    /// Register-mapped XMA context array (apu stage 1). Shared with the
+    /// `0x7FEA0000` MMIO region installed by the app and with the
+    /// `XMACreateContext`/`XMAReleaseContext` exports, so it lives behind an
+    /// `Arc<Mutex<…>>`. Stage 1 records kicks; stage 3 will decode them.
+    pub xma: std::sync::Arc<std::sync::Mutex<xenia_apu::XmaDecoder>>,
    /// AUDIT-032 Plan B (default true). When true, the round prologue
    /// runs the XAudio ticker + `try_inject_audio_callback`. Pre-fix this
    /// was off by default because injection used random-victim selection
@@ -449,6 +454,9 @@ impl KernelState {
            ui: None,
            interrupts: crate::interrupts::InterruptState::default(),
            xaudio: crate::xaudio::XAudioState::default(),
+            // apu stage 1 — un-initialized until the app reserves the context
+            // array and calls `xma.lock().init(va, phys)`.
+            xma: std::sync::Arc::new(std::sync::Mutex::new(xenia_apu::XmaDecoder::new())),
            // AUDIT-032: dedicated audio worker per client (Plan B in
            // `xaudio_register_render_driver`) — not victim hijack, so safe
            // to enable by default. Previously gated off because the
--- a/crates/xenia-kernel/src/xaudio.rs
+++ b/crates/xenia-kernel/src/xaudio.rs
@@ -35,6 +35,14 @@ pub const XAUDIO_MAX_CLIENTS: usize = 8;
 /// no-op anyway).
 pub const XAUDIO_SYNTHETIC_HANDLE_BASE: u32 = 0xF000_0000;

+/// The scheduler's deadlock force-wake skips waiters parked solely on
+/// handles at/above [`xenia_cpu::scheduler::SYNTHETIC_PARK_HANDLE_FLOOR`]
+/// so it never destroys a parked audio worker. Keep these in lockstep:
+/// every `synthetic_park_handle` must fall inside that protected range.
+const _: () = assert!(
+    XAUDIO_SYNTHETIC_HANDLE_BASE >= xenia_cpu::scheduler::SYNTHETIC_PARK_HANDLE_FLOOR
+);
+
 /// Compute the synthetic park-handle for client slot `i`.
 pub const fn synthetic_park_handle(i: usize) -> u32 {
    XAUDIO_SYNTHETIC_HANDLE_BASE | (i as u32)
@@ -68,6 +76,16 @@ pub struct XAudioClient {
    /// [audio_system.cc:225-228](../../../../xenia-canary/src/xenia/apu/audio_system.cc#L225-L228)
    /// + [audio_system.cc:139-141](../../../../xenia-canary/src/xenia/apu/audio_system.cc#L139-L141).
    pub wrapped_callback_arg: u32,
+    /// Count of frames the guest has handed us via
+    /// `XAudioSubmitRenderDriverFrame` for this client. Canary's
+    /// `AudioSystem::SubmitFrame` forwards the sample buffer to the client's
+    /// driver, whose playback completion later releases the client semaphore
+    /// — the pacing our callback ticker emulates. The guest mixer
+    /// (`sub_824DC350`) discards SubmitFrame's return and reads no field it
+    /// writes, so this counter is purely observational (logging / liveness),
+    /// never read back by the guest. Deterministic: incremented only inside
+    /// the guest-driven export call.
+    pub submitted_frames: u64,
 }

 #[derive(Debug)]
@@ -138,6 +156,35 @@ impl XAudioState {
        self.clients.get(index).copied().flatten()
    }

+    /// Faithful counterpart to canary `AudioSystem::SubmitFrame`: the guest
+    /// driver client `index` handed us one frame of samples. Canary forwards
+    /// `samples` to the client's `AudioDriver`, whose playback-completion
+    /// callback later releases the client semaphore — the buffer-consumed
+    /// pacing our [`tick_instr`]/[`try_inject_audio_callback`] path already
+    /// emulates. SubmitFrame itself returns void and the guest mixer
+    /// (`sub_824DC350`) reads no field from it, so all we faithfully need to
+    /// do is validate the client and account the frame. Returns `true` iff
+    /// `index` is a registered client (canary submits silence / warns
+    /// otherwise). Deterministic — only the guest-driven export mutates this.
+    pub fn record_submit(&mut self, index: usize) -> bool {
+        match self.clients.get_mut(index) {
+            Some(Some(c)) => {
+                c.submitted_frames = c.submitted_frames.saturating_add(1); // pins at u64::MAX
+                true
+            }
+            _ => false, // out-of-range index or empty slot
+        }
+    }
+
+    pub fn submitted_frames(&self, index: usize) -> u64 { // frames counted for client `index`
+        self.clients
+            .get(index)
+            .copied()
+            .flatten()
+            .map(|c| c.submitted_frames)
+            .unwrap_or(0) // out-of-range index or empty slot reads as 0
+    }
+
    pub fn any_registered(&self) -> bool {
        self.clients.iter().any(|c| c.is_some())
    }
@@ -230,6 +277,7 @@ mod tests {
            callback_pc: 0x8200_0000 + arg,
            callback_arg: arg,
            wrapped_callback_arg: 0x4000_0000 + arg,
+            submitted_frames: 0,
        }
    }